From a4d4124e6d87eb03b09bd304cd4af5c81e7db2f9 Mon Sep 17 00:00:00 2001
From: ilia-cher
Date: Fri, 16 Oct 2020 07:56:54 -0700
Subject: [PATCH 01/59] Use libkineto in profiler

Summary:
Adding ability to use Kineto (CUPTI) to profile CUDA kernels

Test Plan:
python test/test_profiler.py

[ghstack-poisoned]
---
 test/test_profiler.py            |  20 ++++
 torch/autograd/__init__.py       |   6 +-
 torch/autograd/profiler.py       |  65 ++++++++++---
 torch/csrc/autograd/init.cpp     |  14 ++-
 torch/csrc/autograd/profiler.cpp | 151 +++++++++++++++++++++++++++----
 torch/csrc/autograd/profiler.h   |  40 +++++++-
 6 files changed, 256 insertions(+), 40 deletions(-)

diff --git a/test/test_profiler.py b/test/test_profiler.py
index f1feff1d0af3..44973546429e 100644
--- a/test/test_profiler.py
+++ b/test/test_profiler.py
@@ -99,6 +99,26 @@ def forward(self, x):

         torch._C._set_graph_executor_optimize(prev_opt)

+    @unittest.skipIf(not torch.autograd.kineto_available(), "Kineto is required")
+    @unittest.skipIf(not torch.cuda.is_available(), "CUDA is required")
+    def test_kineto(self):
+        x = torch.randn(10, 10).cuda()
+        y = torch.randn(10, 10).cuda()
+        with profile(use_cuda=True, use_kineto=True) as p:
+            z = torch.mm(x, y)
+            z = z + y
+            z = z.cpu()
+        print(p.key_averages().table(
+            sort_by="self_cuda_time_total", row_limit=-1))
+        found_gemm = False
+        found_memcpy = False
+        for e in p.function_events:
+            if "gemm" in e.name:
+                found_gemm = True
+            if "Memcpy" in e.name or "memcpy" in e.name:
+                found_memcpy = True
+        self.assertTrue(found_gemm)
+        self.assertTrue(found_memcpy)

 if __name__ == '__main__':
     run_tests()
diff --git a/torch/autograd/__init__.py b/torch/autograd/__init__.py
index 4e44536d931c..cec103ea4c8c 100644
--- a/torch/autograd/__init__.py
+++ b/torch/autograd/__init__.py
@@ -242,6 +242,6 @@ def variable(*args, **kwargs):
     raise RuntimeError("autograd initialization failed")

 # Import all native method/classes
-from torch._C._autograd import (ProfilerState, ProfilerConfig, ProfilerEvent,
-                                _enable_profiler, _disable_profiler, _profiler_enabled,
-                                _enable_record_function, _set_empty_test_observer)
+from torch._C._autograd import (ProfilerActivity, ProfilerState, ProfilerConfig, ProfilerEvent,
+                                _prepare_profiler, _enable_profiler, _disable_profiler, _profiler_enabled,
+                                _enable_record_function, _set_empty_test_observer, kineto_available)
diff --git a/torch/autograd/profiler.py b/torch/autograd/profiler.py
index eba7368cb03e..c4d23f9efeb4 100644
--- a/torch/autograd/profiler.py
+++ b/torch/autograd/profiler.py
@@ -364,16 +364,47 @@ def __init__(
             use_cuda=False,
             record_shapes=False,
             profile_memory=False,
-            with_stack=False):
+            with_stack=False,
+            use_kineto=False):
         self.enabled = enabled
-        self.use_cuda = use_cuda
-        self.function_events = None
         if not self.enabled:
             return
+        self.use_cuda = use_cuda
+        self.function_events = None
         self.entered = False
         self.record_shapes = record_shapes
         self.profile_memory = profile_memory
         self.with_stack = with_stack
+        self.use_kineto = use_kineto
+
+        self.profiler_kind = None
+        self.kineto_activities = []
+        if self.use_kineto:
+            if self.use_cuda:
+                self.profiler_kind = torch.autograd.ProfilerState.KINETO
+                self.kineto_activities = [
+                    torch.autograd.ProfilerActivity.CPU,
+                    # uses CUPTI
+                    torch.autograd.ProfilerActivity.CUDA_RUNTIME,
+                    torch.autograd.ProfilerActivity.CUDA]
+            else:
+                # initially we're not using Kineto for the CPU-only case
+                self.profiler_kind = torch.autograd.ProfilerState.CPU
+        elif self.use_cuda:
+            # legacy CUDA mode
+            self.profiler_kind = torch.autograd.ProfilerState.CUDA
+        else:
self.profiler_kind = torch.autograd.ProfilerState.CPU + self.kineto_activities = set(self.kineto_activities) + + if self.profiler_kind == torch.autograd.ProfilerState.KINETO: + assert torch.autograd.kineto_available() + + self.config = torch.autograd.ProfilerConfig( + self.profiler_kind, + self.record_shapes, + self.profile_memory, + self.with_stack) def __enter__(self): if not self.enabled: @@ -381,15 +412,8 @@ def __enter__(self): if self.entered: raise RuntimeError("autograd profiler traces are not reentrant") self.entered = True - profiler_kind = torch.autograd.ProfilerState.CUDA if self.use_cuda \ - else torch.autograd.ProfilerState.CPU - - config = torch.autograd.ProfilerConfig( - profiler_kind, - self.record_shapes, - self.profile_memory, - self.with_stack) - torch.autograd._enable_profiler(config) + torch.autograd._prepare_profiler(self.config, self.kineto_activities) + torch.autograd._enable_profiler(self.config) return self def __exit__(self, exc_type, exc_val, exc_tb): @@ -732,7 +756,7 @@ class FunctionEvent(FormattedTimesMixin): def __init__( self, id, node_id, name, thread, cpu_start, cpu_end, fwd_thread=None, input_shapes=None, stack=None, scope=0, cpu_memory_usage=0, cuda_memory_usage=0, is_async=False, - is_remote=True, sequence_nr=-1): + is_remote=True, sequence_nr=-1, device_id=-1): self.id: int = id self.node_id: int = node_id self.name: str = name @@ -751,6 +775,7 @@ def __init__( self.is_async: bool = is_async self.is_remote: bool = is_remote self.sequence_nr: int = sequence_nr + self.device_id: int = device_id def append_kernel(self, name, device, start, end): self.kernels.append(Kernel(name, device, Interval(start, end))) @@ -802,15 +827,21 @@ def self_cpu_time_total(self): @property def cuda_time_total(self): + if self.device_id >= 0: + return self.cpu_interval.elapsed_us() return sum(kinfo.interval.elapsed_us() for kinfo in self.kernels) @property def self_cuda_time_total(self): + if self.device_id >= 0: + return self.cuda_time_total - sum([child.cuda_time_total for child in self.cpu_children]) return sum(kinfo.interval.elapsed_us() for kinfo in self.kernels) - \ sum([child.cuda_time_total for child in self.cpu_children]) @property def cpu_time_total(self): + if self.device_id >= 0: + return 0 return self.cpu_interval.elapsed_us() @property @@ -1045,6 +1076,7 @@ def adjusted_time(cuda_record, cuda_records_map): is_async=is_async, is_remote=is_remote_event, sequence_nr=start.sequence_nr(), + device_id=start.device_id(), ) # note: async events have only cpu total time if not is_async and start.has_cuda(): @@ -1180,7 +1212,9 @@ def build_table( has_input_shapes = any( [(event.input_shapes is not None and len(event.input_shapes) > 0) for event in events]) + MAX_NAME_COLUMN_WIDTH = 55 name_column_width = max([len(evt.key) for evt in events]) + 4 + name_column_width = min(name_column_width, MAX_NAME_COLUMN_WIDTH) DEFAULT_COLUMN_WIDTH = 12 @@ -1288,8 +1322,11 @@ def append(s): continue else: event_limit += 1 + name = evt.key + if len(name) >= MAX_NAME_COLUMN_WIDTH-3: + name = name[:(MAX_NAME_COLUMN_WIDTH-3)] + "..." row_values = [ - evt.key, # Name + name, # Self CPU total, 0 for async events. 
            % format_time_share(evt.self_cpu_time_total, self_cpu_time_total),
diff --git a/torch/csrc/autograd/init.cpp b/torch/csrc/autograd/init.cpp
index 045a732a2016..698931911878 100644
--- a/torch/csrc/autograd/init.cpp
+++ b/torch/csrc/autograd/init.cpp
@@ -39,7 +39,13 @@ PyObject* THPAutograd_initExtension(PyObject* _unused, PyObject *unused) {
     .value("Disabled", ProfilerState::Disabled)
     .value("CPU", ProfilerState::CPU)
     .value("CUDA", ProfilerState::CUDA)
-    .value("NVTX", ProfilerState::NVTX);
+    .value("NVTX", ProfilerState::NVTX)
+    .value("KINETO", ProfilerState::KINETO);
+
+  py::enum_<ActivityType>(m, "ProfilerActivity")
+    .value("CPU", ActivityType::CPU)
+    .value("CUDA_RUNTIME", ActivityType::CUDA_RUNTIME)
+    .value("CUDA", ActivityType::CUDA);

   py::class_<ProfilerConfig>(m, "ProfilerConfig")
       .def(py::init<ProfilerState, bool, bool, bool>());
@@ -61,11 +67,15 @@ PyObject* THPAutograd_initExtension(PyObject* _unused, PyObject *unused) {
       .def("is_remote", &Event::isRemote)
       .def("sequence_nr", &Event::sequenceNr)
       .def("stack", &Event::stack)
-      .def("scope", &Event::scope);
+      .def("scope", &Event::scope)
+      .def("device_id", &Event::device);

   py::class_<ProfilerDisableOptions>(m, "_ProfilerDisableOptions")
       .def(py::init<bool, bool>());

+  m.def("kineto_available", kinetoAvailable);
+
+  m.def("_prepare_profiler", prepareProfiler);
   m.def("_enable_profiler", enableProfiler);
   m.def(
       "_disable_profiler",
diff --git a/torch/csrc/autograd/profiler.cpp b/torch/csrc/autograd/profiler.cpp
index 5cbb7606e579..0b6203e695fe 100644
--- a/torch/csrc/autograd/profiler.cpp
+++ b/torch/csrc/autograd/profiler.cpp
@@ -21,6 +21,10 @@

 #include

+#ifdef USE_KINETO
+#include "libkineto.h"
+#endif
+
 namespace torch { namespace autograd { namespace profiler {

 namespace {
@@ -48,23 +52,23 @@ enum ProfilerIValueIdx {
   NUM_PROFILER_CFG_IVALUE_IDX // must be last in list
 };

-  const std::unordered_set<std::string> disable_cuda_profiling = {
-    "aten::view",
-    "aten::t",
-    "aten::transpose",
-    "aten::stride",
-    "aten::empty",
-    "aten::empty_like",
-    "aten::empty_strided",
-    "aten::as_strided",
-    "aten::expand",
-    "aten::resize_",
-    "aten::squeeze",
-    "aten::unsqueeze",
-    "aten::slice",
-    "aten::_unsafe_view",
-    "aten::size"
-  };
+const std::unordered_set<std::string> disable_cuda_profiling = {
+  "aten::view",
+  "aten::t",
+  "aten::transpose",
+  "aten::stride",
+  "aten::empty",
+  "aten::empty_like",
+  "aten::empty_strided",
+  "aten::as_strided",
+  "aten::expand",
+  "aten::resize_",
+  "aten::squeeze",
+  "aten::unsqueeze",
+  "aten::slice",
+  "aten::_unsafe_view",
+  "aten::size"
+};

 CUDAStubs default_stubs;
 constexpr CUDAStubs* default_stubs_addr = &default_stubs;
@@ -169,6 +173,14 @@ struct FileLineFunc {
   std::string funcname;
 };

+static std::atomic<uint64_t> corr_id_ {};
+size_t next_correlation_id() {
+  return corr_id_++;
+}
+size_t peek_correlation_id() {
+  return corr_id_;
+}
+
 // Profiler state
 struct ProfilerThreadLocalState : public c10::MemoryReportingInfoBase {
   explicit ProfilerThreadLocalState(const ProfilerConfig& config)
@@ -193,6 +205,12 @@ struct ProfilerThreadLocalState : public c10::MemoryReportingInfoBase {
           std::make_move_iterator(remoteProfiledEvents_->begin()),
           std::make_move_iterator(remoteProfiledEvents_->end()));
     }
+    if (kinetoEvents_) {
+      result.insert(
+          result.end(),
+          std::make_move_iterator(kinetoEvents_->begin()),
+          std::make_move_iterator(kinetoEvents_->end()));
+    }
     return result;
   }
@@ -224,6 +242,11 @@ struct ProfilerThreadLocalState : public c10::MemoryReportingInfoBase {
     }
   }

+  void setKinetoEvents(std::vector<std::vector<Event>>&& kinetoEvents) {
+    std::lock_guard<std::mutex> guard(state_mutex_);
+    kinetoEvents_ = std::move(kinetoEvents);
+  }
+
   void pushRange(
       const at::RecordFunction& fn,
       const bool record_cuda,
@@ -247,6 +270,7 @@ struct ProfilerThreadLocalState : public c10::MemoryReportingInfoBase {
     evt.setSequenceNr(fn.seqNr());
     evt.setFwdThreadId(fn.forwardThreadId());
     evt.setScope((uint8_t)fn.scope());
+    evt.setCorrelationId(peek_correlation_id());
 #ifndef C10_MOBILE
     // backward nodes source range corresponds to the forward node
     // TODO: consider using C++ stack trace
@@ -409,6 +433,7 @@ struct ProfilerThreadLocalState : public c10::MemoryReportingInfoBase {
   ProfilerConfig config_ = ProfilerConfig(ProfilerState::Disabled);
   at::CallbackHandle handle_ = 0;
   c10::optional<std::vector<std::vector<Event>>> remoteProfiledEvents_;
+  c10::optional<std::vector<std::vector<Event>>> kinetoEvents_;
 };

 ProfilerThreadLocalState* getProfilerTLSState() {
@@ -451,6 +476,11 @@ void pushProfilingCallbacks() {
           } else {
             state_ptr->pushRange(fn, record_cuda, msg);
           }
+#ifdef USE_KINETO
+          if (state_ptr->config().state == ProfilerState::KINETO) {
+            libkineto::api().pushCorrelationId(next_correlation_id());
+          }
+#endif
         },
         [](const at::RecordFunction& fn) {
           auto state_ptr = getProfilerTLSState();
@@ -463,6 +493,11 @@ void pushProfilingCallbacks() {
             record_cuda = false;
           }
           state_ptr->popRange(fn, record_cuda);
+#ifdef USE_KINETO
+          if (state_ptr->config().state == ProfilerState::KINETO) {
+            libkineto::api().popCorrelationId();
+          }
+#endif
         })
     .needsInputs(state_ptr->config().report_input_shapes)
     .needsIds(true));
@@ -519,10 +554,48 @@ bool profilerEnabled() {
   return state_ptr && state_ptr->config().state != ProfilerState::Disabled;
 }

+bool kinetoAvailable() {
+#ifdef USE_KINETO
+  return true;
+#else
+  return false;
+#endif
+}
+
+void prepareProfiler(
+    const ProfilerConfig& new_config,
+    const std::set<ActivityType>& activities) {
+#ifdef USE_KINETO
+  if (new_config.state == ProfilerState::KINETO) {
+    std::set<libkineto::ActivityType> k_activities;
+    if (activities.count(ActivityType::CPU)) {
+      k_activities.insert(libkineto::ActivityType::EXTERNAL_CORRELATION);
+    }
+    if (activities.count(ActivityType::CUDA_RUNTIME)) {
+      k_activities.insert(libkineto::ActivityType::CUDA_RUNTIME);
+    }
+    if (activities.count(ActivityType::CUDA)) {
+      k_activities.insert(libkineto::ActivityType::GPU_MEMCPY);
+      k_activities.insert(libkineto::ActivityType::GPU_MEMSET);
+      k_activities.insert(libkineto::ActivityType::CONCURRENT_KERNEL);
+    }
+
+    if (!libkineto::api().hasProfilerRegistered()) {
+      libkineto::api().registerProfiler(
+          std::make_unique(false));
+    }
+    libkineto::api().initProfilerIfRegistered();
+    libkineto::api().prepareTrace(k_activities);
+  }
+#endif
+}
+
 void enableProfiler(const ProfilerConfig& new_config) {
   TORCH_CHECK(new_config.state != ProfilerState::NVTX || cuda_stubs->enabled(),
     "Can't use NVTX profiler - PyTorch was compiled without CUDA");

+  TORCH_CHECK(new_config.state != ProfilerState::KINETO || kinetoAvailable());
+
   auto state_ptr = getProfilerTLSState();
   TORCH_CHECK(!state_ptr, "Profiler is already enabled on this thread");
   auto state = std::make_shared<ProfilerThreadLocalState>(new_config);
@@ -530,6 +603,12 @@ void enableProfiler(const ProfilerConfig& new_config) {

   pushProfilingCallbacks();

+#ifdef USE_KINETO
+  if (new_config.state == ProfilerState::KINETO) {
+    libkineto::api().startTrace();
+  }
+#endif
+
   if (new_config.state == ProfilerState::CUDA) {
     // event recording appears to have some startup overhead, so we need to
     // to generate some dummy events first before recording synchronization events
@@ -569,6 +648,44 @@ thread_event_lists disableProfiler(c10::optional<ProfilerDisableOptions> profilerDisableOptions) {
     at::removeCallback(state_ptr->callbackHandle());
   }

+#ifdef USE_KINETO
+  if (state_ptr->config().state == ProfilerState::KINETO) {
+    auto k_events = libkineto::api().stopTrace();
+    std::unordered_map<uint64_t, std::unordered_map<uint64_t, std::vector<Event>>> events;
+    for (auto& k_evt : k_events) {
+      auto& evt_list = events[k_evt.deviceId][k_evt.threadId];
+      Event push_evt(
+          EventKind::PushRange,
+          at::StringView(k_evt.name),
+          k_evt.threadId,
+          false,
+          k_evt.correlationId);
+      push_evt.setDevice(k_evt.deviceId);
+      push_evt.setCpuUS(k_evt.startUs);
+      push_evt.setCorrelationId(k_evt.correlationId);
+      evt_list.emplace_back(std::move(push_evt));
+
+      Event pop_evt(
+          EventKind::PopRange,
+          at::StringView(k_evt.name),
+          k_evt.threadId,
+          false,
+          k_evt.correlationId);
+      pop_evt.setDevice(k_evt.deviceId);
+      pop_evt.setCpuUS(k_evt.endUs);
+      pop_evt.setCorrelationId(k_evt.correlationId);
+      evt_list.emplace_back(std::move(pop_evt));
+    }
+    std::vector<std::vector<Event>> events_list;
+    for (const auto& it : events) {
+      for (const auto& it2 : it.second) {
+        events_list.emplace_back(it2.second);
+      }
+    }
+    state_ptr->setKinetoEvents(std::move(events_list));
+  }
+#endif
+
   if (!consolidate || state_ptr->config().state == ProfilerState::NVTX) {
     return thread_event_lists();
   }
diff --git a/torch/csrc/autograd/profiler.h b/torch/csrc/autograd/profiler.h
index 9cfe9ea1fd6e..3bc6022b20fa 100644
--- a/torch/csrc/autograd/profiler.h
+++ b/torch/csrc/autograd/profiler.h
@@ -104,10 +104,19 @@ struct TORCH_API ProfilerDisableOptions {
 };

 enum class C10_API_ENUM ProfilerState {
-  Disabled,
-  CPU, // CPU-only profiling
-  CUDA, // CPU + CUDA events
-  NVTX, // only emit NVTX markers
+  Disabled = 0,
+  CPU, // CPU-only profiling
+  CUDA, // CPU + CUDA events
+  NVTX, // only emit NVTX markers
+  KINETO, // use libkineto
+  NUM_PROFILER_STATES, // must be the last one
+};
+
+enum class C10_API_ENUM ActivityType {
+  CPU = 0,
+  CUDA_RUNTIME, // CUDA host events
+  CUDA, // CUDA kernels
+  NUM_KINETO_ACTIVITIES, // must be the last one
 };

 struct TORCH_API ProfilerConfig {
@@ -238,6 +247,10 @@ struct TORCH_API Event final {
     return cpu_ns_ / (1000.0);
   }

+  void setCpuUS(double cpu_us) {
+    cpu_ns_ = (int64_t)(cpu_us * 1000);
+  }
+
   double cudaElapsedUs(const Event& e) const;

   bool hasCuda() const {
@@ -248,6 +261,10 @@ struct TORCH_API Event final {
     return device_;
   }

+  void setDevice(int device) {
+    device_ = device;
+  }
+
   void updateMemoryStats(int64_t alloc_size, c10::Device device) {
     if (device.type() == c10::DeviceType::CUDA ||
         device.type() == c10::DeviceType::HIP) {
@@ -303,6 +320,14 @@ struct TORCH_API Event final {
     return sequence_nr_;
   }

+  void setCorrelationId(uint64_t correlation_id) {
+    correlation_id_ = correlation_id;
+  }
+
+  uint64_t correlationId() const {
+    return correlation_id_;
+  }
+
   const std::vector<std::string>& stack() const {
     return stack_;
   }
@@ -347,6 +372,8 @@ struct TORCH_API Event final {

   std::vector<std::string> stack_;
   uint8_t scope_;
+
+  uint64_t correlation_id_;
 };

 // a linked-list of fixed sized vectors, to avoid
 // a std::vector resize from taking a large amount of time inside
 // a profiling event
@@ -403,6 +430,11 @@ TORCH_API ProfilerConfig getProfilerConfig();
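The conversion in `disableProfiler` above turns each flat Kineto activity into a PushRange/PopRange pair that shares a correlation id with the CPU-side op that launched it. As a schematic model of how such pairs fold back into per-op intervals (Python, not code from this patch; the tuples are assumed stand-ins for the C++ `Event` accessors):

```python
# Schematic sketch only: match each "push" event with its "pop" by
# correlation id. Tuple layout (kind, name, correlation_id, t_us) is an
# illustrative assumption, not the real Event interface.
from collections import defaultdict

def pair_ranges(events):
    open_push = {}
    intervals = defaultdict(list)  # name -> [(start_us, end_us, corr_id)]
    for kind, name, corr_id, t_us in events:
        if kind == "push":
            open_push[corr_id] = t_us
        elif kind == "pop":
            intervals[name].append((open_push.pop(corr_id), t_us, corr_id))
    return intervals

assert pair_ranges([("push", "sgemm", 7, 100.0),
                    ("pop", "sgemm", 7, 112.0)])["sgemm"] == [(100.0, 112.0, 7)]
```

 // Writes profiled events to a stream.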
TORCH_API void writeProfilerEventsToStream(std::ostream& out, const std::vector& events); +TORCH_API bool kinetoAvailable(); +TORCH_API void prepareProfiler( + const ProfilerConfig& new_config, + const std::set& activities); + // Usage: // { // RecordProfile guard("filename.trace"); From 662431b7b3804e111b7980018f641a1f7bac72f3 Mon Sep 17 00:00:00 2001 From: ilia-cher Date: Mon, 2 Nov 2020 03:43:37 -0800 Subject: [PATCH 02/59] Update on "Use libkineto in profiler" Summary: Adding ability to use Kineto (CUPTI) to profile CUDA kernels Test Plan: USE_KINETO=1 USE_CUDA=1 USE_MKLDNN=1 BLAS=MKL BUILD_BINARY=1 python setup.py develop install python test/test_profiler.py ``` ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ sgemm_32x32x32_NN 0.00% 0.000us 0.00% 0.000us 0.000us 12.000us 63.16% 12.000us 12.000us 1 void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 2.750us 14.47% 2.750us 2.750us 1 Memcpy HtoD (Pagable -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 2.250us 11.84% 2.250us 2.250us 1 Memcpy DtoH (Device -> Pagable) 0.00% 0.000us 0.00% 0.000us 0.000us 2.000us 10.53% 2.000us 2.000us 1 aten::mm 25.87% 364.400ms 25.87% 364.426ms 364.426ms 0.000us 0.00% 0.000us 0.000us 1 aten::empty 0.00% 39.585us 0.00% 39.585us 19.792us 0.000us 0.00% 0.000us 0.000us 2 aten::stride 0.00% 3.363us 0.00% 3.363us 1.121us 0.000us 0.00% 0.000us 0.000us 3 aten::add 74.12% 1.044s 74.12% 1.044s 1.044s 0.000us 0.00% 0.000us 0.000us 1 aten::to 0.00% 13.155us 0.01% 116.398us 116.398us 0.000us 0.00% 0.000us 0.000us 1 aten::empty_strided 0.00% 30.365us 0.00% 30.365us 30.365us 0.000us 0.00% 0.000us 0.000us 1 aten::copy_ 0.01% 72.878us 0.01% 72.878us 72.878us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ``` [ghstack-poisoned] --- torch/autograd/__init__.py | 2 + torch/autograd/profiler.py | 20 ++++-- torch/csrc/autograd/init.cpp | 5 +- torch/csrc/autograd/profiler.cpp | 115 +++++++++++++++++++------------ torch/csrc/autograd/profiler.h | 31 +++++---- 5 files changed, 107 insertions(+), 66 deletions(-) diff --git a/torch/autograd/__init__.py b/torch/autograd/__init__.py index cec103ea4c8c..e2ccf47ce923 100644 --- a/torch/autograd/__init__.py +++ b/torch/autograd/__init__.py @@ -243,5 +243,7 @@ def variable(*args, **kwargs): # Import all native method/classes from torch._C._autograd import (ProfilerActivity, ProfilerState, ProfilerConfig, ProfilerEvent, + ProfilerResult, KinetoEvent, + _enable_profiler_legacy, _disable_profiler_legacy, _prepare_profiler, _enable_profiler, _disable_profiler, _profiler_enabled, _enable_record_function, _set_empty_test_observer, kineto_available) diff --git a/torch/autograd/profiler.py b/torch/autograd/profiler.py index 49742bb1e099..483802c7c9ec 100644 --- a/torch/autograd/profiler.py +++ b/torch/autograd/profiler.py @@ -325,6 +325,9 @@ class profile(object): with_stack (bool, optional): record source information (file and line number) for the ops + 
use_kineto (bool, default False): experimental support for Kineto profiler + skip_cpu (default False) - whether to skip profiling of CPU events + .. warning: Enabling memory profiling or source attribution incurs additional profiler overhead @@ -365,7 +368,8 @@ def __init__( record_shapes=False, profile_memory=False, with_stack=False, - use_kineto=False): + use_kineto=False, + skip_cpu=False): self.enabled = enabled if not self.enabled: return @@ -376,16 +380,22 @@ def __init__( self.profile_memory = profile_memory self.with_stack = with_stack self.use_kineto = use_kineto + self.skip_cpu = skip_cpu + if self.skip_cpu: + assert self.use_kineto, "skip_cpu is used with use_kineto=True" self.profiler_kind = None self.kineto_activities = [] if self.use_kineto: self.profiler_kind = torch.autograd.ProfilerState.KINETO - self.kineto_activities = [torch.autograd.ProfilerActivity.CPU] + if not self.skip_cpu: + self.kineto_activities = [torch.autograd.ProfilerActivity.CPU] + else: + self.kineto_activities = [] if self.use_cuda: self.kineto_activities += [ # uses CUPTI - torch.autograd.ProfilerActivity.CUDA_RUNTIME, + # torch.autograd.ProfilerActivity.CUDA_RUNTIME, torch.autograd.ProfilerActivity.CUDA] elif self.use_cuda: # legacy CUDA mode @@ -412,7 +422,9 @@ def __enter__(self): if self.entered: raise RuntimeError("autograd profiler traces are not reentrant") self.entered = True - torch.autograd._prepare_profiler(self.config, self.kineto_activities) + if self.use_kineto: + torch.autograd._prepare_profiler(self.config, self.kineto_activities) + torch.autograd._enable_profiler(self.config) return self diff --git a/torch/csrc/autograd/init.cpp b/torch/csrc/autograd/init.cpp index 291ec75c79d4..c1b12ebd478b 100644 --- a/torch/csrc/autograd/init.cpp +++ b/torch/csrc/autograd/init.cpp @@ -44,7 +44,7 @@ PyObject* THPAutograd_initExtension(PyObject* _unused, PyObject *unused) { py::enum_(m, "ProfilerActivity") .value("CPU", ActivityType::CPU) - .value("CUDA_RUNTIME", ActivityType::CUDA_RUNTIME) + //.value("CUDA_RUNTIME", ActivityType::CUDA_RUNTIME) .value("CUDA", ActivityType::CUDA); py::class_(m, "ProfilerConfig") @@ -67,7 +67,8 @@ PyObject* THPAutograd_initExtension(PyObject* _unused, PyObject *unused) { .def("is_remote", &LegacyEvent::isRemote) .def("sequence_nr", &LegacyEvent::sequenceNr) .def("stack", &LegacyEvent::stack) - .def("scope", &LegacyEvent::scope); + .def("scope", &LegacyEvent::scope) + .def("correlation_id", &LegacyEvent::correlationId); py::class_(m, "ProfilerResult") .def("kind", &LegacyEvent::kindStr) diff --git a/torch/csrc/autograd/profiler.cpp b/torch/csrc/autograd/profiler.cpp index ef279830fdb5..c9fcf3efabb3 100644 --- a/torch/csrc/autograd/profiler.cpp +++ b/torch/csrc/autograd/profiler.cpp @@ -205,12 +205,6 @@ struct ProfilerThreadLocalState : public c10::MemoryReportingInfoBase { std::make_move_iterator(remoteProfiledEvents_->begin()), std::make_move_iterator(remoteProfiledEvents_->end())); } - if (kinetoEvents_) { - result.insert( - result.end(), - std::make_move_iterator(kinetoEvents_->begin()), - std::make_move_iterator(kinetoEvents_->end())); - } return result; } @@ -227,6 +221,7 @@ struct ProfilerThreadLocalState : public c10::MemoryReportingInfoBase { at::RecordFunction::currentThreadId(), include_cuda && config_.state == ProfilerState::CUDA); evt.setNodeId(at::RecordFunction::getDefaultNodeId()); + evt.setCorrelationId(peek_correlation_id()); getEventList().record(std::move(evt)); } } @@ -242,11 +237,6 @@ struct ProfilerThreadLocalState : public 
c10::MemoryReportingInfoBase { } } - void setKinetoEvents(std::vector>&& kinetoEvents) { - std::lock_guard guard(state_mutex_); - kinetoEvents_ = std::move(kinetoEvents); - } - void pushRange( const at::RecordFunction& fn, const bool record_cuda, @@ -270,7 +260,6 @@ struct ProfilerThreadLocalState : public c10::MemoryReportingInfoBase { evt.setSequenceNr(fn.seqNr()); evt.setFwdThreadId(fn.forwardThreadId()); evt.setScope((uint8_t)fn.scope()); - evt.setCorrelationId(peek_correlation_id()); #ifndef C10_MOBILE // backward nodes source range corresponds to the forward node // TODO: consider using C++ stack trace @@ -328,6 +317,7 @@ struct ProfilerThreadLocalState : public c10::MemoryReportingInfoBase { thread_id, config_.state == ProfilerState::CUDA); evt.updateMemoryStats(alloc_size, device); + evt.setCorrelationId(peek_correlation_id()); getEventList(thread_id).record(std::move(evt)); } } @@ -433,7 +423,6 @@ struct ProfilerThreadLocalState : public c10::MemoryReportingInfoBase { ProfilerConfig config_ = ProfilerConfig(ProfilerState::Disabled); at::CallbackHandle handle_ = 0; c10::optional>> remoteProfiledEvents_; - c10::optional>> kinetoEvents_; }; ProfilerThreadLocalState* getProfilerTLSState() { @@ -450,6 +439,12 @@ void pushProfilingCallbacks() { if (!state_ptr || state_ptr->config().state == ProfilerState::Disabled) { return; } +#ifdef USE_KINETO + if (state_ptr->config().state == ProfilerState::KINETO) { + libkineto::api().pushCorrelationId(next_correlation_id()); + return; + } +#endif bool record_cuda = state_ptr->config().state == ProfilerState::CUDA; if (record_cuda && disable_cuda_profiling.find(fn.name().str()) != disable_cuda_profiling.end()) { @@ -476,28 +471,25 @@ void pushProfilingCallbacks() { } else { state_ptr->pushRange(fn, record_cuda, msg); } -#ifdef USE_KINETO - if (state_ptr->config().state == ProfilerState::KINETO) { - libkineto::api().pushCorrelationId(next_correlation_id()); - } -#endif }, [](const at::RecordFunction& fn) { auto state_ptr = getProfilerTLSState(); if (!state_ptr || state_ptr->config().state == ProfilerState::Disabled) { return; } +#ifdef USE_KINETO + if (state_ptr->config().state == ProfilerState::KINETO) { + // push new cpu trace event + libkineto::api().popCorrelationId(); + return; + } +#endif bool record_cuda = state_ptr->config().state == ProfilerState::CUDA; if (record_cuda && disable_cuda_profiling.find(fn.name().str()) != disable_cuda_profiling.end()) { record_cuda = false; } state_ptr->popRange(fn, record_cuda); -#ifdef USE_KINETO - if (state_ptr->config().state == ProfilerState::KINETO) { - libkineto::api().popCorrelationId(); - } -#endif }) .needsInputs(state_ptr->config().report_input_shapes) .needsIds(true)); @@ -563,17 +555,18 @@ bool kinetoAvailable() { } void prepareProfiler( - const ProfilerConfig& new_config, + const ProfilerConfig& config, const std::set& activities) { #ifdef USE_KINETO - if (new_config.state == ProfilerState::KINETO) { + if (config.state == ProfilerState::KINETO) { std::set k_activities; if (activities.count(ActivityType::CPU)) { k_activities.insert(libkineto::ActivityType::EXTERNAL_CORRELATION); - } - if (activities.count(ActivityType::CUDA_RUNTIME)) { k_activities.insert(libkineto::ActivityType::CUDA_RUNTIME); } + //if (activities.count(ActivityType::CUDA_RUNTIME)) { + // k_activities.insert(libkineto::ActivityType::CUDA_RUNTIME); + //} if (activities.count(ActivityType::CUDA)) { k_activities.insert(libkineto::ActivityType::GPU_MEMCPY); k_activities.insert(libkineto::ActivityType::GPU_MEMSET); @@ -586,15 +579,18 
@@ void prepareProfiler( } libkineto::api().initProfilerIfRegistered(); libkineto::api().prepareTrace(k_activities); + + return; } #endif + TORCH_CHECK(false, "Supported only in Kineto profiler"); } -void enableProfiler(const ProfilerConfig& new_config) { +void enableProfilerLegacy(const ProfilerConfig& new_config) { TORCH_CHECK(new_config.state != ProfilerState::NVTX || cuda_stubs->enabled(), "Can't use NVTX profiler - PyTorch was compiled without CUDA"); - TORCH_CHECK(new_config.state != ProfilerState::KINETO || kinetoAvailable()); + TORCH_CHECK(new_config.state != ProfilerState::KINETO); auto state_ptr = getProfilerTLSState(); TORCH_CHECK(!state_ptr, "Profiler is already enabled on this thread"); @@ -603,12 +599,6 @@ void enableProfiler(const ProfilerConfig& new_config) { pushProfilingCallbacks(); -#ifdef USE_KINETO - if (new_config.state == ProfilerState::KINETO) { - libkineto::api().startTrace(); - } -#endif - if (new_config.state == ProfilerState::CUDA) { // event recording appears to have some startup overhead, so we need to // to generate some dummy events first before recording synchronization events @@ -629,7 +619,7 @@ void enableProfiler(const ProfilerConfig& new_config) { state->mark("__start_profile", false); } -thread_event_lists disableProfiler(c10::optional profilerDisableOptions) { +thread_event_lists disableProfilerLegacy(c10::optional profilerDisableOptions) { auto cleanupTLSState = profilerDisableOptions ? profilerDisableOptions->cleanupTLSState : true; auto consolidate = profilerDisableOptions ? profilerDisableOptions->consolidate : true; // all the DebugInfoBase objects are scope based and supposed to use DebugInfoGuard @@ -648,6 +638,46 @@ thread_event_lists disableProfiler(c10::optional profile at::removeCallback(state_ptr->callbackHandle()); } + if (!consolidate || state_ptr->config().state == ProfilerState::NVTX) { + return thread_event_lists(); + } + + state_ptr->mark("__stop_profile"); + // Note that this will erase the underlying events. 
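With the entry points now split into legacy and Kineto variants, the Python side picks a path as in the profiler.py hunk earlier in this commit. A condensed sketch of that dispatch follows; the `_ProfilerScope` wrapper name is hypothetical, while the `torch.autograd._*` bindings are the ones this patch registers:

```python
import torch

class _ProfilerScope:
    """Condensed model of torch.autograd.profiler.profile's enter/exit."""
    def __init__(self, config, kineto_activities):
        self.config = config
        self.kineto_activities = kineto_activities

    def __enter__(self):
        if self.kineto_activities:
            # Kineto path: prepare the trace first so that one-time CUPTI
            # initialization cost stays outside the measured window.
            torch.autograd._prepare_profiler(self.config, self.kineto_activities)
            torch.autograd._enable_profiler(self.config)
        else:
            torch.autograd._enable_profiler_legacy(self.config)
        return self

    def __exit__(self, *exc):
        if self.kineto_activities:
            self.result = torch.autograd._disable_profiler()
        else:
            self.records = torch.autograd._disable_profiler_legacy()
        return False
```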
+ return state_ptr->consolidate(); +} + +void enableProfiler(const ProfilerConfig& new_config) { + TORCH_CHECK(new_config.state == ProfilerState::KINETO && kinetoAvailable()); + + auto state_ptr = getProfilerTLSState(); + TORCH_CHECK(!state_ptr, "Profiler is already enabled on this thread"); + auto state = std::make_shared(new_config); + c10::ThreadLocalDebugInfo::_push(c10::DebugInfoKind::PROFILER_STATE, state); + + pushProfilingCallbacks(); + +#ifdef USE_KINETO + if (new_config.state == ProfilerState::KINETO) { + libkineto::api().startTrace(); + } +#endif + + state->mark("__start_profile", false); +} + +ProfilerResult disableProfiler() { + // all the DebugInfoBase objects are scope based and supposed to use DebugInfoGuard + auto state = c10::ThreadLocalDebugInfo::_pop(c10::DebugInfoKind::PROFILER_STATE); + + auto state_ptr = static_cast(state.get()); + TORCH_CHECK(state_ptr && state_ptr->config().state != ProfilerState::Disabled, + "Can't disable profiler when it's not running"); + + if (cleanupTLSState) { + at::removeCallback(state_ptr->callbackHandle()); + } + #ifdef USE_KINETO if (state_ptr->config().state == ProfilerState::KINETO) { auto k_events = libkineto::api().stopTrace(); @@ -660,8 +690,8 @@ thread_event_lists disableProfiler(c10::optional profile k_evt.threadId, false, k_evt.correlationId); - push_evt.setDevice(k_evt.deviceId); - push_evt.setCpuUS(k_evt.startUs); + push_evt.setDeviceId(k_evt.deviceId); + push_evt.setCpuUs(k_evt.startUs); push_evt.setCorrelationId(k_evt.correlationId); evt_list.emplace_back(std::move(push_evt)); @@ -671,8 +701,8 @@ thread_event_lists disableProfiler(c10::optional profile k_evt.threadId, false, k_evt.correlationId); - pop_evt.setDevice(k_evt.deviceId); - pop_evt.setCpuUS(k_evt.endUs); + pop_evt.setDeviceId(k_evt.deviceId); + pop_evt.setCpuUs(k_evt.endUs); pop_evt.setCorrelationId(k_evt.correlationId); evt_list.emplace_back(std::move(pop_evt)); } @@ -682,14 +712,9 @@ thread_event_lists disableProfiler(c10::optional profile events_list.emplace_back(it2.second); } } - state_ptr->setKinetoEvents(std::move(events_list)); } #endif - if (!consolidate || state_ptr->config().state == ProfilerState::NVTX) { - return thread_event_lists(); - } - state_ptr->mark("__stop_profile"); // Note that this will erase the underlying events. 
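The Kineto branch of `disableProfiler()` above flattens the stopped trace into one event list per (device, thread), with two events per activity. An illustrative Python model of that grouping (not patch code; the dict keys mirror the libkineto activity fields referenced above):

```python
# Illustrative model of the per-(deviceId, threadId) grouping performed in
# disableProfiler(): each activity contributes a push and a pop event.
from collections import defaultdict

def group_activities(activities):
    per_thread = defaultdict(list)  # (device_id, thread_id) -> events
    for a in activities:
        key = (a["deviceId"], a["threadId"])
        per_thread[key].append(("push", a["name"], a["startUs"]))
        per_thread[key].append(("pop", a["name"], a["endUs"]))
    return list(per_thread.values())

assert group_activities(
    [{"deviceId": 0, "threadId": 1, "name": "sgemm", "startUs": 10, "endUs": 22}]
) == [[("push", "sgemm", 10), ("pop", "sgemm", 22)]]
```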
return state_ptr->consolidate(); diff --git a/torch/csrc/autograd/profiler.h b/torch/csrc/autograd/profiler.h index 45058d7ef977..64d3d289e443 100644 --- a/torch/csrc/autograd/profiler.h +++ b/torch/csrc/autograd/profiler.h @@ -114,7 +114,7 @@ enum class C10_API_ENUM ProfilerState { enum class C10_API_ENUM ActivityType { CPU = 0, - CUDA_RUNTIME, // CUDA host events + // CUDA_RUNTIME, // CUDA host events CUDA, // CUDA kernels NUM_KINETO_ACTIVITIES, // must be the last one }; @@ -158,10 +158,21 @@ struct TORCH_API Event { return kind_; } + std::string kindStr() const { + switch (kind_) { + case EventKind::Mark: return "mark"; + case EventKind::PushRange: return "push"; + case EventKind::PopRange: return "pop"; + case EventKind::MemoryAlloc: return "memory_alloc"; + } + throw std::runtime_error("unknown event kind"); + } + protected: EventKind kind_; } +// To be deprecated, once we switch to Kineto profiling struct TORCH_API LegacyEvent : public Event { LegacyEvent( EventKind kind, @@ -223,15 +234,6 @@ struct TORCH_API LegacyEvent : public Event { static LegacyEvent fromIValue(const at::IValue& eventIValue); void record(bool record_cuda); - std::string kindStr() const { - switch (kind_) { - case EventKind::Mark: return "mark"; - case EventKind::PushRange: return "push"; - case EventKind::PopRange: return "pop"; - case EventKind::MemoryAlloc: return "memory_alloc"; - } - throw std::runtime_error("unknown EventKind"); - } const char* name() const { return name_.str(); @@ -373,10 +375,6 @@ struct TORCH_API LegacyEvent : public Event { uint64_t correlation_id_; }; -struct TORCH_API KinetoEvent : public Event { - -}; - // a linked-list of fixed sized vectors, to avoid // a std::vector resize from taking a large amount of time inside // a profiling event @@ -433,9 +431,12 @@ TORCH_API ProfilerConfig getProfilerConfig(); // Writes profiled events to a stream. TORCH_API void writeProfilerEventsToStream(std::ostream& out, const std::vector& events); +struct TORCH_API KinetoEvent : public Event { + +}; struct TORCH_API ProfilerResult { - thread_event_lists legacy_events_; // mem alloc, start/stop + thread_event_lists legacy_events_; // tensor mem alloc, start/stop std::vector> events_; }; From ea956aa6cecd817136fa84c6ec9d59999a6563d7 Mon Sep 17 00:00:00 2001 From: ilia-cher Date: Mon, 2 Nov 2020 11:23:29 -0800 Subject: [PATCH 03/59] Update on "Use libkineto in profiler" Summary: Adding ability to use Kineto (CUPTI) to profile CUDA kernels Test Plan: USE_KINETO=1 USE_CUDA=1 USE_MKLDNN=1 BLAS=MKL BUILD_BINARY=1 python setup.py develop install python test/test_profiler.py ``` ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ sgemm_32x32x32_NN 0.00% 0.000us 0.00% 0.000us 0.000us 12.000us 63.16% 12.000us 12.000us 1 void at::native::vectorized_elementwise_kernel<4, at... 
0.00% 0.000us 0.00% 0.000us 0.000us 2.750us 14.47% 2.750us 2.750us 1 Memcpy HtoD (Pagable -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 2.250us 11.84% 2.250us 2.250us 1 Memcpy DtoH (Device -> Pagable) 0.00% 0.000us 0.00% 0.000us 0.000us 2.000us 10.53% 2.000us 2.000us 1 aten::mm 25.87% 364.400ms 25.87% 364.426ms 364.426ms 0.000us 0.00% 0.000us 0.000us 1 aten::empty 0.00% 39.585us 0.00% 39.585us 19.792us 0.000us 0.00% 0.000us 0.000us 2 aten::stride 0.00% 3.363us 0.00% 3.363us 1.121us 0.000us 0.00% 0.000us 0.000us 3 aten::add 74.12% 1.044s 74.12% 1.044s 1.044s 0.000us 0.00% 0.000us 0.000us 1 aten::to 0.00% 13.155us 0.01% 116.398us 116.398us 0.000us 0.00% 0.000us 0.000us 1 aten::empty_strided 0.00% 30.365us 0.00% 30.365us 30.365us 0.000us 0.00% 0.000us 0.000us 1 aten::copy_ 0.01% 72.878us 0.01% 72.878us 72.878us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ``` [ghstack-poisoned] --- aten/src/ATen/record_function.cpp | 2 ++ torch/autograd/profiler.py | 7 +++++-- torch/csrc/autograd/profiler.cpp | 30 ++++++++++++++++++------------ torch/csrc/autograd/profiler.h | 4 +++- 4 files changed, 28 insertions(+), 15 deletions(-) diff --git a/aten/src/ATen/record_function.cpp b/aten/src/ATen/record_function.cpp index 41f31968688d..d4d2c99d9c37 100644 --- a/aten/src/ATen/record_function.cpp +++ b/aten/src/ATen/record_function.cpp @@ -11,11 +11,13 @@ namespace { // Used to generate unique callback handles CallbackHandle next_unique_callback_handle() { static std::atomic unique_cb_id {0}; + // starts with 1 return CallbackHandle(++unique_cb_id); } RecordFunctionHandle next_unique_record_function_handle() { static std::atomic unique_rf_id {0}; + // starts with 1 return RecordFunctionHandle(++unique_rf_id); } diff --git a/torch/autograd/profiler.py b/torch/autograd/profiler.py index 483802c7c9ec..0a4567b32eed 100644 --- a/torch/autograd/profiler.py +++ b/torch/autograd/profiler.py @@ -382,7 +382,8 @@ def __init__( self.use_kineto = use_kineto self.skip_cpu = skip_cpu if self.skip_cpu: - assert self.use_kineto, "skip_cpu is used with use_kineto=True" + assert self.use_kineto, \ + "skip_cpu is supported only with Kineto (use_kineto=True)" self.profiler_kind = None self.kineto_activities = [] @@ -397,6 +398,8 @@ def __init__( # uses CUPTI # torch.autograd.ProfilerActivity.CUDA_RUNTIME, torch.autograd.ProfilerActivity.CUDA] + assert len(self.kineto_activities) > 0, \ + "No activities specified for Kineto profiler" elif self.use_cuda: # legacy CUDA mode self.profiler_kind = torch.autograd.ProfilerState.CUDA @@ -425,7 +428,7 @@ def __enter__(self): if self.use_kineto: torch.autograd._prepare_profiler(self.config, self.kineto_activities) - torch.autograd._enable_profiler(self.config) + torch.autograd._enable_profiler(self.config, self.kineto_activities) return self def __exit__(self, exc_type, exc_val, exc_tb): diff --git a/torch/csrc/autograd/profiler.cpp b/torch/csrc/autograd/profiler.cpp index c9fcf3efabb3..9c15dd4279f4 100644 --- a/torch/csrc/autograd/profiler.cpp +++ b/torch/csrc/autograd/profiler.cpp @@ -175,9 +175,9 @@ struct FileLineFunc { thread_local size_t corr_id_ = 0; size_t next_correlation_id() { - return corr_id_++; + return ++corr_id_; } -size_t peek_correlation_id() { +size_t cur_correlation_id() { return corr_id_; } @@ -221,7 +221,7 @@ struct ProfilerThreadLocalState : public c10::MemoryReportingInfoBase { 
at::RecordFunction::currentThreadId(), include_cuda && config_.state == ProfilerState::CUDA); evt.setNodeId(at::RecordFunction::getDefaultNodeId()); - evt.setCorrelationId(peek_correlation_id()); + evt.setCorrelationId(cur_correlation_id()); getEventList().record(std::move(evt)); } } @@ -317,7 +317,7 @@ struct ProfilerThreadLocalState : public c10::MemoryReportingInfoBase { thread_id, config_.state == ProfilerState::CUDA); evt.updateMemoryStats(alloc_size, device); - evt.setCorrelationId(peek_correlation_id()); + evt.setCorrelationId(cur_correlation_id()); getEventList(thread_id).record(std::move(evt)); } } @@ -647,20 +647,26 @@ thread_event_lists disableProfilerLegacy(c10::optional p return state_ptr->consolidate(); } -void enableProfiler(const ProfilerConfig& new_config) { - TORCH_CHECK(new_config.state == ProfilerState::KINETO && kinetoAvailable()); +void enableProfiler( + const ProfilerConfig& config, + const std::set& activities) { + TORCH_CHECK(config.state == ProfilerState::KINETO && kinetoAvailable()); + TORCH_CHECK(!activities.empty(), "No activities specified for Kineto profiler"); auto state_ptr = getProfilerTLSState(); TORCH_CHECK(!state_ptr, "Profiler is already enabled on this thread"); - auto state = std::make_shared(new_config); + auto state = std::make_shared(config); c10::ThreadLocalDebugInfo::_push(c10::DebugInfoKind::PROFILER_STATE, state); - pushProfilingCallbacks(); + if (activities.count(ActivityType::CPU)) { + pushProfilingCallbacks(); + } #ifdef USE_KINETO - if (new_config.state == ProfilerState::KINETO) { + while (!libkineto::api().traceActive()) { // sync? libkineto::api().startTrace(); } + //TORCH_CHECK(libkineto::api().traceActive()); #endif state->mark("__start_profile", false); @@ -671,10 +677,10 @@ ProfilerResult disableProfiler() { auto state = c10::ThreadLocalDebugInfo::_pop(c10::DebugInfoKind::PROFILER_STATE); auto state_ptr = static_cast(state.get()); - TORCH_CHECK(state_ptr && state_ptr->config().state != ProfilerState::Disabled, - "Can't disable profiler when it's not running"); + TORCH_CHECK(state_ptr && state_ptr->config().state == ProfilerState::KINETO, + "Can't disable Kineto profiler when it's not running"); - if (cleanupTLSState) { + if (state_ptr->callbackHandle() > 0) { at::removeCallback(state_ptr->callbackHandle()); } diff --git a/torch/csrc/autograd/profiler.h b/torch/csrc/autograd/profiler.h index 64d3d289e443..d337841ad2ce 100644 --- a/torch/csrc/autograd/profiler.h +++ b/torch/csrc/autograd/profiler.h @@ -440,7 +440,9 @@ struct TORCH_API ProfilerResult { std::vector> events_; }; -TORCH_API void enableProfiler(const ProfilerConfig&); +TORCH_API void enableProfiler( + const ProfilerConfig& config, + const std::set& activities); TORCH_API ProfilerResult disableProfiler(); TORCH_API bool kinetoAvailable(); From 7dfdbc9a7d1e4364350ec7786f60d61d27242429 Mon Sep 17 00:00:00 2001 From: ilia-cher Date: Mon, 2 Nov 2020 11:44:00 -0800 Subject: [PATCH 04/59] Update on "Use libkineto in profiler" Summary: Adding ability to use Kineto (CUPTI) to profile CUDA kernels Test Plan: USE_KINETO=1 USE_CUDA=1 USE_MKLDNN=1 BLAS=MKL BUILD_BINARY=1 python setup.py develop install python test/test_profiler.py ``` ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls 
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ sgemm_32x32x32_NN 0.00% 0.000us 0.00% 0.000us 0.000us 12.000us 63.16% 12.000us 12.000us 1 void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 2.750us 14.47% 2.750us 2.750us 1 Memcpy HtoD (Pagable -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 2.250us 11.84% 2.250us 2.250us 1 Memcpy DtoH (Device -> Pagable) 0.00% 0.000us 0.00% 0.000us 0.000us 2.000us 10.53% 2.000us 2.000us 1 aten::mm 25.87% 364.400ms 25.87% 364.426ms 364.426ms 0.000us 0.00% 0.000us 0.000us 1 aten::empty 0.00% 39.585us 0.00% 39.585us 19.792us 0.000us 0.00% 0.000us 0.000us 2 aten::stride 0.00% 3.363us 0.00% 3.363us 1.121us 0.000us 0.00% 0.000us 0.000us 3 aten::add 74.12% 1.044s 74.12% 1.044s 1.044s 0.000us 0.00% 0.000us 0.000us 1 aten::to 0.00% 13.155us 0.01% 116.398us 116.398us 0.000us 0.00% 0.000us 0.000us 1 aten::empty_strided 0.00% 30.365us 0.00% 30.365us 30.365us 0.000us 0.00% 0.000us 0.000us 1 aten::copy_ 0.01% 72.878us 0.01% 72.878us 72.878us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ``` [ghstack-poisoned] --- aten/src/ATen/record_function.cpp | 10 ++++------ torch/autograd/profiler.py | 13 +++++++------ 2 files changed, 11 insertions(+), 12 deletions(-) diff --git a/aten/src/ATen/record_function.cpp b/aten/src/ATen/record_function.cpp index d4d2c99d9c37..f48705796f7a 100644 --- a/aten/src/ATen/record_function.cpp +++ b/aten/src/ATen/record_function.cpp @@ -10,15 +10,13 @@ namespace { // Used to generate unique callback handles CallbackHandle next_unique_callback_handle() { - static std::atomic unique_cb_id {0}; - // starts with 1 - return CallbackHandle(++unique_cb_id); + static std::atomic unique_cb_id {1}; + return CallbackHandle(unique_cb_id++); } RecordFunctionHandle next_unique_record_function_handle() { - static std::atomic unique_rf_id {0}; - // starts with 1 - return RecordFunctionHandle(++unique_rf_id); + static std::atomic unique_rf_id {1}; + return RecordFunctionHandle(unique_rf_id++); } thread_local RecordFunctionTLS rf_tls_; diff --git a/torch/autograd/profiler.py b/torch/autograd/profiler.py index 0a4567b32eed..fdbf059c5a95 100644 --- a/torch/autograd/profiler.py +++ b/torch/autograd/profiler.py @@ -326,7 +326,8 @@ class profile(object): with_stack (bool, optional): record source information (file and line number) for the ops use_kineto (bool, default False): experimental support for Kineto profiler - skip_cpu (default False) - whether to skip profiling of CPU events + + use_cpu (default True) - whether to profile CPU events .. 
warning: Enabling memory profiling or source attribution incurs additional profiler @@ -369,7 +370,7 @@ def __init__( profile_memory=False, with_stack=False, use_kineto=False, - skip_cpu=False): + use_cpu=True): self.enabled = enabled if not self.enabled: return @@ -380,16 +381,16 @@ def __init__( self.profile_memory = profile_memory self.with_stack = with_stack self.use_kineto = use_kineto - self.skip_cpu = skip_cpu - if self.skip_cpu: + self.use_cpu = use_cpu + if not self.use_cpu: assert self.use_kineto, \ - "skip_cpu is supported only with Kineto (use_kineto=True)" + "Device-only events supported only with Kineto (use_kineto=True)" self.profiler_kind = None self.kineto_activities = [] if self.use_kineto: self.profiler_kind = torch.autograd.ProfilerState.KINETO - if not self.skip_cpu: + if self.use_cpu: self.kineto_activities = [torch.autograd.ProfilerActivity.CPU] else: self.kineto_activities = [] From 67257785deea6b010db19a496e81fd9bbd282aa5 Mon Sep 17 00:00:00 2001 From: ilia-cher Date: Mon, 2 Nov 2020 12:39:53 -0800 Subject: [PATCH 05/59] Update on "Use libkineto in profiler" Summary: Adding ability to use Kineto (CUPTI) to profile CUDA kernels Test Plan: USE_KINETO=1 USE_CUDA=1 USE_MKLDNN=1 BLAS=MKL BUILD_BINARY=1 python setup.py develop install python test/test_profiler.py ``` ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ sgemm_32x32x32_NN 0.00% 0.000us 0.00% 0.000us 0.000us 12.000us 63.16% 12.000us 12.000us 1 void at::native::vectorized_elementwise_kernel<4, at... 
0.00% 0.000us 0.00% 0.000us 0.000us 2.750us 14.47% 2.750us 2.750us 1 Memcpy HtoD (Pagable -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 2.250us 11.84% 2.250us 2.250us 1 Memcpy DtoH (Device -> Pagable) 0.00% 0.000us 0.00% 0.000us 0.000us 2.000us 10.53% 2.000us 2.000us 1 aten::mm 25.87% 364.400ms 25.87% 364.426ms 364.426ms 0.000us 0.00% 0.000us 0.000us 1 aten::empty 0.00% 39.585us 0.00% 39.585us 19.792us 0.000us 0.00% 0.000us 0.000us 2 aten::stride 0.00% 3.363us 0.00% 3.363us 1.121us 0.000us 0.00% 0.000us 0.000us 3 aten::add 74.12% 1.044s 74.12% 1.044s 1.044s 0.000us 0.00% 0.000us 0.000us 1 aten::to 0.00% 13.155us 0.01% 116.398us 116.398us 0.000us 0.00% 0.000us 0.000us 1 aten::empty_strided 0.00% 30.365us 0.00% 30.365us 30.365us 0.000us 0.00% 0.000us 0.000us 1 aten::copy_ 0.01% 72.878us 0.01% 72.878us 72.878us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ``` [ghstack-poisoned] --- torch/autograd/profiler.py | 19 +++-- torch/csrc/autograd/init.cpp | 17 +++- torch/csrc/autograd/profiler.h | 146 +++++++++++++++++++++++++++------ 3 files changed, 148 insertions(+), 34 deletions(-) diff --git a/torch/autograd/profiler.py b/torch/autograd/profiler.py index fdbf059c5a95..ac8574b00ead 100644 --- a/torch/autograd/profiler.py +++ b/torch/autograd/profiler.py @@ -428,18 +428,23 @@ def __enter__(self): self.entered = True if self.use_kineto: torch.autograd._prepare_profiler(self.config, self.kineto_activities) - - torch.autograd._enable_profiler(self.config, self.kineto_activities) + torch.autograd._enable_profiler(self.config, self.kineto_activities) + else: + torch.autograd._enable_profiler_legacy(self.config) return self def __exit__(self, exc_type, exc_val, exc_tb): if not self.enabled: return - records = torch.autograd._disable_profiler() - self.function_events = EventList( - parse_event_records(records), - use_cuda=self.use_cuda, - profile_memory=self.profile_memory) + if self.use_kineto: + result = torch.autograd._disable_profiler() + self.function_events = parse_profiler_result(result) + else: + records = torch.autograd._disable_profiler_legacy() + self.function_events = EventList( + parse_event_records(records), + use_cuda=self.use_cuda, + profile_memory=self.profile_memory) if self.with_stack: self.function_events.set_backward_stacktraces() return False diff --git a/torch/csrc/autograd/init.cpp b/torch/csrc/autograd/init.cpp index c1b12ebd478b..4eb781b08885 100644 --- a/torch/csrc/autograd/init.cpp +++ b/torch/csrc/autograd/init.cpp @@ -70,9 +70,22 @@ PyObject* THPAutograd_initExtension(PyObject* _unused, PyObject *unused) { .def("scope", &LegacyEvent::scope) .def("correlation_id", &LegacyEvent::correlationId); + py::class_(m, "KinetoEvent") + .def("name", &KinetoEvent::name) + .def("thread_id", &KinetoEvent::threadId) + .def("device_index", &KinetoEvent::deviceIndex) + .def("start_us", &KinetoEvent::startUs) + .def("duration", &KinetoEvent::duration) + .def("correlation_id", &KinetoEvent::correlationId) + .def("fwd_thread_id", &KinetoEvent::fwdThreadId) + .def("shapes", &KinetoEvent::shapes) + .def("sequence_nr", &KinetoEvent::sequenceNr) + .def("stack", &KinetoEvent::stack) + .def("scope", &KinetoEvent::scope); + py::class_(m, "ProfilerResult") - .def("kind", &LegacyEvent::kindStr) - .def("scope", &LegacyEvent::scope); + .def("events", &ProfilerResult::events) + .def("legacy_events", &ProfilerResult::legacy_events); 
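Given the bindings registered above, trace post-processing can stay in Python. Below is a hedged sketch of a consumer: the method names come from the `.def(...)` lines in this hunk, but the nested list-of-lists shape of `events()` follows the C++ `ProfilerResult` layout and is otherwise an assumption.

```python
def flatten_kineto_events(result):
    # result: the ProfilerResult returned by torch.autograd._disable_profiler.
    rows = []
    for thread_events in result.events():  # assumed: one list per thread
        for ev in thread_events:
            rows.append({
                "name": ev.name(),
                "thread": ev.thread_id(),
                "start_us": ev.start_us(),
                "duration_us": ev.duration(),
                "correlation": ev.correlation_id(),
            })
    return sorted(rows, key=lambda r: r["start_us"])
```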
m.def("kineto_available", kinetoAvailable); m.def("_enable_profiler", enableProfiler); diff --git a/torch/csrc/autograd/profiler.h b/torch/csrc/autograd/profiler.h index d337841ad2ce..ebda8ad0f4dd 100644 --- a/torch/csrc/autograd/profiler.h +++ b/torch/csrc/autograd/profiler.h @@ -23,6 +23,10 @@ struct CUevent_st; typedef std::shared_ptr CUDAEventStub; +namespace libkineto { +class TraceActivity; +} + namespace torch { namespace autograd { struct Node; @@ -148,32 +152,10 @@ enum class C10_API_ENUM EventKind : uint16_t { PushRange, PopRange, MemoryAlloc, - // - Kineto, }; -struct TORCH_API Event { - explicit Event(EventKind kind) : kind_(kind) {} - EventKind kind() const { - return kind_; - } - - std::string kindStr() const { - switch (kind_) { - case EventKind::Mark: return "mark"; - case EventKind::PushRange: return "push"; - case EventKind::PopRange: return "pop"; - case EventKind::MemoryAlloc: return "memory_alloc"; - } - throw std::runtime_error("unknown event kind"); - } - - protected: - EventKind kind_; -} - // To be deprecated, once we switch to Kineto profiling -struct TORCH_API LegacyEvent : public Event { +struct TORCH_API LegacyEvent { LegacyEvent( EventKind kind, at::StringView name, @@ -352,7 +334,18 @@ struct TORCH_API LegacyEvent : public Event { scope_ = scope; } + std::string kindStr() const { + switch (kind_) { + case EventKind::Mark: return "mark"; + case EventKind::PushRange: return "push"; + case EventKind::PopRange: return "pop"; + case EventKind::MemoryAlloc: return "memory_alloc"; + } + throw std::runtime_error("unknown event kind"); + } + private: + EventKind kind_; // signed to allow for negative intervals, initialized for safety. int64_t cpu_ns_ = 0; at::StringView name_; @@ -431,14 +424,117 @@ TORCH_API ProfilerConfig getProfilerConfig(); // Writes profiled events to a stream. 
TORCH_API void writeProfilerEventsToStream(std::ostream& out, const std::vector<LegacyEvent*>& events);

+enum class C10_API_ENUM KinetoDeviceType : uint16_t {
+  CPU = 0,
+  CUDA,
+  NUM_KINETO_DEVICE_TYPES, // must be the last one
+};
+
+struct TORCH_API KinetoEvent {
+  KinetoEvent(libkineto::TraceActivity* activity) : activity_(activity) {}
+
+  std::string name() const;
+  uint64_t deviceIndex() const;
+  uint64_t startUs() const;
+  uint64_t durationUs() const;
+  uint64_t correlationId() const;
+
+  int64_t threadId() const {
+    return thread_id_;
+  }
+
+  KinetoDeviceType deviceType() const {
+    return device_type_;
+  }
+
+  int64_t fwdThreadId() const {
+    return fwd_thread_id_;
+  }
+
+  const std::vector<std::vector<int64_t>>& shapes() const {
+    return shapes_;
+  }
+
+  int64_t sequenceNr() const {
+    return sequence_nr_;
+  }
+
+  const std::vector<std::string>& stack() const {
+    return stack_;
+  }
+
+  uint8_t scope() const {
+    return scope_;
+  }
+
+  KinetoEvent& threadId(int64_t thread_id) {
+    thread_id_ = thread_id;
+    return *this;
+  }
+
+  KinetoEvent& deviceType(KinetoDeviceType device_type) {
+    device_type_ = device_type;
+    return *this;
+  }
+
+  KinetoEvent& fwdThreadId(int64_t fwd_thread_id) {
+    fwd_thread_id_ = fwd_thread_id;
+    return *this;
+  }
+
+  KinetoEvent& shapes(const std::vector<std::vector<int64_t>>& shapes) {
+    shapes_ = shapes;
+    return *this;
+  }
+
+  KinetoEvent& sequenceNr(int64_t sequence_nr) {
+    sequence_nr_ = sequence_nr;
+    return *this;
+  }
+
+  KinetoEvent& stack(const std::vector<std::string>& st) {
+    stack_ = st;
+    return *this;
+  }
+
+  KinetoEvent& scope(uint8_t scope_id) {
+    scope_ = scope_id;
+    return *this;
+  }
+
+ private:
+  //std::string name_;
+  //uint64_t device_index_;
+  //uint64_t start_us_;
+  //uint64_t duration_;
+  //uint64_t correlation_id_;
+
+  libkineto::TraceActivity* activity_ = nullptr;
+  int64_t thread_id_ = -1;
+  KinetoDeviceType device_type_ = KinetoDeviceType::CPU;
+  int64_t fwd_thread_id_ = -1;
+  std::vector<std::vector<int64_t>> shapes_;
+  int64_t sequence_nr_ = -1;
+  std::vector<std::string> stack_;
+  uint8_t scope_ = 0;
+};

 struct TORCH_API ProfilerResult {
-  thread_event_lists legacy_events_; // tensor mem alloc, start/stop
+  ProfilerResult(
+      const std::vector<std::vector<KinetoEvent>>& events,
+      const thread_event_lists& legacy_events)
+    : events_(events), legacy_events_(legacy_events) {}
+
+  const std::vector<std::vector<KinetoEvent>>& events() const {
+    return events_;
+  }
+
+  const thread_event_lists& legacy_events() const {
+    return legacy_events_;
+  }
+
+ private:
   std::vector<std::vector<KinetoEvent>> events_;
+  thread_event_lists legacy_events_; // tensor mem alloc, start/stop
 };

From e9a219b213c57ec96ca210f9568e515200154398 Mon Sep 17 00:00:00 2001
From: ilia-cher
Date: Mon, 2 Nov 2020 12:47:15 -0800
Subject: [PATCH 06/59] Update on "Use libkineto in profiler"

Summary:
Adding ability to use Kineto (CUPTI) to profile CUDA kernels

Test Plan:
USE_KINETO=1 USE_CUDA=1 USE_MKLDNN=1 BLAS=MKL BUILD_BINARY=1 python setup.py
develop install
python test/test_profiler.py
```
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------
                                      sgemm_32x32x32_NN         0.00%       0.000us         0.00%       0.000us       0.000us      12.000us        63.16%      12.000us      12.000us             1
void
at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 2.750us 14.47% 2.750us 2.750us 1 Memcpy HtoD (Pagable -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 2.250us 11.84% 2.250us 2.250us 1 Memcpy DtoH (Device -> Pagable) 0.00% 0.000us 0.00% 0.000us 0.000us 2.000us 10.53% 2.000us 2.000us 1 aten::mm 25.87% 364.400ms 25.87% 364.426ms 364.426ms 0.000us 0.00% 0.000us 0.000us 1 aten::empty 0.00% 39.585us 0.00% 39.585us 19.792us 0.000us 0.00% 0.000us 0.000us 2 aten::stride 0.00% 3.363us 0.00% 3.363us 1.121us 0.000us 0.00% 0.000us 0.000us 3 aten::add 74.12% 1.044s 74.12% 1.044s 1.044s 0.000us 0.00% 0.000us 0.000us 1 aten::to 0.00% 13.155us 0.01% 116.398us 116.398us 0.000us 0.00% 0.000us 0.000us 1 aten::empty_strided 0.00% 30.365us 0.00% 30.365us 30.365us 0.000us 0.00% 0.000us 0.000us 1 aten::copy_ 0.01% 72.878us 0.01% 72.878us 72.878us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ``` [ghstack-poisoned] --- torch/autograd/profiler.py | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/torch/autograd/profiler.py b/torch/autograd/profiler.py index ac8574b00ead..6fb37c2a25a8 100644 --- a/torch/autograd/profiler.py +++ b/torch/autograd/profiler.py @@ -777,7 +777,7 @@ class FunctionEvent(FormattedTimesMixin): def __init__( self, id, node_id, name, thread, cpu_start, cpu_end, fwd_thread=None, input_shapes=None, stack=None, scope=0, cpu_memory_usage=0, cuda_memory_usage=0, is_async=False, - is_remote=True, sequence_nr=-1, device_id=-1): + is_remote=True, sequence_nr=-1): self.id: int = id self.node_id: int = node_id self.name: str = name @@ -796,7 +796,6 @@ def __init__( self.is_async: bool = is_async self.is_remote: bool = is_remote self.sequence_nr: int = sequence_nr - self.device_id: int = device_id def append_kernel(self, name, device, start, end): self.kernels.append(Kernel(name, device, Interval(start, end))) @@ -848,21 +847,15 @@ def self_cpu_time_total(self): @property def cuda_time_total(self): - if self.device_id >= 0: - return self.cpu_interval.elapsed_us() return sum(kinfo.interval.elapsed_us() for kinfo in self.kernels) @property def self_cuda_time_total(self): - if self.device_id >= 0: - return self.cuda_time_total - sum([child.cuda_time_total for child in self.cpu_children]) return sum(kinfo.interval.elapsed_us() for kinfo in self.kernels) - \ sum([child.cuda_time_total for child in self.cpu_children]) @property def cpu_time_total(self): - if self.device_id >= 0: - return 0 return self.cpu_interval.elapsed_us() @property @@ -1097,7 +1090,6 @@ def adjusted_time(cuda_record, cuda_records_map): is_async=is_async, is_remote=is_remote_event, sequence_nr=start.sequence_nr(), - device_id=start.device_id(), ) # note: async events have only cpu total time if not is_async and start.has_cuda(): From 49a9fee5761f28fd2a982c4e6f61ef1058493d9b Mon Sep 17 00:00:00 2001 From: ilia-cher Date: Mon, 2 Nov 2020 13:01:56 -0800 Subject: [PATCH 07/59] Update on "Use libkineto in profiler" Summary: Adding ability to use Kineto (CUPTI) to profile CUDA kernels Test Plan: USE_KINETO=1 USE_CUDA=1 USE_MKLDNN=1 BLAS=MKL BUILD_BINARY=1 python setup.py develop install python test/test_profiler.py ``` ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % 
Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ sgemm_32x32x32_NN 0.00% 0.000us 0.00% 0.000us 0.000us 12.000us 63.16% 12.000us 12.000us 1 void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 2.750us 14.47% 2.750us 2.750us 1 Memcpy HtoD (Pagable -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 2.250us 11.84% 2.250us 2.250us 1 Memcpy DtoH (Device -> Pagable) 0.00% 0.000us 0.00% 0.000us 0.000us 2.000us 10.53% 2.000us 2.000us 1 aten::mm 25.87% 364.400ms 25.87% 364.426ms 364.426ms 0.000us 0.00% 0.000us 0.000us 1 aten::empty 0.00% 39.585us 0.00% 39.585us 19.792us 0.000us 0.00% 0.000us 0.000us 2 aten::stride 0.00% 3.363us 0.00% 3.363us 1.121us 0.000us 0.00% 0.000us 0.000us 3 aten::add 74.12% 1.044s 74.12% 1.044s 1.044s 0.000us 0.00% 0.000us 0.000us 1 aten::to 0.00% 13.155us 0.01% 116.398us 116.398us 0.000us 0.00% 0.000us 0.000us 1 aten::empty_strided 0.00% 30.365us 0.00% 30.365us 30.365us 0.000us 0.00% 0.000us 0.000us 1 aten::copy_ 0.01% 72.878us 0.01% 72.878us 72.878us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ``` [ghstack-poisoned] --- torch/csrc/autograd/profiler.cpp | 29 ++++++++++++++++++++++++++++- 1 file changed, 28 insertions(+), 1 deletion(-) diff --git a/torch/csrc/autograd/profiler.cpp b/torch/csrc/autograd/profiler.cpp index 9c15dd4279f4..a69ec92dd033 100644 --- a/torch/csrc/autograd/profiler.cpp +++ b/torch/csrc/autograd/profiler.cpp @@ -326,6 +326,29 @@ struct ProfilerThreadLocalState : public c10::MemoryReportingInfoBase { return config_.profile_memory; } + void reportKinetoClientActivity(const at::RecordFunction& fn) { +#ifdef USE_KINETO + if (config_.state == ProfilerState::KINETO) { + libkineto::ClientTraceActivity op; + op.startTime = libkineto::timeSinceEpoch(fc.startTime); + op.endTime = libkineto::timeSinceEpoch(now); + op.opType = fc.name; + op.device = fc.deviceType; + op.correlation = fc.correlationId; + op.threadId = pthread_self(); + op.inputDims = folly::toJson(fc.input_shapes); + op.inputTypes = folly::toJson(fc.input_types); + op.outputDims = "null"; + op.arguments = "null"; + op.outputTypes = "null"; + op.inputNames = "null"; + op.outputNames = "null"; + return; + } +#endif + TORCH_CHECK(false, "Supported only in Kineto profiler"); + } + private: std::vector prepareCallstack(const std::vector& cs) { std::vector entries; @@ -423,6 +446,10 @@ struct ProfilerThreadLocalState : public c10::MemoryReportingInfoBase { ProfilerConfig config_ = ProfilerConfig(ProfilerState::Disabled); at::CallbackHandle handle_ = 0; c10::optional>> remoteProfiledEvents_; + +#ifdef USE_KINETO + std::vector kineto_client_activities_; +#endif }; ProfilerThreadLocalState* getProfilerTLSState() { @@ -479,7 +506,7 @@ void pushProfilingCallbacks() { } #ifdef USE_KINETO if (state_ptr->config().state == ProfilerState::KINETO) { - // push new cpu trace event + state_ptr->reportKinetoClientActivity(fn); libkineto::api().popCorrelationId(); return; } From 8edb34641d900ef84f73c3b116eeef66ad6b8a34 Mon Sep 17 00:00:00 2001 From: ilia-cher Date: Mon, 2 Nov 2020 13:14:02 -0800 Subject: [PATCH 08/59] Update on "Use libkineto in profiler" Summary: Adding ability to 
use Kineto (CUPTI) to profile CUDA kernels Test Plan: USE_KINETO=1 USE_CUDA=1 USE_MKLDNN=1 BLAS=MKL BUILD_BINARY=1 python setup.py develop install python test/test_profiler.py ``` ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ sgemm_32x32x32_NN 0.00% 0.000us 0.00% 0.000us 0.000us 12.000us 63.16% 12.000us 12.000us 1 void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 2.750us 14.47% 2.750us 2.750us 1 Memcpy HtoD (Pagable -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 2.250us 11.84% 2.250us 2.250us 1 Memcpy DtoH (Device -> Pagable) 0.00% 0.000us 0.00% 0.000us 0.000us 2.000us 10.53% 2.000us 2.000us 1 aten::mm 25.87% 364.400ms 25.87% 364.426ms 364.426ms 0.000us 0.00% 0.000us 0.000us 1 aten::empty 0.00% 39.585us 0.00% 39.585us 19.792us 0.000us 0.00% 0.000us 0.000us 2 aten::stride 0.00% 3.363us 0.00% 3.363us 1.121us 0.000us 0.00% 0.000us 0.000us 3 aten::add 74.12% 1.044s 74.12% 1.044s 1.044s 0.000us 0.00% 0.000us 0.000us 1 aten::to 0.00% 13.155us 0.01% 116.398us 116.398us 0.000us 0.00% 0.000us 0.000us 1 aten::empty_strided 0.00% 30.365us 0.00% 30.365us 30.365us 0.000us 0.00% 0.000us 0.000us 1 aten::copy_ 0.01% 72.878us 0.01% 72.878us 72.878us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ``` [ghstack-poisoned] --- torch/csrc/autograd/profiler.cpp | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/torch/csrc/autograd/profiler.cpp b/torch/csrc/autograd/profiler.cpp index a69ec92dd033..7e27b6b69f4e 100644 --- a/torch/csrc/autograd/profiler.cpp +++ b/torch/csrc/autograd/profiler.cpp @@ -330,19 +330,18 @@ struct ProfilerThreadLocalState : public c10::MemoryReportingInfoBase { #ifdef USE_KINETO if (config_.state == ProfilerState::KINETO) { libkineto::ClientTraceActivity op; - op.startTime = libkineto::timeSinceEpoch(fc.startTime); + /*op.startTime = libkineto::timeSinceEpoch(fc.startTime); op.endTime = libkineto::timeSinceEpoch(now); - op.opType = fc.name; + op.opType = std::string(fn.name()); op.device = fc.deviceType; op.correlation = fc.correlationId; op.threadId = pthread_self(); op.inputDims = folly::toJson(fc.input_shapes); - op.inputTypes = folly::toJson(fc.input_types); - op.outputDims = "null"; - op.arguments = "null"; - op.outputTypes = "null"; - op.inputNames = "null"; - op.outputNames = "null"; + op.inputTypes = folly::toJson(fc.input_types);*/ + { + std::lock_guard guard(state_mutex_); + kineto_client_activities_.emplace_back(std::move(op)); + } return; } #endif @@ -449,6 +448,7 @@ struct ProfilerThreadLocalState : public c10::MemoryReportingInfoBase { #ifdef USE_KINETO std::vector kineto_client_activities_; + std::vector kineto_events_; #endif }; From f28862392a29ad92960bbfadf245ad4d0a78adc2 Mon Sep 17 00:00:00 2001 From: ilia-cher Date: Mon, 2 Nov 2020 13:39:36 -0800 Subject: [PATCH 09/59] Update on "Use libkineto in profiler" Summary: Adding ability to use Kineto (CUPTI) to profile CUDA kernels Test Plan: 
USE_KINETO=1 USE_CUDA=1 USE_MKLDNN=1 BLAS=MKL BUILD_BINARY=1 python setup.py develop install python test/test_profiler.py ``` ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ sgemm_32x32x32_NN 0.00% 0.000us 0.00% 0.000us 0.000us 12.000us 63.16% 12.000us 12.000us 1 void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 2.750us 14.47% 2.750us 2.750us 1 Memcpy HtoD (Pagable -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 2.250us 11.84% 2.250us 2.250us 1 Memcpy DtoH (Device -> Pagable) 0.00% 0.000us 0.00% 0.000us 0.000us 2.000us 10.53% 2.000us 2.000us 1 aten::mm 25.87% 364.400ms 25.87% 364.426ms 364.426ms 0.000us 0.00% 0.000us 0.000us 1 aten::empty 0.00% 39.585us 0.00% 39.585us 19.792us 0.000us 0.00% 0.000us 0.000us 2 aten::stride 0.00% 3.363us 0.00% 3.363us 1.121us 0.000us 0.00% 0.000us 0.000us 3 aten::add 74.12% 1.044s 74.12% 1.044s 1.044s 0.000us 0.00% 0.000us 0.000us 1 aten::to 0.00% 13.155us 0.01% 116.398us 116.398us 0.000us 0.00% 0.000us 0.000us 1 aten::empty_strided 0.00% 30.365us 0.00% 30.365us 30.365us 0.000us 0.00% 0.000us 0.000us 1 aten::copy_ 0.01% 72.878us 0.01% 72.878us 72.878us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ``` [ghstack-poisoned] --- torch/csrc/autograd/profiler.cpp | 42 ++++++++++++++++++++++++++++++-- torch/csrc/autograd/profiler.h | 16 ++++-------- 2 files changed, 45 insertions(+), 13 deletions(-) diff --git a/torch/csrc/autograd/profiler.cpp b/torch/csrc/autograd/profiler.cpp index 7e27b6b69f4e..3337b184e884 100644 --- a/torch/csrc/autograd/profiler.cpp +++ b/torch/csrc/autograd/profiler.cpp @@ -181,6 +181,41 @@ size_t cur_correlation_id() { return corr_id_; } +#ifdef USE_KINETO +struct KinetoEventImpl : public KinetoEvent { + static void fromClientActivity(const libkineto::ClientTraceActivity* activity) { + + } + + static void fromDeviceActivity(const libkineto::TraceActivity* activity) { + + } + + std::string name() const override { + + } + + uint64_t deviceIndex() const override { + + } + + uint64_t startUs() const override { + + } + + uint64_t durationUs() const override { + + } + + uint64_t correlationId() const override { + + } + + private: + libkineto::TraceActivity* activity_ptr_; +} +#endif + // Profiler state struct ProfilerThreadLocalState : public c10::MemoryReportingInfoBase { explicit ProfilerThreadLocalState(const ProfilerConfig& config) @@ -340,7 +375,10 @@ struct ProfilerThreadLocalState : public c10::MemoryReportingInfoBase { op.inputTypes = folly::toJson(fc.input_types);*/ { std::lock_guard guard(state_mutex_); - kineto_client_activities_.emplace_back(std::move(op)); + kineto_client_activities_.emplace_back(op); + kineto_events_.emplace_back( + KinetoEventImpl::fromClientActivity( + &(kineto_client_activities_.back()))); } return; } @@ -448,7 +486,7 @@ struct ProfilerThreadLocalState : public c10::MemoryReportingInfoBase { #ifdef USE_KINETO std::vector kineto_client_activities_; - std::vector 
kineto_events_; + std::vector kineto_events_; #endif }; diff --git a/torch/csrc/autograd/profiler.h b/torch/csrc/autograd/profiler.h index ebda8ad0f4dd..f7461fbd9632 100644 --- a/torch/csrc/autograd/profiler.h +++ b/torch/csrc/autograd/profiler.h @@ -23,10 +23,6 @@ struct CUevent_st; typedef std::shared_ptr CUDAEventStub; -namespace libkineto { -class TraceActivity; -} - namespace torch { namespace autograd { struct Node; @@ -431,13 +427,11 @@ enum class C10_API_ENUM KinetoDeviceType : uint16_t { }; struct TORCH_API KinetoEvent { - KinetoEvent(TraceActivity*) : activity_(activity) {} - - std::string name() const; - uint64_t deviceIndex() const; - uint64_t startUs() const; - uint64_t durationUs() const; - uint64_t correlationId() const; + virtual std::string name() const = 0; + virtual uint64_t deviceIndex() const = 0; + virtual uint64_t startUs() const = 0; + virtual uint64_t durationUs() const = 0; + virtual uint64_t correlationId() const = 0; int64_t threadId() const { return thread_id_; From 979cdfa3bb5cadc74a58ba0617d2e9574d70fa7d Mon Sep 17 00:00:00 2001 From: ilia-cher Date: Mon, 2 Nov 2020 19:24:36 -0800 Subject: [PATCH 10/59] Update on "Use libkineto in profiler" Summary: Adding ability to use Kineto (CUPTI) to profile CUDA kernels Test Plan: USE_KINETO=1 USE_CUDA=1 USE_MKLDNN=1 BLAS=MKL BUILD_BINARY=1 python setup.py develop install python test/test_profiler.py ``` ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ sgemm_32x32x32_NN 0.00% 0.000us 0.00% 0.000us 0.000us 12.000us 63.16% 12.000us 12.000us 1 void at::native::vectorized_elementwise_kernel<4, at... 
0.00% 0.000us 0.00% 0.000us 0.000us 2.750us 14.47% 2.750us 2.750us 1 Memcpy HtoD (Pagable -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 2.250us 11.84% 2.250us 2.250us 1 Memcpy DtoH (Device -> Pagable) 0.00% 0.000us 0.00% 0.000us 0.000us 2.000us 10.53% 2.000us 2.000us 1 aten::mm 25.87% 364.400ms 25.87% 364.426ms 364.426ms 0.000us 0.00% 0.000us 0.000us 1 aten::empty 0.00% 39.585us 0.00% 39.585us 19.792us 0.000us 0.00% 0.000us 0.000us 2 aten::stride 0.00% 3.363us 0.00% 3.363us 1.121us 0.000us 0.00% 0.000us 0.000us 3 aten::add 74.12% 1.044s 74.12% 1.044s 1.044s 0.000us 0.00% 0.000us 0.000us 1 aten::to 0.00% 13.155us 0.01% 116.398us 116.398us 0.000us 0.00% 0.000us 0.000us 1 aten::empty_strided 0.00% 30.365us 0.00% 30.365us 30.365us 0.000us 0.00% 0.000us 0.000us 1 aten::copy_ 0.01% 72.878us 0.01% 72.878us 72.878us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ``` [ghstack-poisoned] --- torch/csrc/autograd/profiler.cpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/torch/csrc/autograd/profiler.cpp b/torch/csrc/autograd/profiler.cpp index 3337b184e884..10d8e5b6cb98 100644 --- a/torch/csrc/autograd/profiler.cpp +++ b/torch/csrc/autograd/profiler.cpp @@ -192,27 +192,27 @@ struct KinetoEventImpl : public KinetoEvent { } std::string name() const override { - + return activity_ptr_->name(); } uint64_t deviceIndex() const override { - + return activity_ptr_->deviceId(); } uint64_t startUs() const override { - + return activity_ptr_->timestamp(); } uint64_t durationUs() const override { - + return activity_ptr_->duration(); } uint64_t correlationId() const override { - + return activity_ptr_->correlationId(); } private: - libkineto::TraceActivity* activity_ptr_; + libkineto::TraceActivity* activity_ptr_ = nullptr; } #endif From c8cbeb00a1527aa4f1b983f4f840c94b27c09b0b Mon Sep 17 00:00:00 2001 From: ilia-cher Date: Mon, 2 Nov 2020 19:48:05 -0800 Subject: [PATCH 11/59] Update on "Use libkineto in profiler" Summary: Adding ability to use Kineto (CUPTI) to profile CUDA kernels Test Plan: USE_KINETO=1 USE_CUDA=1 USE_MKLDNN=1 BLAS=MKL BUILD_BINARY=1 python setup.py develop install python test/test_profiler.py ``` ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ sgemm_32x32x32_NN 0.00% 0.000us 0.00% 0.000us 0.000us 12.000us 63.16% 12.000us 12.000us 1 void at::native::vectorized_elementwise_kernel<4, at... 
0.00% 0.000us 0.00% 0.000us 0.000us 2.750us 14.47% 2.750us 2.750us 1 Memcpy HtoD (Pagable -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 2.250us 11.84% 2.250us 2.250us 1 Memcpy DtoH (Device -> Pagable) 0.00% 0.000us 0.00% 0.000us 0.000us 2.000us 10.53% 2.000us 2.000us 1 aten::mm 25.87% 364.400ms 25.87% 364.426ms 364.426ms 0.000us 0.00% 0.000us 0.000us 1 aten::empty 0.00% 39.585us 0.00% 39.585us 19.792us 0.000us 0.00% 0.000us 0.000us 2 aten::stride 0.00% 3.363us 0.00% 3.363us 1.121us 0.000us 0.00% 0.000us 0.000us 3 aten::add 74.12% 1.044s 74.12% 1.044s 1.044s 0.000us 0.00% 0.000us 0.000us 1 aten::to 0.00% 13.155us 0.01% 116.398us 116.398us 0.000us 0.00% 0.000us 0.000us 1 aten::empty_strided 0.00% 30.365us 0.00% 30.365us 30.365us 0.000us 0.00% 0.000us 0.000us 1 aten::copy_ 0.01% 72.878us 0.01% 72.878us 72.878us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ``` [ghstack-poisoned] --- torch/csrc/autograd/profiler.h | 21 +++++---------------- 1 file changed, 5 insertions(+), 16 deletions(-) diff --git a/torch/csrc/autograd/profiler.h b/torch/csrc/autograd/profiler.h index f7461fbd9632..18bfae272fb7 100644 --- a/torch/csrc/autograd/profiler.h +++ b/torch/csrc/autograd/profiler.h @@ -20,6 +20,8 @@ #include +#include + struct CUevent_st; typedef std::shared_ptr CUDAEventStub; @@ -420,12 +422,6 @@ TORCH_API ProfilerConfig getProfilerConfig(); // Writes profiled events to a stream. TORCH_API void writeProfilerEventsToStream(std::ostream& out, const std::vector& events); -enum class C10_API_ENUM KinetoDeviceType : uint16_t { - CPU = 0, - CUDA, - NUM_KINETO_DEVICE_TYPES, // must be the last one -}; - struct TORCH_API KinetoEvent { virtual std::string name() const = 0; virtual uint64_t deviceIndex() const = 0; @@ -437,7 +433,7 @@ struct TORCH_API KinetoEvent { return thread_id_; } - KinetoDeviceType deviceType() const { + c10::DeviceType deviceType() const { return device_type_; } @@ -466,7 +462,7 @@ struct TORCH_API KinetoEvent { return *this; } - KinetoEvent& deviceType(KinetoDeviceType device_type) { + KinetoEvent& deviceType(c10::DeviceType device_type) { device_type_ = device_type; return *this; } @@ -497,15 +493,8 @@ struct TORCH_API KinetoEvent { } private: - //std::string name_; - //uint64_t device_index_; - //uint64_t start_us_; - //uint64_t duration_; - //uint64_t correlation_id_; - - TraceActivity* activity_ = nullptr; int64_t thread_id_ = -1; - KinetoDeviceType device_type_ = KinetoDeviceType::CPU, + c10::DeviceType device_type_ = c10::DeviceType::CPU, int64_t fwd_thread_id_ = -1; std::vector> shapes_; int64_t sequence_nr_ = -1; From 226089cb423a3ca9e865f8015437a7107960e6d8 Mon Sep 17 00:00:00 2001 From: ilia-cher Date: Mon, 2 Nov 2020 20:21:16 -0800 Subject: [PATCH 12/59] Update on "Use libkineto in profiler" Summary: Adding ability to use Kineto (CUPTI) to profile CUDA kernels Test Plan: USE_KINETO=1 USE_CUDA=1 USE_MKLDNN=1 BLAS=MKL BUILD_BINARY=1 python setup.py develop install python test/test_profiler.py ``` ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ 
------------ ------------ ------------ ------------ ------------ ------------ sgemm_32x32x32_NN 0.00% 0.000us 0.00% 0.000us 0.000us 12.000us 63.16% 12.000us 12.000us 1 void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 2.750us 14.47% 2.750us 2.750us 1 Memcpy HtoD (Pagable -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 2.250us 11.84% 2.250us 2.250us 1 Memcpy DtoH (Device -> Pagable) 0.00% 0.000us 0.00% 0.000us 0.000us 2.000us 10.53% 2.000us 2.000us 1 aten::mm 25.87% 364.400ms 25.87% 364.426ms 364.426ms 0.000us 0.00% 0.000us 0.000us 1 aten::empty 0.00% 39.585us 0.00% 39.585us 19.792us 0.000us 0.00% 0.000us 0.000us 2 aten::stride 0.00% 3.363us 0.00% 3.363us 1.121us 0.000us 0.00% 0.000us 0.000us 3 aten::add 74.12% 1.044s 74.12% 1.044s 1.044s 0.000us 0.00% 0.000us 0.000us 1 aten::to 0.00% 13.155us 0.01% 116.398us 116.398us 0.000us 0.00% 0.000us 0.000us 1 aten::empty_strided 0.00% 30.365us 0.00% 30.365us 30.365us 0.000us 0.00% 0.000us 0.000us 1 aten::copy_ 0.01% 72.878us 0.01% 72.878us 72.878us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ``` [ghstack-poisoned] --- torch/csrc/autograd/profiler.cpp | 53 ++++++++++++++++++-------------- torch/csrc/autograd/profiler.h | 16 +++++++--- 2 files changed, 41 insertions(+), 28 deletions(-) diff --git a/torch/csrc/autograd/profiler.cpp b/torch/csrc/autograd/profiler.cpp index 10d8e5b6cb98..d585a89704de 100644 --- a/torch/csrc/autograd/profiler.cpp +++ b/torch/csrc/autograd/profiler.cpp @@ -182,37 +182,44 @@ size_t cur_correlation_id() { } #ifdef USE_KINETO -struct KinetoEventImpl : public KinetoEvent { - static void fromClientActivity(const libkineto::ClientTraceActivity* activity) { - - } +std::string Kineto::name() const override { + return activity_->name(); +} - static void fromDeviceActivity(const libkineto::TraceActivity* activity) { +uint64_t Kineto::deviceIndex() const override { + return activity_->deviceId(); +} - } +uint64_t Kineto::startUs() const override { + return activity_->timestamp(); +} - std::string name() const override { - return activity_ptr_->name(); - } +uint64_t Kineto::durationUs() const override { + return activity_->duration(); +} - uint64_t deviceIndex() const override { - return activity_ptr_->deviceId(); - } +uint64_t Kineto::correlationId() const override { + return activity_->correlationId(); +} +#else +std::string Kineto::name() const override { + TORCH_CHECK(false, "Supported only with Kineto"); +} - uint64_t startUs() const override { - return activity_ptr_->timestamp(); - } +uint64_t Kineto::deviceIndex() const override { + TORCH_CHECK(false, "Supported only with Kineto"); +} - uint64_t durationUs() const override { - return activity_ptr_->duration(); - } +uint64_t Kineto::startUs() const override { + TORCH_CHECK(false, "Supported only with Kineto"); +} - uint64_t correlationId() const override { - return activity_ptr_->correlationId(); - } +uint64_t Kineto::durationUs() const override { + TORCH_CHECK(false, "Supported only with Kineto"); +} - private: - libkineto::TraceActivity* activity_ptr_ = nullptr; +uint64_t Kineto::correlationId() const override { + TORCH_CHECK(false, "Supported only with Kineto"); } #endif diff --git a/torch/csrc/autograd/profiler.h b/torch/csrc/autograd/profiler.h index 18bfae272fb7..cbd125565cf8 100644 --- a/torch/csrc/autograd/profiler.h +++ b/torch/csrc/autograd/profiler.h @@ 
-423,11 +423,15 @@ TORCH_API ProfilerConfig getProfilerConfig(); TORCH_API void writeProfilerEventsToStream(std::ostream& out, const std::vector& events); struct TORCH_API KinetoEvent { - virtual std::string name() const = 0; - virtual uint64_t deviceIndex() const = 0; - virtual uint64_t startUs() const = 0; - virtual uint64_t durationUs() const = 0; - virtual uint64_t correlationId() const = 0; + KinetoEvent(std::unique_ptr&& activity) : activity_(activity) { + TORCH_CHECK(activity_); + } + + std::string name() const; + uint64_t deviceIndex() const; + uint64_t startUs() const; + uint64_t durationUs() const; + uint64_t correlationId() const; int64_t threadId() const { return thread_id_; @@ -500,6 +504,8 @@ struct TORCH_API KinetoEvent { int64_t sequence_nr_ = -1; std::vector stack_; uint8_t scope_ = 0; + + std::unique_ptr activity_; }; struct TORCH_API ProfilerResult { From 266b75fb295099fc5e31b9da50e90d5e86c38d7c Mon Sep 17 00:00:00 2001 From: ilia-cher Date: Mon, 2 Nov 2020 20:33:06 -0800 Subject: [PATCH 13/59] Update on "Use libkineto in profiler" Summary: Adding ability to use Kineto (CUPTI) to profile CUDA kernels Test Plan: USE_KINETO=1 USE_CUDA=1 USE_MKLDNN=1 BLAS=MKL BUILD_BINARY=1 python setup.py develop install python test/test_profiler.py ``` ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ sgemm_32x32x32_NN 0.00% 0.000us 0.00% 0.000us 0.000us 12.000us 63.16% 12.000us 12.000us 1 void at::native::vectorized_elementwise_kernel<4, at... 
0.00% 0.000us 0.00% 0.000us 0.000us 2.750us 14.47% 2.750us 2.750us 1 Memcpy HtoD (Pagable -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 2.250us 11.84% 2.250us 2.250us 1 Memcpy DtoH (Device -> Pagable) 0.00% 0.000us 0.00% 0.000us 0.000us 2.000us 10.53% 2.000us 2.000us 1 aten::mm 25.87% 364.400ms 25.87% 364.426ms 364.426ms 0.000us 0.00% 0.000us 0.000us 1 aten::empty 0.00% 39.585us 0.00% 39.585us 19.792us 0.000us 0.00% 0.000us 0.000us 2 aten::stride 0.00% 3.363us 0.00% 3.363us 1.121us 0.000us 0.00% 0.000us 0.000us 3 aten::add 74.12% 1.044s 74.12% 1.044s 1.044s 0.000us 0.00% 0.000us 0.000us 1 aten::to 0.00% 13.155us 0.01% 116.398us 116.398us 0.000us 0.00% 0.000us 0.000us 1 aten::empty_strided 0.00% 30.365us 0.00% 30.365us 30.365us 0.000us 0.00% 0.000us 0.000us 1 aten::copy_ 0.01% 72.878us 0.01% 72.878us 72.878us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ``` [ghstack-poisoned] --- torch/csrc/autograd/profiler.cpp | 129 ++++++++----------------------- torch/csrc/autograd/profiler.h | 30 +++++-- 2 files changed, 57 insertions(+), 102 deletions(-) diff --git a/torch/csrc/autograd/profiler.cpp b/torch/csrc/autograd/profiler.cpp index d585a89704de..803d90964939 100644 --- a/torch/csrc/autograd/profiler.cpp +++ b/torch/csrc/autograd/profiler.cpp @@ -21,10 +21,6 @@ #include -#ifdef USE_KINETO -#include "libkineto.h" -#endif - namespace torch { namespace autograd { namespace profiler { namespace { @@ -181,48 +177,6 @@ size_t cur_correlation_id() { return corr_id_; } -#ifdef USE_KINETO -std::string Kineto::name() const override { - return activity_->name(); -} - -uint64_t Kineto::deviceIndex() const override { - return activity_->deviceId(); -} - -uint64_t Kineto::startUs() const override { - return activity_->timestamp(); -} - -uint64_t Kineto::durationUs() const override { - return activity_->duration(); -} - -uint64_t Kineto::correlationId() const override { - return activity_->correlationId(); -} -#else -std::string Kineto::name() const override { - TORCH_CHECK(false, "Supported only with Kineto"); -} - -uint64_t Kineto::deviceIndex() const override { - TORCH_CHECK(false, "Supported only with Kineto"); -} - -uint64_t Kineto::startUs() const override { - TORCH_CHECK(false, "Supported only with Kineto"); -} - -uint64_t Kineto::durationUs() const override { - TORCH_CHECK(false, "Supported only with Kineto"); -} - -uint64_t Kineto::correlationId() const override { - TORCH_CHECK(false, "Supported only with Kineto"); -} -#endif - // Profiler state struct ProfilerThreadLocalState : public c10::MemoryReportingInfoBase { explicit ProfilerThreadLocalState(const ProfilerConfig& config) @@ -502,7 +456,7 @@ ProfilerThreadLocalState* getProfilerTLSState() { return dynamic_cast(state.get()); } -void pushProfilingCallbacks() { +void pushProfilingCallbacksLegacy() { auto state_ptr = getProfilerTLSState(); TORCH_INTERNAL_ASSERT(state_ptr, "Expected profiler state set"); auto handle = at::addThreadLocalCallback(at::RecordFunctionCallback( @@ -511,12 +465,6 @@ void pushProfilingCallbacks() { if (!state_ptr || state_ptr->config().state == ProfilerState::Disabled) { return; } -#ifdef USE_KINETO - if (state_ptr->config().state == ProfilerState::KINETO) { - libkineto::api().pushCorrelationId(next_correlation_id()); - return; - } -#endif bool record_cuda = state_ptr->config().state == ProfilerState::CUDA; if (record_cuda && 
disable_cuda_profiling.find(fn.name().str()) != disable_cuda_profiling.end()) { @@ -549,13 +497,6 @@ void pushProfilingCallbacks() { if (!state_ptr || state_ptr->config().state == ProfilerState::Disabled) { return; } -#ifdef USE_KINETO - if (state_ptr->config().state == ProfilerState::KINETO) { - state_ptr->reportKinetoClientActivity(fn); - libkineto::api().popCorrelationId(); - return; - } -#endif bool record_cuda = state_ptr->config().state == ProfilerState::CUDA; if (record_cuda && disable_cuda_profiling.find(fn.name().str()) != disable_cuda_profiling.end()) { @@ -568,6 +509,32 @@ void pushProfilingCallbacks() { state_ptr->setCallbackHandle(handle); } +#ifdef USE_KINETO +void pushProfilingCallbacks() { + auto state_ptr = getProfilerTLSState(); + TORCH_INTERNAL_ASSERT(state_ptr, "Expected profiler state set"); + auto handle = at::addThreadLocalCallback(at::RecordFunctionCallback( + [](const at::RecordFunction& fn) { + auto state_ptr = getProfilerTLSState(); + if (!state_ptr || state_ptr->config().state != ProfilerState::KINETO) { + return; + } + + libkineto::api().pushCorrelationId(next_correlation_id()); + }, + [](const at::RecordFunction& fn) { + auto state_ptr = getProfilerTLSState(); + if (!state_ptr || state_ptr->config().state != ProfilerState::KINETO) { + return; + } + state_ptr->reportKinetoClientActivity(fn); + libkineto::api().popCorrelationId(); + }) + .needsInputs(state_ptr->config().report_input_shapes) + .needsIds(true)); + state_ptr->setCallbackHandle(handle); +} + const int kCUDAWarmupStart = 5; } // namespace @@ -669,7 +636,7 @@ void enableProfilerLegacy(const ProfilerConfig& new_config) { auto state = std::make_shared(new_config); c10::ThreadLocalDebugInfo::_push(c10::DebugInfoKind::PROFILER_STATE, state); - pushProfilingCallbacks(); + pushProfilingCallbacksLegacy(); if (new_config.state == ProfilerState::CUDA) { // event recording appears to have some startup overhead, so we need to @@ -719,6 +686,7 @@ thread_event_lists disableProfilerLegacy(c10::optional p return state_ptr->consolidate(); } +#ifdef USE_KINETO void enableProfiler( const ProfilerConfig& config, const std::set& activities) { @@ -734,12 +702,10 @@ void enableProfiler( pushProfilingCallbacks(); } -#ifdef USE_KINETO while (!libkineto::api().traceActive()) { // sync? 
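// startTrace() may return before the trace session is actually live,
// so keep retrying until traceActive() reports true.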
libkineto::api().startTrace(); } //TORCH_CHECK(libkineto::api().traceActive()); -#endif state->mark("__start_profile", false); } @@ -756,47 +722,16 @@ ProfilerResult disableProfiler() { at::removeCallback(state_ptr->callbackHandle()); } -#ifdef USE_KINETO if (state_ptr->config().state == ProfilerState::KINETO) { - auto k_events = libkineto::api().stopTrace(); - std::unordered_map>> events; - for (auto& k_evt : k_events) { - auto& evt_list = events[k_evt.deviceId][k_evt.threadId]; - Event push_evt( - EventKind::PushRange, - at::StringView(k_evt.name), - k_evt.threadId, - false, - k_evt.correlationId); - push_evt.setDeviceId(k_evt.deviceId); - push_evt.setCpuUs(k_evt.startUs); - push_evt.setCorrelationId(k_evt.correlationId); - evt_list.emplace_back(std::move(push_evt)); - - Event pop_evt( - EventKind::PopRange, - at::StringView(k_evt.name), - k_evt.threadId, - false, - k_evt.correlationId); - pop_evt.setDeviceId(k_evt.deviceId); - pop_evt.setCpuUs(k_evt.endUs); - pop_evt.setCorrelationId(k_evt.correlationId); - evt_list.emplace_back(std::move(pop_evt)); - } - std::vector> events_list; - for (const auto& it : events) { - for (const auto& it2 : it.second) { - events_list.emplace_back(it2.second); - } - } + auto trace = libkineto::api().stopTrace(); + // } -#endif state_ptr->mark("__stop_profile"); // Note that this will erase the underlying events. return state_ptr->consolidate(); } +#endif void addEventList(std::vector&& profiledEvents) { auto state_ptr = getProfilerTLSState(); diff --git a/torch/csrc/autograd/profiler.h b/torch/csrc/autograd/profiler.h index cbd125565cf8..62984deb0ced 100644 --- a/torch/csrc/autograd/profiler.h +++ b/torch/csrc/autograd/profiler.h @@ -22,6 +22,10 @@ #include +#ifdef USE_KINETO +#include "libkineto.h" +#endif + struct CUevent_st; typedef std::shared_ptr CUDAEventStub; @@ -422,16 +426,31 @@ TORCH_API ProfilerConfig getProfilerConfig(); // Writes profiled events to a stream. 
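// Each matched push/pop pair is emitted as one JSON trace event
// (rendered through event_template in profiler.cpp).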
TORCH_API void writeProfilerEventsToStream(std::ostream& out, const std::vector& events); +#ifdef USE_KINETO struct TORCH_API KinetoEvent { KinetoEvent(std::unique_ptr&& activity) : activity_(activity) { TORCH_CHECK(activity_); } - std::string name() const; - uint64_t deviceIndex() const; - uint64_t startUs() const; - uint64_t durationUs() const; - uint64_t correlationId() const; + std::string name() const override { + return activity_->name(); + } + + uint64_t deviceIndex() const override { + return activity_->deviceId(); + } + + uint64_t startUs() const override { + return activity_->timestamp(); + } + + uint64_t durationUs() const override { + return activity_->duration(); + } + + uint64_t correlationId() const override { + return activity_->correlationId(); + } int64_t threadId() const { return thread_id_; @@ -534,6 +553,7 @@ TORCH_API bool kinetoAvailable(); TORCH_API void prepareProfiler( const ProfilerConfig& config, const std::set& activities); +#endif // USE_KINETO // Usage: // { From 6958eac3863aafefcf565e5a4412bbd309060e42 Mon Sep 17 00:00:00 2001 From: ilia-cher Date: Mon, 2 Nov 2020 20:34:13 -0800 Subject: [PATCH 14/59] Update on "Use libkineto in profiler" Summary: Adding ability to use Kineto (CUPTI) to profile CUDA kernels Test Plan: USE_KINETO=1 USE_CUDA=1 USE_MKLDNN=1 BLAS=MKL BUILD_BINARY=1 python setup.py develop install python test/test_profiler.py ``` ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ sgemm_32x32x32_NN 0.00% 0.000us 0.00% 0.000us 0.000us 12.000us 63.16% 12.000us 12.000us 1 void at::native::vectorized_elementwise_kernel<4, at... 
0.00% 0.000us 0.00% 0.000us 0.000us 2.750us 14.47% 2.750us 2.750us 1 Memcpy HtoD (Pagable -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 2.250us 11.84% 2.250us 2.250us 1 Memcpy DtoH (Device -> Pagable) 0.00% 0.000us 0.00% 0.000us 0.000us 2.000us 10.53% 2.000us 2.000us 1 aten::mm 25.87% 364.400ms 25.87% 364.426ms 364.426ms 0.000us 0.00% 0.000us 0.000us 1 aten::empty 0.00% 39.585us 0.00% 39.585us 19.792us 0.000us 0.00% 0.000us 0.000us 2 aten::stride 0.00% 3.363us 0.00% 3.363us 1.121us 0.000us 0.00% 0.000us 0.000us 3 aten::add 74.12% 1.044s 74.12% 1.044s 1.044s 0.000us 0.00% 0.000us 0.000us 1 aten::to 0.00% 13.155us 0.01% 116.398us 116.398us 0.000us 0.00% 0.000us 0.000us 1 aten::empty_strided 0.00% 30.365us 0.00% 30.365us 30.365us 0.000us 0.00% 0.000us 0.000us 1 aten::copy_ 0.01% 72.878us 0.01% 72.878us 72.878us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ``` [ghstack-poisoned] --- torch/csrc/autograd/profiler.cpp | 42 ++++++++++++++++---------------- 1 file changed, 21 insertions(+), 21 deletions(-) diff --git a/torch/csrc/autograd/profiler.cpp b/torch/csrc/autograd/profiler.cpp index 803d90964939..e0f5c195bc10 100644 --- a/torch/csrc/autograd/profiler.cpp +++ b/torch/csrc/autograd/profiler.cpp @@ -211,7 +211,7 @@ struct ProfilerThreadLocalState : public c10::MemoryReportingInfoBase { if (config_.state == ProfilerState::NVTX) { cuda_stubs->nvtxMarkA(name.c_str()); } else { - Event evt( + LegacyEvent evt( EventKind::Mark, at::StringView(std::move(name)), at::RecordFunction::currentThreadId(), @@ -223,7 +223,7 @@ struct ProfilerThreadLocalState : public c10::MemoryReportingInfoBase { } void setOrAddRemoteProfiledEvents( - std::vector&& remoteProfiledEvents) { + std::vector&& remoteProfiledEvents) { // Lock to serialize access from multiple callback threads. 
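// The first batch of remotely profiled events initializes
// remoteProfiledEvents_; later batches are appended under the same lock.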
std::lock_guard guard(state_mutex_); if (remoteProfiledEvents_) { @@ -245,7 +245,7 @@ struct ProfilerThreadLocalState : public c10::MemoryReportingInfoBase { cuda_stubs->nvtxRangePushA(getNvtxStr( fn.name(), msg, fn.seqNr(), shapes).c_str()); } else { - Event evt( + LegacyEvent evt( EventKind::PushRange, fn.name(), at::RecordFunction::currentThreadId(), @@ -282,7 +282,7 @@ struct ProfilerThreadLocalState : public c10::MemoryReportingInfoBase { // called on a different thread than pushRange // As a convention, we put the async pop on the original // thread and save current thread id in pop event - Event evt( + LegacyEvent evt( EventKind::PopRange, at::StringView(""), at::RecordFunction::currentThreadId(), @@ -307,7 +307,7 @@ struct ProfilerThreadLocalState : public c10::MemoryReportingInfoBase { c10::Device device) override { if (config_.profile_memory && config_.state != ProfilerState::Disabled) { uint64_t thread_id = at::RecordFunction::currentThreadId(); - Event evt( + LegacyEvent evt( EventKind::MemoryAlloc, at::StringView(""), thread_id, @@ -443,7 +443,7 @@ struct ProfilerThreadLocalState : public c10::MemoryReportingInfoBase { ProfilerConfig config_ = ProfilerConfig(ProfilerState::Disabled); at::CallbackHandle handle_ = 0; - c10::optional>> remoteProfiledEvents_; + c10::optional>> remoteProfiledEvents_; #ifdef USE_KINETO std::vector kineto_client_activities_; @@ -733,13 +733,13 @@ ProfilerResult disableProfiler() { } #endif -void addEventList(std::vector&& profiledEvents) { +void addEventList(std::vector&& profiledEvents) { auto state_ptr = getProfilerTLSState(); TORCH_CHECK(state_ptr, "Profiler must be enabled."); state_ptr->setOrAddRemoteProfiledEvents(std::move(profiledEvents)); } -void Event::record(bool record_cuda) { +void LegacyEvent::record(bool record_cuda) { if (record_cuda) { cuda_stubs->record(&device_, &cuda_event, &cpu_ns_); return; @@ -747,7 +747,7 @@ void Event::record(bool record_cuda) { cpu_ns_ = getTime(); } -/* static */ Event Event::fromIValue(const at::IValue& eventIValue) { +/* static */ LegacyEvent LegacyEvent::fromIValue(const at::IValue& eventIValue) { TORCH_INTERNAL_ASSERT( eventIValue.isList(), "Expected IValue to contain type c10::impl::GenericList"); @@ -756,7 +756,7 @@ void Event::record(bool record_cuda) { ivalues.size() >= NUM_EVENT_IVALUE_IDX, "Expected at least ", NUM_EVENT_IVALUE_IDX, - " elements to reconstruct Event."); + " elements to reconstruct LegacyEvent."); // Reconstruct input shapes from ivalues. 
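// Shapes are serialized as a list of int lists, one inner list per input
// tensor (e.g. [[10, 10], [10, 10]] for a binary op on two 10x10 tensors).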
auto shapeListIValue = ivalues.get(EventIValueIdx::SHAPES); @@ -782,7 +782,7 @@ void Event::record(bool record_cuda) { shapes.emplace_back(s); } - Event evt( + LegacyEvent evt( static_cast( ivalues.get(EventIValueIdx::KIND).toInt()), // EventKind at::StringView(ivalues.get(EventIValueIdx::NAME).toStringRef()), // name @@ -802,7 +802,7 @@ void Event::record(bool record_cuda) { return evt; } -at::IValue Event::toIValue() const { +at::IValue LegacyEvent::toIValue() const { c10::impl::GenericList eventIValueList(at::AnyType::get()); eventIValueList.reserve(NUM_EVENT_IVALUE_IDX); eventIValueList.emplace_back(static_cast(kind_)); @@ -834,7 +834,7 @@ at::IValue Event::toIValue() const { return at::IValue(eventIValueList); } -double Event::cudaElapsedUs(const Event& e) const { +double LegacyEvent::cudaElapsedUs(const LegacyEvent& e) const { TORCH_CHECK(e.hasCuda() && hasCuda(), "Events were not recorded for CUDA"); TORCH_CHECK( e.device() == device(), @@ -862,10 +862,10 @@ static jit::CodeTemplate event_template(R"( "args": {} })"); -void writeProfilerEventsToStream(std::ostream& out, const std::vector& events) { +void writeProfilerEventsToStream(std::ostream& out, const std::vector& events) { TORCH_CHECK(out, "Could not open file"); - Event* profiler_start = nullptr; - for (Event* e : events) { + LegacyEvent* profiler_start = nullptr; + for (LegacyEvent* e : events) { if (0 == strcmp(e->name(), "__start_profile")) { profiler_start = e; break; @@ -879,10 +879,10 @@ void writeProfilerEventsToStream(std::ostream& out, const std::vector& e return std::hash()(p.first) ^ std::hash()(p.second); } }; - std::unordered_map, Event*, PairHash> events_map; + std::unordered_map, LegacyEvent*, PairHash> events_map; out << "[\n"; bool first = true; - for (Event* evt : events) { + for (LegacyEvent* evt : events) { if (evt->kind() == "push") { events_map[std::make_pair(evt->handle(), evt->nodeId())] = evt; } else if (evt->kind() == "pop") { @@ -892,7 +892,7 @@ void writeProfilerEventsToStream(std::ostream& out, const std::vector& e first = false; auto it = events_map.find(std::make_pair(evt->handle(), evt->nodeId())); TORCH_CHECK(it != events_map.end(), "Unmatched pop event"); - Event* evt_start = it->second; + LegacyEvent* evt_start = it->second; events_map.erase(it); jit::TemplateEnv env; @@ -923,7 +923,7 @@ void RecordProfile::init() { RecordProfile::~RecordProfile() { thread_event_lists event_lists = disableProfiler(); - std::vector events; + std::vector events; for (auto& l : event_lists) { for (auto& e : l) { events.push_back(&e); @@ -935,7 +935,7 @@ RecordProfile::~RecordProfile() { } } -void RecordProfile::processEvents(const std::vector& events) { +void RecordProfile::processEvents(const std::vector& events) { writeProfilerEventsToStream(out_, events); } From 97e5070d0277d807fbff7a5678ad432fb7945ece Mon Sep 17 00:00:00 2001 From: ilia-cher Date: Mon, 2 Nov 2020 21:00:00 -0800 Subject: [PATCH 15/59] Update on "Use libkineto in profiler" Summary: Adding ability to use Kineto (CUPTI) to profile CUDA kernels Test Plan: USE_KINETO=1 USE_CUDA=1 USE_MKLDNN=1 BLAS=MKL BUILD_BINARY=1 python setup.py develop install python test/test_profiler.py ``` ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ 
------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ sgemm_32x32x32_NN 0.00% 0.000us 0.00% 0.000us 0.000us 12.000us 63.16% 12.000us 12.000us 1 void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 2.750us 14.47% 2.750us 2.750us 1 Memcpy HtoD (Pagable -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 2.250us 11.84% 2.250us 2.250us 1 Memcpy DtoH (Device -> Pagable) 0.00% 0.000us 0.00% 0.000us 0.000us 2.000us 10.53% 2.000us 2.000us 1 aten::mm 25.87% 364.400ms 25.87% 364.426ms 364.426ms 0.000us 0.00% 0.000us 0.000us 1 aten::empty 0.00% 39.585us 0.00% 39.585us 19.792us 0.000us 0.00% 0.000us 0.000us 2 aten::stride 0.00% 3.363us 0.00% 3.363us 1.121us 0.000us 0.00% 0.000us 0.000us 3 aten::add 74.12% 1.044s 74.12% 1.044s 1.044s 0.000us 0.00% 0.000us 0.000us 1 aten::to 0.00% 13.155us 0.01% 116.398us 116.398us 0.000us 0.00% 0.000us 0.000us 1 aten::empty_strided 0.00% 30.365us 0.00% 30.365us 30.365us 0.000us 0.00% 0.000us 0.000us 1 aten::copy_ 0.01% 72.878us 0.01% 72.878us 72.878us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ``` [ghstack-poisoned] --- torch/csrc/autograd/profiler.cpp | 6 +-- torch/csrc/autograd/profiler.h | 76 ++++++++++++++++++++------------ 2 files changed, 50 insertions(+), 32 deletions(-) diff --git a/torch/csrc/autograd/profiler.cpp b/torch/csrc/autograd/profiler.cpp index e0f5c195bc10..c4eae2dbcfc9 100644 --- a/torch/csrc/autograd/profiler.cpp +++ b/torch/csrc/autograd/profiler.cpp @@ -534,6 +534,7 @@ void pushProfilingCallbacks() { .needsIds(true)); state_ptr->setCallbackHandle(handle); } +#endif const int kCUDAWarmupStart = 5; @@ -690,7 +691,7 @@ thread_event_lists disableProfilerLegacy(c10::optional p void enableProfiler( const ProfilerConfig& config, const std::set& activities) { - TORCH_CHECK(config.state == ProfilerState::KINETO && kinetoAvailable()); + TORCH_CHECK(config.state == ProfilerState::KINETO); TORCH_CHECK(!activities.empty(), "No activities specified for Kineto profiler"); auto state_ptr = getProfilerTLSState(); @@ -702,10 +703,9 @@ void enableProfiler( pushProfilingCallbacks(); } - while (!libkineto::api().traceActive()) { // sync? 
+ if (!libkineto::api().traceActive()) { libkineto::api().startTrace(); } - //TORCH_CHECK(libkineto::api().traceActive()); state->mark("__start_profile", false); } diff --git a/torch/csrc/autograd/profiler.h b/torch/csrc/autograd/profiler.h index 62984deb0ced..10eae6efcc67 100644 --- a/torch/csrc/autograd/profiler.h +++ b/torch/csrc/autograd/profiler.h @@ -428,30 +428,6 @@ TORCH_API void writeProfilerEventsToStream(std::ostream& out, const std::vector< #ifdef USE_KINETO struct TORCH_API KinetoEvent { - KinetoEvent(std::unique_ptr&& activity) : activity_(activity) { - TORCH_CHECK(activity_); - } - - std::string name() const override { - return activity_->name(); - } - - uint64_t deviceIndex() const override { - return activity_->deviceId(); - } - - uint64_t startUs() const override { - return activity_->timestamp(); - } - - uint64_t durationUs() const override { - return activity_->duration(); - } - - uint64_t correlationId() const override { - return activity_->correlationId(); - } - int64_t threadId() const { return thread_id_; } @@ -515,16 +491,57 @@ struct TORCH_API KinetoEvent { return *this; } + // Kineto fields + + KinetoEvent& activity(const libkineto::TraceActivity& activity) { + name_ = activity.name(); + deviceIndex_ = activity.deviceId(); + startUs_ = activity.timestamp(); + durationUs_ = activity.duration(); + correlationId_ = activity.correlationId(); + return *this; + } + + std::string name() const { + return name_; + } + + uint64_t deviceIndex() const { + return deviceIndex_; + } + + uint64_t startUs() const { + return startUs_; + } + + uint64_t durationUs() const { + return durationUs_; + } + + uint64_t correlationId() const { + return correlationId_; + } + + KinetoEvent& correlationId(uint64_t correlationId) { + correlationId_ = correlationId; + return *this; + } + private: - int64_t thread_id_ = -1; + int64_t thread_id_ = 0; c10::DeviceType device_type_ = c10::DeviceType::CPU, - int64_t fwd_thread_id_ = -1; + int64_t fwd_thread_id_ = 0; std::vector> shapes_; - int64_t sequence_nr_ = -1; + int64_t sequence_nr_ = 0; std::vector stack_; uint8_t scope_ = 0; - std::unique_ptr activity_; + std::string name_; + uint64_t deviceIndex_ = 0; + uint64_t startUs_ = 0; + uint64_t durationUs_ = 0; + uint64_t correlationId_ = 0; + }; struct TORCH_API ProfilerResult { @@ -549,12 +566,13 @@ TORCH_API void enableProfiler( const std::set& activities); TORCH_API ProfilerResult disableProfiler(); -TORCH_API bool kinetoAvailable(); TORCH_API void prepareProfiler( const ProfilerConfig& config, const std::set& activities); #endif // USE_KINETO +TORCH_API bool kinetoAvailable(); + // Usage: // { // RecordProfile guard("filename.trace"); From 8d111d282b96b7fa44c70a6d440d3f307e23b8c4 Mon Sep 17 00:00:00 2001 From: ilia-cher Date: Mon, 2 Nov 2020 21:05:43 -0800 Subject: [PATCH 16/59] Update on "Use libkineto in profiler" Summary: Adding ability to use Kineto (CUPTI) to profile CUDA kernels Test Plan: USE_KINETO=1 USE_CUDA=1 USE_MKLDNN=1 BLAS=MKL BUILD_BINARY=1 python setup.py develop install python test/test_profiler.py ``` ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ 
------------ sgemm_32x32x32_NN 0.00% 0.000us 0.00% 0.000us 0.000us 12.000us 63.16% 12.000us 12.000us 1 void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 2.750us 14.47% 2.750us 2.750us 1 Memcpy HtoD (Pagable -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 2.250us 11.84% 2.250us 2.250us 1 Memcpy DtoH (Device -> Pagable) 0.00% 0.000us 0.00% 0.000us 0.000us 2.000us 10.53% 2.000us 2.000us 1 aten::mm 25.87% 364.400ms 25.87% 364.426ms 364.426ms 0.000us 0.00% 0.000us 0.000us 1 aten::empty 0.00% 39.585us 0.00% 39.585us 19.792us 0.000us 0.00% 0.000us 0.000us 2 aten::stride 0.00% 3.363us 0.00% 3.363us 1.121us 0.000us 0.00% 0.000us 0.000us 3 aten::add 74.12% 1.044s 74.12% 1.044s 1.044s 0.000us 0.00% 0.000us 0.000us 1 aten::to 0.00% 13.155us 0.01% 116.398us 116.398us 0.000us 0.00% 0.000us 0.000us 1 aten::empty_strided 0.00% 30.365us 0.00% 30.365us 30.365us 0.000us 0.00% 0.000us 0.000us 1 aten::copy_ 0.01% 72.878us 0.01% 72.878us 72.878us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ``` [ghstack-poisoned] --- torch/autograd/__init__.py | 8 +++++--- torch/csrc/autograd/init.cpp | 5 ++++- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/torch/autograd/__init__.py b/torch/autograd/__init__.py index e2ccf47ce923..d8d10ab4c2e6 100644 --- a/torch/autograd/__init__.py +++ b/torch/autograd/__init__.py @@ -243,7 +243,9 @@ def variable(*args, **kwargs): # Import all native method/classes from torch._C._autograd import (ProfilerActivity, ProfilerState, ProfilerConfig, ProfilerEvent, - ProfilerResult, KinetoEvent, - _enable_profiler_legacy, _disable_profiler_legacy, - _prepare_profiler, _enable_profiler, _disable_profiler, _profiler_enabled, + _enable_profiler_legacy, _disable_profiler_legacy, _profiler_enabled, _enable_record_function, _set_empty_test_observer, kineto_available) + +if kineto_available(): + from torch._C._autograd import (ProfilerResult, KinetoEvent, + _prepare_profiler, _enable_profiler, _disable_profiler) diff --git a/torch/csrc/autograd/init.cpp b/torch/csrc/autograd/init.cpp index 4eb781b08885..153c815050fc 100644 --- a/torch/csrc/autograd/init.cpp +++ b/torch/csrc/autograd/init.cpp @@ -70,6 +70,7 @@ PyObject* THPAutograd_initExtension(PyObject* _unused, PyObject *unused) { .def("scope", &LegacyEvent::scope) .def("correlation_id", &LegacyEvent::correlationId); +#ifdef USE_KINETO py::class_(m, "KinetoEvent") .def("name", &KinetoEvent::name) .def("thread_id", &KinetoEvent::threadId) @@ -87,10 +88,12 @@ PyObject* THPAutograd_initExtension(PyObject* _unused, PyObject *unused) { .def("events", &ProfilerResult::events) .def("legacy_events", &ProfilerResult::legacy_events); - m.def("kineto_available", kinetoAvailable); m.def("_enable_profiler", enableProfiler); m.def("_disable_profiler", disableProfiler); m.def("_prepare_profiler", prepareProfiler); +#endif + + m.def("kineto_available", kinetoAvailable); m.def("_enable_profiler_legacy", enableProfilerLegacy); py::class_(m, "_ProfilerDisableOptions") From bfb03607bee9686a288c409f0d8e16f55afbf7bf Mon Sep 17 00:00:00 2001 From: ilia-cher Date: Mon, 2 Nov 2020 21:16:54 -0800 Subject: [PATCH 17/59] Update on "Use libkineto in profiler" Summary: Adding ability to use Kineto (CUPTI) to profile CUDA kernels Test Plan: USE_KINETO=1 USE_CUDA=1 USE_MKLDNN=1 BLAS=MKL BUILD_BINARY=1 python setup.py develop install python 
test/test_profiler.py
```
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
sgemm_32x32x32_NN 0.00% 0.000us 0.00% 0.000us 0.000us 12.000us 63.16% 12.000us 12.000us 1
void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 2.750us 14.47% 2.750us 2.750us 1
Memcpy HtoD (Pagable -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 2.250us 11.84% 2.250us 2.250us 1
Memcpy DtoH (Device -> Pagable) 0.00% 0.000us 0.00% 0.000us 0.000us 2.000us 10.53% 2.000us 2.000us 1
aten::mm 25.87% 364.400ms 25.87% 364.426ms 364.426ms 0.000us 0.00% 0.000us 0.000us 1
aten::empty 0.00% 39.585us 0.00% 39.585us 19.792us 0.000us 0.00% 0.000us 0.000us 2
aten::stride 0.00% 3.363us 0.00% 3.363us 1.121us 0.000us 0.00% 0.000us 0.000us 3
aten::add 74.12% 1.044s 74.12% 1.044s 1.044s 0.000us 0.00% 0.000us 0.000us 1
aten::to 0.00% 13.155us 0.01% 116.398us 116.398us 0.000us 0.00% 0.000us 0.000us 1
aten::empty_strided 0.00% 30.365us 0.00% 30.365us 30.365us 0.000us 0.00% 0.000us 0.000us 1
aten::copy_ 0.01% 72.878us 0.01% 72.878us 72.878us 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
```
[ghstack-poisoned]
---
torch/csrc/autograd/profiler.h | 37 ++++++++++++++++------------------
1 file changed, 17 insertions(+), 20 deletions(-)
diff --git a/torch/csrc/autograd/profiler.h b/torch/csrc/autograd/profiler.h
index 10eae6efcc67..9df0c263c714 100644
--- a/torch/csrc/autograd/profiler.h
+++ b/torch/csrc/autograd/profiler.h
@@ -19,7 +19,6 @@
#endif
#include
-
#include
#ifdef USE_KINETO
@@ -175,7 +174,7 @@ struct TORCH_API LegacyEvent {
record(record_cuda);
}
- // Constructor to be used in conjunction with Event::fromIValue.
+ // Constructor to be used in conjunction with LegacyEvent::fromIValue.
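// (fromIValue() rebuilds events that remote workers ship back as IValue
// lists during RPC profiling.)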
LegacyEvent( EventKind kind, at::StringView name, @@ -219,6 +218,16 @@ struct TORCH_API LegacyEvent { void record(bool record_cuda); + std::string kindStr() const { + switch (kind_) { + case EventKind::Mark: return "mark"; + case EventKind::PushRange: return "push"; + case EventKind::PopRange: return "pop"; + case EventKind::MemoryAlloc: return "memory_alloc"; + } + throw std::runtime_error("unknown event kind"); + } + const char* name() const { return name_.str(); } @@ -239,16 +248,16 @@ struct TORCH_API LegacyEvent { return cpu_ns_ / (1000.0); } - void setCpuUs(double cpu_us) { - cpu_ns_ = (int64_t)(cpu_us * 1000); - } - double cudaElapsedUs(const LegacyEvent& e) const; bool hasCuda() const { return cuda_event != nullptr || (isRemote() && device_ != -1); } + int device() const { + return device_; + } + void updateMemoryStats(int64_t alloc_size, c10::Device device) { if (device.type() == c10::DeviceType::CUDA || device.type() == c10::DeviceType::HIP) { @@ -336,21 +345,11 @@ struct TORCH_API LegacyEvent { scope_ = scope; } - std::string kindStr() const { - switch (kind_) { - case EventKind::Mark: return "mark"; - case EventKind::PushRange: return "push"; - case EventKind::PopRange: return "pop"; - case EventKind::MemoryAlloc: return "memory_alloc"; - } - throw std::runtime_error("unknown event kind"); - } - private: - EventKind kind_; // signed to allow for negative intervals, initialized for safety. int64_t cpu_ns_ = 0; at::StringView name_; + EventKind kind_; uint64_t thread_id_; uint64_t fwd_thread_id_; at::RecordFunctionHandle handle_ {0}; @@ -366,7 +365,6 @@ struct TORCH_API LegacyEvent { std::vector stack_; uint8_t scope_; - uint64_t correlation_id_; }; @@ -412,7 +410,6 @@ struct RangeEventList { // NOTE: profiler mode is thread local, with automatic propagation // across thread boundary (e.g. at::launch tasks) TORCH_API void enableProfilerLegacy(const ProfilerConfig&); - using thread_event_lists = std::vector>; TORCH_API thread_event_lists disableProfilerLegacy(c10::optional profilerDisableOptions = c10::nullopt); @@ -541,7 +538,6 @@ struct TORCH_API KinetoEvent { uint64_t startUs_ = 0; uint64_t durationUs_ = 0; uint64_t correlationId_ = 0; - }; struct TORCH_API ProfilerResult { @@ -557,6 +553,7 @@ struct TORCH_API ProfilerResult { const thread_event_lists& legacy_events() const { return legacy_events_; } + private: std::vector> events_; thread_event_lists legacy_events_; // tensor mem alloc, start/stop From 1ff1a124ec35324b814c0f0c0be9ca6d2810378b Mon Sep 17 00:00:00 2001 From: ilia-cher Date: Mon, 2 Nov 2020 21:46:12 -0800 Subject: [PATCH 18/59] Update on "Use libkineto in profiler" Summary: Adding ability to use Kineto (CUPTI) to profile CUDA kernels Test Plan: USE_KINETO=1 USE_CUDA=1 USE_MKLDNN=1 BLAS=MKL BUILD_BINARY=1 python setup.py develop install python test/test_profiler.py ``` ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ sgemm_32x32x32_NN 0.00% 0.000us 0.00% 0.000us 0.000us 12.000us 63.16% 12.000us 12.000us 1 void at::native::vectorized_elementwise_kernel<4, at... 
0.00% 0.000us 0.00% 0.000us 0.000us 2.750us 14.47% 2.750us 2.750us 1 Memcpy HtoD (Pagable -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 2.250us 11.84% 2.250us 2.250us 1 Memcpy DtoH (Device -> Pagable) 0.00% 0.000us 0.00% 0.000us 0.000us 2.000us 10.53% 2.000us 2.000us 1 aten::mm 25.87% 364.400ms 25.87% 364.426ms 364.426ms 0.000us 0.00% 0.000us 0.000us 1 aten::empty 0.00% 39.585us 0.00% 39.585us 19.792us 0.000us 0.00% 0.000us 0.000us 2 aten::stride 0.00% 3.363us 0.00% 3.363us 1.121us 0.000us 0.00% 0.000us 0.000us 3 aten::add 74.12% 1.044s 74.12% 1.044s 1.044s 0.000us 0.00% 0.000us 0.000us 1 aten::to 0.00% 13.155us 0.01% 116.398us 116.398us 0.000us 0.00% 0.000us 0.000us 1 aten::empty_strided 0.00% 30.365us 0.00% 30.365us 30.365us 0.000us 0.00% 0.000us 0.000us 1 aten::copy_ 0.01% 72.878us 0.01% 72.878us 72.878us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ``` [ghstack-poisoned] --- torch/csrc/autograd/profiler.cpp | 50 ++++++++++++++++++++------------ 1 file changed, 31 insertions(+), 19 deletions(-) diff --git a/torch/csrc/autograd/profiler.cpp b/torch/csrc/autograd/profiler.cpp index c4eae2dbcfc9..ee79c7f9cddd 100644 --- a/torch/csrc/autograd/profiler.cpp +++ b/torch/csrc/autograd/profiler.cpp @@ -169,14 +169,18 @@ struct FileLineFunc { std::string funcname; }; -thread_local size_t corr_id_ = 0; +thread_local size_t corr_id_ = 1; size_t next_correlation_id() { - return ++corr_id_; -} -size_t cur_correlation_id() { - return corr_id_; + return corr_id_++; } +#ifdef USE_KINETO +struct KinetoObserverContext : public at::ObserverContext { + int64_t startUs; + uint64_t correlationId; +}; +#endif + // Profiler state struct ProfilerThreadLocalState : public c10::MemoryReportingInfoBase { explicit ProfilerThreadLocalState(const ProfilerConfig& config) @@ -217,7 +221,6 @@ struct ProfilerThreadLocalState : public c10::MemoryReportingInfoBase { at::RecordFunction::currentThreadId(), include_cuda && config_.state == ProfilerState::CUDA); evt.setNodeId(at::RecordFunction::getDefaultNodeId()); - evt.setCorrelationId(cur_correlation_id()); getEventList().record(std::move(evt)); } } @@ -313,7 +316,6 @@ struct ProfilerThreadLocalState : public c10::MemoryReportingInfoBase { thread_id, config_.state == ProfilerState::CUDA); evt.updateMemoryStats(alloc_size, device); - evt.setCorrelationId(cur_correlation_id()); getEventList(thread_id).record(std::move(evt)); } } @@ -322,24 +324,26 @@ struct ProfilerThreadLocalState : public c10::MemoryReportingInfoBase { return config_.profile_memory; } - void reportKinetoClientActivity(const at::RecordFunction& fn) { + void reportKinetoClientActivity( + const at::RecordFunction& fn, + const KinetoObserverContext& ctx) { #ifdef USE_KINETO if (config_.state == ProfilerState::KINETO) { libkineto::ClientTraceActivity op; - /*op.startTime = libkineto::timeSinceEpoch(fc.startTime); - op.endTime = libkineto::timeSinceEpoch(now); + op.startTime = ctx.startUs; + op.endTime = (getTime() / 1000); op.opType = std::string(fn.name()); - op.device = fc.deviceType; - op.correlation = fc.correlationId; + op.device = 0; // CPU + op.correlation = ctx.correlationId; + /* op.threadId = pthread_self(); op.inputDims = folly::toJson(fc.input_shapes); - op.inputTypes = folly::toJson(fc.input_types);*/ + op.inputTypes = folly::toJson(fc.input_types); + */ { std::lock_guard guard(state_mutex_); 
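+      // state_mutex_ serializes these appends: RecordFunction callbacks can
+      // fire concurrently on many threads. ctx.correlationId is the same id
+      // pushed via libkineto::api().pushCorrelationId() in the start callback,
+      // which is what lets the EXTERNAL_CORRELATION activities recorded by
+      // CUPTI tie GPU-side kernels back to this CPU op.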
kineto_client_activities_.emplace_back(op); - kineto_events_.emplace_back( - KinetoEventImpl::fromClientActivity( - &(kineto_client_activities_.back()))); + //kineto_events_.emplace_back(); } return; } @@ -520,14 +524,22 @@ void pushProfilingCallbacks() { return; } - libkineto::api().pushCorrelationId(next_correlation_id()); + auto corr_id = next_correlation_id(); + libkineto::api().pushCorrelationId(corr_id); + + auto ctx_ptr = std::make_unique(); + ctx_ptr->startUs = getTime() / 1000; + ctx_ptr->correlationId = corr_id; + return ctx_ptr; }, - [](const at::RecordFunction& fn) { + [](const at::RecordFunction& fn, at::ObserverContext* ctx_ptr) {) { auto state_ptr = getProfilerTLSState(); if (!state_ptr || state_ptr->config().state != ProfilerState::KINETO) { return; } - state_ptr->reportKinetoClientActivity(fn); + auto kineto_ctx_ptr = dynamic_cast(ctx_ptr); + TORCH_INTERNAL_ASSERT(kineto_ctx_ptr != nullptr); + state_ptr->reportKinetoClientActivity(fn, *kineto_ctx_ptr); libkineto::api().popCorrelationId(); }) .needsInputs(state_ptr->config().report_input_shapes) From b3b69d8a1b7d2c577dbf81c37448520ffe7dadff Mon Sep 17 00:00:00 2001 From: ilia-cher Date: Mon, 2 Nov 2020 22:37:26 -0800 Subject: [PATCH 19/59] Update on "Use libkineto in profiler" Summary: Adding ability to use Kineto (CUPTI) to profile CUDA kernels Test Plan: USE_KINETO=1 USE_CUDA=1 USE_MKLDNN=1 BLAS=MKL BUILD_BINARY=1 python setup.py develop install python test/test_profiler.py ``` ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ sgemm_32x32x32_NN 0.00% 0.000us 0.00% 0.000us 0.000us 12.000us 63.16% 12.000us 12.000us 1 void at::native::vectorized_elementwise_kernel<4, at... 
0.00% 0.000us 0.00% 0.000us 0.000us 2.750us 14.47% 2.750us 2.750us 1 Memcpy HtoD (Pagable -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 2.250us 11.84% 2.250us 2.250us 1 Memcpy DtoH (Device -> Pagable) 0.00% 0.000us 0.00% 0.000us 0.000us 2.000us 10.53% 2.000us 2.000us 1 aten::mm 25.87% 364.400ms 25.87% 364.426ms 364.426ms 0.000us 0.00% 0.000us 0.000us 1 aten::empty 0.00% 39.585us 0.00% 39.585us 19.792us 0.000us 0.00% 0.000us 0.000us 2 aten::stride 0.00% 3.363us 0.00% 3.363us 1.121us 0.000us 0.00% 0.000us 0.000us 3 aten::add 74.12% 1.044s 74.12% 1.044s 1.044s 0.000us 0.00% 0.000us 0.000us 1 aten::to 0.00% 13.155us 0.01% 116.398us 116.398us 0.000us 0.00% 0.000us 0.000us 1 aten::empty_strided 0.00% 30.365us 0.00% 30.365us 30.365us 0.000us 0.00% 0.000us 0.000us 1 aten::copy_ 0.01% 72.878us 0.01% 72.878us 72.878us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ``` [ghstack-poisoned] --- torch/csrc/autograd/profiler.cpp | 67 ++++++++++++++++++++++++--- torch/csrc/autograd/profiler.h | 77 ++++++++++++++++++++------------ 2 files changed, 109 insertions(+), 35 deletions(-) diff --git a/torch/csrc/autograd/profiler.cpp b/torch/csrc/autograd/profiler.cpp index ee79c7f9cddd..834c7f99facd 100644 --- a/torch/csrc/autograd/profiler.cpp +++ b/torch/csrc/autograd/profiler.cpp @@ -169,8 +169,9 @@ struct FileLineFunc { std::string funcname; }; -thread_local size_t corr_id_ = 1; -size_t next_correlation_id() { +// TODO: figure if we can use TLS +std::atomic corr_id_ {1}; +uint64_t next_correlation_id() { return corr_id_++; } @@ -178,6 +179,13 @@ size_t next_correlation_id() { struct KinetoObserverContext : public at::ObserverContext { int64_t startUs; uint64_t correlationId; + uint64_t startThreadId; + uint64_t endThreadId; + std::vector> shapes; + int64_t sequenceNr; + uint64_t fwdThreadId; + uint8_t recFunScope; + std::vector stack; }; #endif @@ -324,7 +332,7 @@ struct ProfilerThreadLocalState : public c10::MemoryReportingInfoBase { return config_.profile_memory; } - void reportKinetoClientActivity( + KinetoEvent& reportKinetoClientActivity( const at::RecordFunction& fn, const KinetoObserverContext& ctx) { #ifdef USE_KINETO @@ -335,15 +343,22 @@ struct ProfilerThreadLocalState : public c10::MemoryReportingInfoBase { op.opType = std::string(fn.name()); op.device = 0; // CPU op.correlation = ctx.correlationId; + //op.inputDims = toStr(ctx.shapes); // /* op.threadId = pthread_self(); - op.inputDims = folly::toJson(fc.input_shapes); - op.inputTypes = folly::toJson(fc.input_types); */ + { std::lock_guard guard(state_mutex_); kineto_client_activities_.emplace_back(op); - //kineto_events_.emplace_back(); + kineto_events_.emplace_back(); + kineto_events_.back() + .startThreadId(ctx.startThreadId) + .endThreadId(ctx.endThreadId) + .sequenceNr(ctx.sequenceNr) + .fwdThreadId(ctx.fwdThreadId) + .scope(ctx.recFunScope) + .stack(stack); // } return; } @@ -530,6 +545,43 @@ void pushProfilingCallbacks() { auto ctx_ptr = std::make_unique(); ctx_ptr->startUs = getTime() / 1000; ctx_ptr->correlationId = corr_id; + ctx_ptr->startThreadId = at::RecordFunction::currentThreadId(); + + if (state_ptr->config().report_input_shapes) { + std::vector> inputSizes; + inputSizes.reserve(fn.inputs().size()); + for (const c10::IValue& input : fn.inputs()) { + if (!input.isTensor()) { + inputSizes.emplace_back(); + continue; + } + const at::Tensor& tensor = 
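+          // (an undefined tensor still gets an empty entry below, keeping the
+          // recorded shape list aligned with the op's input list)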
input.toTensor();
+          if (tensor.defined()) {
+            inputSizes.push_back(input.toTensor().sizes().vec());
+          } else {
+            inputSizes.emplace_back();
+          }
+        }
+        ctx_ptr->shapes = inputSizes;
+      }
+
+      ctx_ptr->sequenceNr = fn.seqNr();
+      ctx_ptr->fwdThreadId = fn.forwardThreadId();
+      ctx_ptr->recFunScope = (uint8_t)fn.scope();
+
+#ifndef C10_MOBILE
+      // backward nodes source range corresponds to the forward node
+      // TODO: consider using C++ stack trace
+      if (state_ptr->config().with_stack &&
+          fn.scope() != at::RecordScope::BACKWARD_FUNCTION) {
+        auto cs = prepareCallstack(jit::currentCallstack());
+        if (cs.empty()) {
+          cs = prepareCallstack(jit::tracer::pythonCallstack());
+        }
+        ctx_ptr->stack = callstackStr(cs);
+      }
+#endif
+      return ctx_ptr;
     },
     [](const at::RecordFunction& fn, at::ObserverContext* ctx_ptr) {) {
       auto state_ptr = getProfilerTLSState();
       if (!state_ptr || state_ptr->config().state != ProfilerState::KINETO) {
         return;
       }
       auto kineto_ctx_ptr = dynamic_cast<KinetoObserverContext*>(ctx_ptr);
       TORCH_INTERNAL_ASSERT(kineto_ctx_ptr != nullptr);
+
+      kineto_ctx_ptr->endThreadId = at::RecordFunction::currentThreadId();
+
       state_ptr->reportKinetoClientActivity(fn, *kineto_ctx_ptr);
       libkineto::api().popCorrelationId();
     })
diff --git a/torch/csrc/autograd/profiler.h b/torch/csrc/autograd/profiler.h
index 9df0c263c714..3fe99d7f54a8 100644
--- a/torch/csrc/autograd/profiler.h
+++ b/torch/csrc/autograd/profiler.h
@@ -425,36 +425,53 @@ TORCH_API void writeProfilerEventsToStream(std::ostream& out, const std::vector<
 #ifdef USE_KINETO
 struct TORCH_API KinetoEvent {
-  int64_t threadId() const {
-    return thread_id_;
+  uint64_t startThreadId() const {
+    return start_thread_id_;
+  }
+
+  uint64_t endThreadId() const {
+    return end_thread_id_;
   }

   c10::DeviceType deviceType() const {
     return device_type_;
   }

-  int64_t fwdThreadId() const {
+  uint64_t fwdThreadId() const {
     return fwd_thread_id_;
   }

+  bool hasShapes() const {
+    return shapes_ != c10::nullopt;
+  }
+
   const std::vector<std::vector<int64_t>>& shapes() const {
-    return shapes_;
+    return *shapes_;
   }

   int64_t sequenceNr() const {
     return sequence_nr_;
   }

+  bool hasStack() const {
+    return stack_ != c10::nullopt;
+  }
+
   const std::vector<std::string>& stack() const {
-    return stack_;
+    return *stack_;
   }

   uint8_t scope() const {
     return scope_;
   }

-  KinetoEvent& threadId(int64_t thread_id) {
-    thread_id_ = thread_id;
+  KinetoEvent& startThreadId(uint64_t start_thread_id) {
+    start_thread_id_ = start_thread_id;
+    return *this;
+  }
+
+  KinetoEvent& endThreadId(uint64_t end_thread_id) {
+    end_thread_id_ = end_thread_id;
     return *this;
   }

@@ -463,13 +480,13 @@ struct TORCH_API KinetoEvent {
     return *this;
   }

-  KinetoEvent& fwdThreadId(int64_t fwd_thread_id) {
+  KinetoEvent& fwdThreadId(uint64_t fwd_thread_id) {
     fwd_thread_id_ = fwd_thread_id;
     return *this;
   }

   KinetoEvent& shapes(const std::vector<std::vector<int64_t>>& shapes) {
-    shapes_ = shapes;
+    *shapes_ = shapes;
     return *this;
   }

@@ -479,7 +496,7 @@ struct TORCH_API KinetoEvent {
     return *this;
   }

   KinetoEvent& stack(const std::vector<std::string>& st) {
-    stack_ = st;
+    *stack_ = st;
     return *this;
   }

@@ -492,10 +509,10 @@ struct TORCH_API KinetoEvent {
   KinetoEvent& activity(const libkineto::TraceActivity& activity) {
     name_ = activity.name();
-    deviceIndex_ = activity.deviceId();
-    startUs_ = activity.timestamp();
-    durationUs_ = activity.duration();
-    correlationId_ = activity.correlationId();
+    device_index_ = activity.deviceId();
+    start_us_ = activity.timestamp();
+    duration_us_ = activity.duration();
+    correlation_id_ = activity.correlationId();
     return *this;
   }

@@ -504,40 +521,42 @@ struct TORCH_API KinetoEvent {
   }

   uint64_t deviceIndex() const {
-    return deviceIndex_;
+    return device_index_;
   }
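   // The fluent setters above all return *this, so call sites can chain them;
   // a minimal sketch (the tid/seq/scope values are illustrative only):
   //   KinetoEvent e;
   //   e.startThreadId(tid).endThreadId(tid).sequenceNr(seq).scope(scope);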
uint64_t startUs() const { - return startUs_; + return start_us_; } uint64_t durationUs() const { - return durationUs_; + return duration_us_; } uint64_t correlationId() const { - return correlationId_; + return correlation_id_; } - KinetoEvent& correlationId(uint64_t correlationId) { - correlationId_ = correlationId; + KinetoEvent& correlationId(uint64_t correlation_id) { + correlation_id_ = correlation_id; return *this; } private: - int64_t thread_id_ = 0; - c10::DeviceType device_type_ = c10::DeviceType::CPU, - int64_t fwd_thread_id_ = 0; - std::vector> shapes_; + uint64_t start_thread_id_ = 0; + uint64_t end_thread_id_ = 0; + uint64_t fwd_thread_id_ = 0; int64_t sequence_nr_ = 0; - std::vector stack_; uint8_t scope_ = 0; + c10::DeviceType device_type_ = c10::DeviceType::CPU, + c10::optional>> shapes_; + c10::optional> stack_; + std::string name_; - uint64_t deviceIndex_ = 0; - uint64_t startUs_ = 0; - uint64_t durationUs_ = 0; - uint64_t correlationId_ = 0; + uint64_t device_index_ = 0; + uint64_t start_us_ = 0; + uint64_t duration_us_ = 0; + uint64_t correlation_id_ = 0; }; struct TORCH_API ProfilerResult { From 2faeb8a4e02680e129c1f0fed62922736103f7db Mon Sep 17 00:00:00 2001 From: ilia-cher Date: Mon, 2 Nov 2020 22:49:25 -0800 Subject: [PATCH 20/59] Update on "Use libkineto in profiler" Summary: Adding ability to use Kineto (CUPTI) to profile CUDA kernels Test Plan: USE_KINETO=1 USE_CUDA=1 USE_MKLDNN=1 BLAS=MKL BUILD_BINARY=1 python setup.py develop install python test/test_profiler.py ``` ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ sgemm_32x32x32_NN 0.00% 0.000us 0.00% 0.000us 0.000us 12.000us 63.16% 12.000us 12.000us 1 void at::native::vectorized_elementwise_kernel<4, at... 
0.00% 0.000us 0.00% 0.000us 0.000us 2.750us 14.47% 2.750us 2.750us 1 Memcpy HtoD (Pagable -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 2.250us 11.84% 2.250us 2.250us 1 Memcpy DtoH (Device -> Pagable) 0.00% 0.000us 0.00% 0.000us 0.000us 2.000us 10.53% 2.000us 2.000us 1 aten::mm 25.87% 364.400ms 25.87% 364.426ms 364.426ms 0.000us 0.00% 0.000us 0.000us 1 aten::empty 0.00% 39.585us 0.00% 39.585us 19.792us 0.000us 0.00% 0.000us 0.000us 2 aten::stride 0.00% 3.363us 0.00% 3.363us 1.121us 0.000us 0.00% 0.000us 0.000us 3 aten::add 74.12% 1.044s 74.12% 1.044s 1.044s 0.000us 0.00% 0.000us 0.000us 1 aten::to 0.00% 13.155us 0.01% 116.398us 116.398us 0.000us 0.00% 0.000us 0.000us 1 aten::empty_strided 0.00% 30.365us 0.00% 30.365us 30.365us 0.000us 0.00% 0.000us 0.000us 1 aten::copy_ 0.01% 72.878us 0.01% 72.878us 72.878us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ``` [ghstack-poisoned] --- torch/csrc/autograd/profiler.cpp | 115 ++++++++++++++----------------- 1 file changed, 52 insertions(+), 63 deletions(-) diff --git a/torch/csrc/autograd/profiler.cpp b/torch/csrc/autograd/profiler.cpp index 834c7f99facd..841a293eae04 100644 --- a/torch/csrc/autograd/profiler.cpp +++ b/torch/csrc/autograd/profiler.cpp @@ -358,7 +358,9 @@ struct ProfilerThreadLocalState : public c10::MemoryReportingInfoBase { .sequenceNr(ctx.sequenceNr) .fwdThreadId(ctx.fwdThreadId) .scope(ctx.recFunScope) - .stack(stack); // + if (!stack.empty()) { + kineto_events_.back().stack(stack); + } } return; } @@ -475,6 +477,24 @@ ProfilerThreadLocalState* getProfilerTLSState() { return dynamic_cast(state.get()); } +std::vector> inputSizes(const at::RecordFunction& fn) { + std::vector> sizes; + sizes.reserve(fn.inputs().size()); + for (const c10::IValue& input : fn.inputs()) { + if (!input.isTensor()) { + sizes.emplace_back(); + continue; + } + const at::Tensor& tensor = input.toTensor(); + if (tensor.defined()) { + sizes.push_back(input.toTensor().sizes().vec()); + } else { + sizes.emplace_back(); + } + } + return sizes; +} + void pushProfilingCallbacksLegacy() { auto state_ptr = getProfilerTLSState(); TORCH_INTERNAL_ASSERT(state_ptr, "Expected profiler state set"); @@ -492,20 +512,7 @@ void pushProfilingCallbacksLegacy() { auto* msg = (fn.seqNr() >= 0) ? 
", seq = " : ""; if (state_ptr->config().report_input_shapes) { - std::vector> inputSizes; - inputSizes.reserve(fn.inputs().size()); - for (const c10::IValue& input : fn.inputs()) { - if (!input.isTensor()) { - inputSizes.emplace_back(); - continue; - } - const at::Tensor& tensor = input.toTensor(); - if (tensor.defined()) { - inputSizes.push_back(input.toTensor().sizes().vec()); - } else { - inputSizes.emplace_back(); - } - } + auto sizes = inputSizes(fn); state_ptr->pushRange(fn, record_cuda, msg, std::move(inputSizes)); } else { state_ptr->pushRange(fn, record_cuda, msg); @@ -548,21 +555,7 @@ void pushProfilingCallbacks() { ctx_ptr->startThreadId = at::RecordFunction::currentThreadId(); if (state_ptr->config().report_input_shapes) { - std::vector> inputSizes; - inputSizes.reserve(fn.inputs().size()); - for (const c10::IValue& input : fn.inputs()) { - if (!input.isTensor()) { - inputSizes.emplace_back(); - continue; - } - const at::Tensor& tensor = input.toTensor(); - if (tensor.defined()) { - inputSizes.push_back(input.toTensor().sizes().vec()); - } else { - inputSizes.emplace_back(); - } - } - ctx_ptr->shapes = inputSizes; + ctx_ptr->shapes = inputSizes(fn); } ctx_ptr->sequenceNr = fn.seqNr(); @@ -581,7 +574,6 @@ void pushProfilingCallbacks() { ctx_ptr->stack = callstackStr(cs); } #endif - return ctx_ptr; }, [](const at::RecordFunction& fn, at::ObserverContext* ctx_ptr) {) { @@ -661,38 +653,6 @@ bool kinetoAvailable() { #endif } -void prepareProfiler( - const ProfilerConfig& config, - const std::set& activities) { -#ifdef USE_KINETO - if (config.state == ProfilerState::KINETO) { - std::set k_activities; - if (activities.count(ActivityType::CPU)) { - k_activities.insert(libkineto::ActivityType::EXTERNAL_CORRELATION); - k_activities.insert(libkineto::ActivityType::CUDA_RUNTIME); - } - //if (activities.count(ActivityType::CUDA_RUNTIME)) { - // k_activities.insert(libkineto::ActivityType::CUDA_RUNTIME); - //} - if (activities.count(ActivityType::CUDA)) { - k_activities.insert(libkineto::ActivityType::GPU_MEMCPY); - k_activities.insert(libkineto::ActivityType::GPU_MEMSET); - k_activities.insert(libkineto::ActivityType::CONCURRENT_KERNEL); - } - - if (!libkineto::api().hasProfilerRegistered()) { - libkineto::api().registerProfiler( - std::make_unique(false)); - } - libkineto::api().initProfilerIfRegistered(); - libkineto::api().prepareTrace(k_activities); - - return; - } -#endif - TORCH_CHECK(false, "Supported only in Kineto profiler"); -} - void enableProfilerLegacy(const ProfilerConfig& new_config) { TORCH_CHECK(new_config.state != ProfilerState::NVTX || cuda_stubs->enabled(), "Can't use NVTX profiler - PyTorch was compiled without CUDA"); @@ -755,6 +715,35 @@ thread_event_lists disableProfilerLegacy(c10::optional p } #ifdef USE_KINETO + +void prepareProfiler( + const ProfilerConfig& config, + const std::set& activities) { + TORCH_CHECK(config.state == ProfilerState::KINETO, + "Supported only in Kineto profiler"); + + std::set k_activities; + if (activities.count(ActivityType::CPU)) { + k_activities.insert(libkineto::ActivityType::EXTERNAL_CORRELATION); + k_activities.insert(libkineto::ActivityType::CUDA_RUNTIME); + } + //if (activities.count(ActivityType::CUDA_RUNTIME)) { + // k_activities.insert(libkineto::ActivityType::CUDA_RUNTIME); + //} + if (activities.count(ActivityType::CUDA)) { + k_activities.insert(libkineto::ActivityType::GPU_MEMCPY); + k_activities.insert(libkineto::ActivityType::GPU_MEMSET); + k_activities.insert(libkineto::ActivityType::CONCURRENT_KERNEL); + } + + if 
(!libkineto::api().hasProfilerRegistered()) { + libkineto::api().registerProfiler( + std::make_unique(false)); + } + libkineto::api().initProfilerIfRegistered(); + libkineto::api().prepareTrace(k_activities); +} + void enableProfiler( const ProfilerConfig& config, const std::set& activities) { From 67c890da1ceec84b170838d9336a73b951a040db Mon Sep 17 00:00:00 2001 From: ilia-cher Date: Mon, 2 Nov 2020 22:50:29 -0800 Subject: [PATCH 21/59] Update on "Use libkineto in profiler" Summary: Adding ability to use Kineto (CUPTI) to profile CUDA kernels Test Plan: USE_KINETO=1 USE_CUDA=1 USE_MKLDNN=1 BLAS=MKL BUILD_BINARY=1 python setup.py develop install python test/test_profiler.py ``` ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ sgemm_32x32x32_NN 0.00% 0.000us 0.00% 0.000us 0.000us 12.000us 63.16% 12.000us 12.000us 1 void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 2.750us 14.47% 2.750us 2.750us 1 Memcpy HtoD (Pagable -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 2.250us 11.84% 2.250us 2.250us 1 Memcpy DtoH (Device -> Pagable) 0.00% 0.000us 0.00% 0.000us 0.000us 2.000us 10.53% 2.000us 2.000us 1 aten::mm 25.87% 364.400ms 25.87% 364.426ms 364.426ms 0.000us 0.00% 0.000us 0.000us 1 aten::empty 0.00% 39.585us 0.00% 39.585us 19.792us 0.000us 0.00% 0.000us 0.000us 2 aten::stride 0.00% 3.363us 0.00% 3.363us 1.121us 0.000us 0.00% 0.000us 0.000us 3 aten::add 74.12% 1.044s 74.12% 1.044s 1.044s 0.000us 0.00% 0.000us 0.000us 1 aten::to 0.00% 13.155us 0.01% 116.398us 116.398us 0.000us 0.00% 0.000us 0.000us 1 aten::empty_strided 0.00% 30.365us 0.00% 30.365us 30.365us 0.000us 0.00% 0.000us 0.000us 1 aten::copy_ 0.01% 72.878us 0.01% 72.878us 72.878us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ``` [ghstack-poisoned] --- torch/csrc/autograd/profiler.cpp | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/torch/csrc/autograd/profiler.cpp b/torch/csrc/autograd/profiler.cpp index 841a293eae04..14770b06452d 100644 --- a/torch/csrc/autograd/profiler.cpp +++ b/torch/csrc/autograd/profiler.cpp @@ -778,14 +778,15 @@ ProfilerResult disableProfiler() { at::removeCallback(state_ptr->callbackHandle()); } + state_ptr->mark("__stop_profile"); + if (state_ptr->config().state == ProfilerState::KINETO) { auto trace = libkineto::api().stopTrace(); // } - state_ptr->mark("__stop_profile"); - // Note that this will erase the underlying events. 
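   // consolidate() drains every per-thread RangeEventList into a single
   // thread_event_lists (see ProfilerThreadLocalState::consolidate() in a
   // later hunk), so it has to run before the thread-local state is torn down.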
- return state_ptr->consolidate(); + auto legacy_events = state_ptr->consolidate(); + // } #endif From ed8babeb6b413a52218875da7b11f36011f80179 Mon Sep 17 00:00:00 2001 From: ilia-cher Date: Mon, 2 Nov 2020 22:53:07 -0800 Subject: [PATCH 22/59] Update on "Use libkineto in profiler" Summary: Adding ability to use Kineto (CUPTI) to profile CUDA kernels Test Plan: USE_KINETO=1 USE_CUDA=1 USE_MKLDNN=1 BLAS=MKL BUILD_BINARY=1 python setup.py develop install python test/test_profiler.py ``` ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ sgemm_32x32x32_NN 0.00% 0.000us 0.00% 0.000us 0.000us 12.000us 63.16% 12.000us 12.000us 1 void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 2.750us 14.47% 2.750us 2.750us 1 Memcpy HtoD (Pagable -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 2.250us 11.84% 2.250us 2.250us 1 Memcpy DtoH (Device -> Pagable) 0.00% 0.000us 0.00% 0.000us 0.000us 2.000us 10.53% 2.000us 2.000us 1 aten::mm 25.87% 364.400ms 25.87% 364.426ms 364.426ms 0.000us 0.00% 0.000us 0.000us 1 aten::empty 0.00% 39.585us 0.00% 39.585us 19.792us 0.000us 0.00% 0.000us 0.000us 2 aten::stride 0.00% 3.363us 0.00% 3.363us 1.121us 0.000us 0.00% 0.000us 0.000us 3 aten::add 74.12% 1.044s 74.12% 1.044s 1.044s 0.000us 0.00% 0.000us 0.000us 1 aten::to 0.00% 13.155us 0.01% 116.398us 116.398us 0.000us 0.00% 0.000us 0.000us 1 aten::empty_strided 0.00% 30.365us 0.00% 30.365us 30.365us 0.000us 0.00% 0.000us 0.000us 1 aten::copy_ 0.01% 72.878us 0.01% 72.878us 72.878us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ``` [ghstack-poisoned] --- torch/csrc/autograd/profiler.cpp | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/torch/csrc/autograd/profiler.cpp b/torch/csrc/autograd/profiler.cpp index 14770b06452d..9b32895c2a4d 100644 --- a/torch/csrc/autograd/profiler.cpp +++ b/torch/csrc/autograd/profiler.cpp @@ -780,13 +780,10 @@ ProfilerResult disableProfiler() { state_ptr->mark("__stop_profile"); - if (state_ptr->config().state == ProfilerState::KINETO) { - auto trace = libkineto::api().stopTrace(); - // - } - + auto trace = libkineto::api().stopTrace(); + auto kineto_events = filterTrace(trace); auto legacy_events = state_ptr->consolidate(); - // + return ProfilerResult(kineto_events, legacy_events); } #endif From ffc11fdd1960707895124e3b184cbf3ef4270fc3 Mon Sep 17 00:00:00 2001 From: ilia-cher Date: Mon, 2 Nov 2020 23:48:01 -0800 Subject: [PATCH 23/59] Update on "Use libkineto in profiler" Summary: Adding ability to use Kineto (CUPTI) to profile CUDA kernels Test Plan: USE_KINETO=1 USE_CUDA=1 USE_MKLDNN=1 BLAS=MKL BUILD_BINARY=1 python setup.py develop install python test/test_profiler.py ``` ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self 
CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ sgemm_32x32x32_NN 0.00% 0.000us 0.00% 0.000us 0.000us 12.000us 63.16% 12.000us 12.000us 1 void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 2.750us 14.47% 2.750us 2.750us 1 Memcpy HtoD (Pagable -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 2.250us 11.84% 2.250us 2.250us 1 Memcpy DtoH (Device -> Pagable) 0.00% 0.000us 0.00% 0.000us 0.000us 2.000us 10.53% 2.000us 2.000us 1 aten::mm 25.87% 364.400ms 25.87% 364.426ms 364.426ms 0.000us 0.00% 0.000us 0.000us 1 aten::empty 0.00% 39.585us 0.00% 39.585us 19.792us 0.000us 0.00% 0.000us 0.000us 2 aten::stride 0.00% 3.363us 0.00% 3.363us 1.121us 0.000us 0.00% 0.000us 0.000us 3 aten::add 74.12% 1.044s 74.12% 1.044s 1.044s 0.000us 0.00% 0.000us 0.000us 1 aten::to 0.00% 13.155us 0.01% 116.398us 116.398us 0.000us 0.00% 0.000us 0.000us 1 aten::empty_strided 0.00% 30.365us 0.00% 30.365us 30.365us 0.000us 0.00% 0.000us 0.000us 1 aten::copy_ 0.01% 72.878us 0.01% 72.878us 72.878us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ``` [ghstack-poisoned] --- torch/csrc/autograd/profiler.cpp | 114 +++++++++--------- torch/csrc/autograd/profiler.h | 10 +- .../rpc/request_callback_no_python.cpp | 4 +- 3 files changed, 64 insertions(+), 64 deletions(-) diff --git a/torch/csrc/autograd/profiler.cpp b/torch/csrc/autograd/profiler.cpp index 9b32895c2a4d..df13c11b7680 100644 --- a/torch/csrc/autograd/profiler.cpp +++ b/torch/csrc/autograd/profiler.cpp @@ -181,14 +181,42 @@ struct KinetoObserverContext : public at::ObserverContext { uint64_t correlationId; uint64_t startThreadId; uint64_t endThreadId; - std::vector> shapes; + c10::optional>> shapes; int64_t sequenceNr; uint64_t fwdThreadId; uint8_t recFunScope; - std::vector stack; + c10::optional> stack; }; #endif +std::vector prepareCallstack(const std::vector& cs) { + std::vector entries; + entries.reserve(cs.size()); + for (const auto& entry : cs) { + auto& range = entry.range; + if (range.source()) { + auto& src = range.source(); + if (src && src->filename()) { + auto line = src->starting_line_no() + + src->lineno_for_offset(range.start()); + entries.emplace_back(FileLineFunc{*(src->filename()), line, entry.filename}); + } + } + } + return entries; +} + +std::vector callstackStr(const std::vector& cs) { + std::vector cs_str; + cs_str.reserve(cs.size()); + for (const auto& entry : cs) { + std::stringstream loc; + loc << entry.filename << "(" << entry.line << "): " << entry.funcname; + cs_str.push_back(loc.str()); + } + return cs_str; +} + // Profiler state struct ProfilerThreadLocalState : public c10::MemoryReportingInfoBase { explicit ProfilerThreadLocalState(const ProfilerConfig& config) @@ -332,7 +360,7 @@ struct ProfilerThreadLocalState : public c10::MemoryReportingInfoBase { return config_.profile_memory; } - KinetoEvent& reportKinetoClientActivity( + void reportKinetoClientActivity( const at::RecordFunction& fn, const KinetoObserverContext& ctx) { #ifdef USE_KINETO @@ -340,26 +368,26 @@ struct ProfilerThreadLocalState : public c10::MemoryReportingInfoBase { libkineto::ClientTraceActivity op; op.startTime = ctx.startUs; op.endTime = (getTime() / 1000); - op.opType = 
std::string(fn.name()); + op.opType = std::string(fn.name().str()); op.device = 0; // CPU op.correlation = ctx.correlationId; - //op.inputDims = toStr(ctx.shapes); // - /* - op.threadId = pthread_self(); - */ + if (ctx.shapes && !ctx.shapes->empty()) { + //op.inputDims = toStr(*ctx.shapes); // + } + //op.threadId = pthread_self(); { std::lock_guard guard(state_mutex_); - kineto_client_activities_.emplace_back(op); + kineto_client_activities_.emplace_back(std::move(op)); kineto_events_.emplace_back(); kineto_events_.back() .startThreadId(ctx.startThreadId) .endThreadId(ctx.endThreadId) .sequenceNr(ctx.sequenceNr) .fwdThreadId(ctx.fwdThreadId) - .scope(ctx.recFunScope) - if (!stack.empty()) { - kineto_events_.back().stack(stack); + .scope(ctx.recFunScope); + if (ctx.stack && !ctx.stack->empty()) { + kineto_events_.back().stack(*ctx.stack); } } return; @@ -369,34 +397,6 @@ struct ProfilerThreadLocalState : public c10::MemoryReportingInfoBase { } private: - std::vector prepareCallstack(const std::vector& cs) { - std::vector entries; - entries.reserve(cs.size()); - for (const auto& entry : cs) { - auto& range = entry.range; - if (range.source()) { - auto& src = range.source(); - if (src && src->filename()) { - auto line = src->starting_line_no() + - src->lineno_for_offset(range.start()); - entries.emplace_back(FileLineFunc{*(src->filename()), line, entry.filename}); - } - } - } - return entries; - } - - std::vector callstackStr(const std::vector& cs) { - std::vector cs_str; - cs_str.reserve(cs.size()); - for (const auto& entry : cs) { - std::stringstream loc; - loc << entry.filename << "(" << entry.line << "): " << entry.funcname; - cs_str.push_back(loc.str()); - } - return cs_str; - } - std::string getNvtxStr( const at::StringView& name, const char* msg, @@ -468,7 +468,7 @@ struct ProfilerThreadLocalState : public c10::MemoryReportingInfoBase { #ifdef USE_KINETO std::vector kineto_client_activities_; - std::vector kineto_events_; + std::vector kineto_events_; #endif }; @@ -513,7 +513,7 @@ void pushProfilingCallbacksLegacy() { auto* msg = (fn.seqNr() >= 0) ? 
", seq = " : ""; if (state_ptr->config().report_input_shapes) { auto sizes = inputSizes(fn); - state_ptr->pushRange(fn, record_cuda, msg, std::move(inputSizes)); + state_ptr->pushRange(fn, record_cuda, msg, std::move(sizes)); } else { state_ptr->pushRange(fn, record_cuda, msg); } @@ -543,7 +543,7 @@ void pushProfilingCallbacks() { [](const at::RecordFunction& fn) { auto state_ptr = getProfilerTLSState(); if (!state_ptr || state_ptr->config().state != ProfilerState::KINETO) { - return; + return std::make_unique(); } auto corr_id = next_correlation_id(); @@ -576,7 +576,7 @@ void pushProfilingCallbacks() { #endif return ctx_ptr; }, - [](const at::RecordFunction& fn, at::ObserverContext* ctx_ptr) {) { + [](const at::RecordFunction& fn, at::ObserverContext* ctx_ptr) { auto state_ptr = getProfilerTLSState(); if (!state_ptr || state_ptr->config().state != ProfilerState::KINETO) { return; @@ -736,11 +736,12 @@ void prepareProfiler( k_activities.insert(libkineto::ActivityType::CONCURRENT_KERNEL); } - if (!libkineto::api().hasProfilerRegistered()) { - libkineto::api().registerProfiler( - std::make_unique(false)); - } - libkineto::api().initProfilerIfRegistered(); + //if (!libkineto::api().hasProfilerRegistered()) { + // libkineto::api().registerProfiler( + // std::make_unique(false)); + //} + + //libkineto::api().initProfilerIfRegistered(); libkineto::api().prepareTrace(k_activities); } @@ -780,8 +781,9 @@ ProfilerResult disableProfiler() { state_ptr->mark("__stop_profile"); - auto trace = libkineto::api().stopTrace(); - auto kineto_events = filterTrace(trace); + //auto trace = std::move(libkineto::api().stopTrace()); + libkineto::api().stopTrace(); + std::vector> kineto_events; // = filterTrace(trace); auto legacy_events = state_ptr->consolidate(); return ProfilerResult(kineto_events, legacy_events); } @@ -904,7 +906,6 @@ double LegacyEvent::cudaElapsedUs(const LegacyEvent& e) const { CUDAStubs::~CUDAStubs() = default; - static jit::CodeTemplate event_template(R"( { "name": "${name}", @@ -937,9 +938,9 @@ void writeProfilerEventsToStream(std::ostream& out, const std::vectorkind() == "push") { + if (evt->kindStr() == "push") { events_map[std::make_pair(evt->handle(), evt->nodeId())] = evt; - } else if (evt->kind() == "pop") { + } else if (evt->kindStr() == "pop") { if (!first) { out << ",\n"; } @@ -960,7 +961,6 @@ void writeProfilerEventsToStream(std::ostream& out, const std::vector events; for (auto& l : event_lists) { for (auto& e : l) { diff --git a/torch/csrc/autograd/profiler.h b/torch/csrc/autograd/profiler.h index 3fe99d7f54a8..04cab1327dd0 100644 --- a/torch/csrc/autograd/profiler.h +++ b/torch/csrc/autograd/profiler.h @@ -500,8 +500,8 @@ struct TORCH_API KinetoEvent { return *this; } - KinetoEvent& scope(uint8_t scope_id) { - scope_id_ = scope_id; + KinetoEvent& scope(uint8_t scope) { + scope_ = scope; return *this; } @@ -548,7 +548,7 @@ struct TORCH_API KinetoEvent { int64_t sequence_nr_ = 0; uint8_t scope_ = 0; - c10::DeviceType device_type_ = c10::DeviceType::CPU, + c10::DeviceType device_type_ = c10::DeviceType::CPU; c10::optional>> shapes_; c10::optional> stack_; @@ -625,10 +625,10 @@ struct TORCH_API TLSProfilerGuard { c10::nullopt) : cb_(std::move(resultCallback)), profilerDisableOptions_(std::move(profilerDisableOptions)) { - enableProfiler(cfg); + enableProfilerLegacy(cfg); } ~TLSProfilerGuard() { - thread_event_lists event_lists = disableProfiler(profilerDisableOptions_); + thread_event_lists event_lists = disableProfilerLegacy(profilerDisableOptions_); if (cb_) { try { 
(*cb_)(event_lists); diff --git a/torch/csrc/distributed/rpc/request_callback_no_python.cpp b/torch/csrc/distributed/rpc/request_callback_no_python.cpp index cdce84a5d10c..7b0b83f547e3 100644 --- a/torch/csrc/distributed/rpc/request_callback_no_python.cpp +++ b/torch/csrc/distributed/rpc/request_callback_no_python.cpp @@ -92,7 +92,7 @@ std::shared_ptr RequestCallbackNoPython::processMessage( if (serverProcessGlobalProfilerStateStackEntryPtr) { // Restore thread-local profiler state. ::torch::autograd::profiler::thread_event_lists event_lists = - ::torch::autograd::profiler::disableProfiler(); + ::torch::autograd::profiler::disableProfilerLegacy(); // Put thread_local event_lists into the process-global profiler // state. profiler::processglobal::pushResultRecursive( @@ -543,7 +543,7 @@ void RequestCallbackNoPython::processRpc( torch::autograd::profiler::ProfilerDisableOptions opts( false, true); auto event_lists = - torch::autograd::profiler::disableProfiler(opts); + torch::autograd::profiler::disableProfilerLegacy(opts); if (wrappedRpcResponseFuture->hasError()) { // Propagate error // No need to propagate remote events in the case of an error. From fe76b8416b6ab1369e466586a0166a59b68d8ac1 Mon Sep 17 00:00:00 2001 From: ilia-cher Date: Tue, 3 Nov 2020 00:25:58 -0800 Subject: [PATCH 24/59] Update on "Use libkineto in profiler" Summary: Adding ability to use Kineto (CUPTI) to profile CUDA kernels Test Plan: USE_KINETO=1 USE_CUDA=1 USE_MKLDNN=1 BLAS=MKL BUILD_BINARY=1 python setup.py develop install python test/test_profiler.py ``` ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ sgemm_32x32x32_NN 0.00% 0.000us 0.00% 0.000us 0.000us 12.000us 63.16% 12.000us 12.000us 1 void at::native::vectorized_elementwise_kernel<4, at... 
0.00% 0.000us 0.00% 0.000us 0.000us 2.750us 14.47% 2.750us 2.750us 1 Memcpy HtoD (Pagable -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 2.250us 11.84% 2.250us 2.250us 1 Memcpy DtoH (Device -> Pagable) 0.00% 0.000us 0.00% 0.000us 0.000us 2.000us 10.53% 2.000us 2.000us 1 aten::mm 25.87% 364.400ms 25.87% 364.426ms 364.426ms 0.000us 0.00% 0.000us 0.000us 1 aten::empty 0.00% 39.585us 0.00% 39.585us 19.792us 0.000us 0.00% 0.000us 0.000us 2 aten::stride 0.00% 3.363us 0.00% 3.363us 1.121us 0.000us 0.00% 0.000us 0.000us 3 aten::add 74.12% 1.044s 74.12% 1.044s 1.044s 0.000us 0.00% 0.000us 0.000us 1 aten::to 0.00% 13.155us 0.01% 116.398us 116.398us 0.000us 0.00% 0.000us 0.000us 1 aten::empty_strided 0.00% 30.365us 0.00% 30.365us 30.365us 0.000us 0.00% 0.000us 0.000us 1 aten::copy_ 0.01% 72.878us 0.01% 72.878us 72.878us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ``` [ghstack-poisoned] --- test/cpp/jit/test_misc.cpp | 12 ++++++------ torch/csrc/autograd/init.cpp | 17 +++++++++-------- torch/csrc/autograd/profiler.cpp | 14 ++++++++++++++ torch/csrc/autograd/profiler.h | 19 +++++++------------ 4 files changed, 36 insertions(+), 26 deletions(-) diff --git a/test/cpp/jit/test_misc.cpp b/test/cpp/jit/test_misc.cpp index 9ed50daaa86c..fdd155edf0f7 100644 --- a/test/cpp/jit/test_misc.cpp +++ b/test/cpp/jit/test_misc.cpp @@ -2164,7 +2164,7 @@ TEST(TLSFutureCallbacksTest, Basic) { // Since we join here, we can ensure that all callbacks corresponding to // markCompleted() have finished. t.join(); - torch::autograd::profiler::disableProfiler(); + torch::autograd::profiler::disableProfilerLegacy(); } // then() with TLS State { @@ -2182,7 +2182,7 @@ TEST(TLSFutureCallbacksTest, Basic) { std::thread t([s1 = std::move(s1)]() { s1->markCompleted(); }); t.join(); s2->wait(); - torch::autograd::profiler::disableProfiler(); + torch::autograd::profiler::disableProfilerLegacy(); } } @@ -2204,10 +2204,10 @@ TEST(ProfilerDisableInCallbackTest, Basic) { // Don't cleanup TLSState, and just consolidate. auto opts = torch::autograd::profiler::ProfilerDisableOptions(false, true); auto thread_event_lists = - torch::autograd::profiler::disableProfiler(std::move(opts)); + torch::autograd::profiler::disableProfilerLegacy(std::move(opts)); // Ensure that the events from this thread are still profiled and we obtain // the expected in events in our consolidated list when calling - // disableProfiler(). + // disableProfilerLegacy(). bool found_ones = false; bool found_add = false; for (const auto& li : thread_event_lists) { @@ -2229,7 +2229,7 @@ TEST(ProfilerDisableInCallbackTest, Basic) { s1->addCallback(verifyProfilerCb); // Disable the profiler, but do not consolidate results in the main thread. 
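  // Judging from the (false, true) call above ("don't cleanup TLSState, and
  // just consolidate"), the two flags are (cleanupTLSState, consolidate); so
  // (true, false) tears down this thread's profiler state without merging its
  // events.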
auto opts = torch::autograd::profiler::ProfilerDisableOptions(true, false); - torch::autograd::profiler::disableProfiler(std::move(opts)); + torch::autograd::profiler::disableProfilerLegacy(std::move(opts)); std::thread t([s1 = std::move(s1)]() { s1->markCompleted(at::IValue(1)); }); t.join(); @@ -2243,7 +2243,7 @@ TEST(ProfilerDisableInCallbackTest, Basic) { // Runs callback inline s1->markCompleted(at::IValue(1)); opts = torch::autograd::profiler::ProfilerDisableOptions(true, false); - torch::autograd::profiler::disableProfiler(std::move(opts)); + torch::autograd::profiler::disableProfilerLegacy(std::move(opts)); } TEST(IValueKWargsTest, Basic) { diff --git a/torch/csrc/autograd/init.cpp b/torch/csrc/autograd/init.cpp index 153c815050fc..b844c4349fc6 100644 --- a/torch/csrc/autograd/init.cpp +++ b/torch/csrc/autograd/init.cpp @@ -73,16 +73,17 @@ PyObject* THPAutograd_initExtension(PyObject* _unused, PyObject *unused) { #ifdef USE_KINETO py::class_(m, "KinetoEvent") .def("name", &KinetoEvent::name) - .def("thread_id", &KinetoEvent::threadId) + .def("start_thread_id", [](const KinetoEvent& e) { return e.startThreadId(); }) + .def("end_thread_id", [](const KinetoEvent& e) { return e.endThreadId(); }) .def("device_index", &KinetoEvent::deviceIndex) .def("start_us", &KinetoEvent::startUs) - .def("duration", &KinetoEvent::duration) - .def("correlation_id", &KinetoEvent::correlationId) - .def("fwd_thread_id", &KinetoEvent::fwdThreadId) - .def("shapes", &KinetoEvent::shapes) - .def("sequence_nr", &KinetoEvent::sequenceNr) - .def("stack", &KinetoEvent::stack) - .def("scope", &KinetoEvent::scope); + .def("duration_us", &KinetoEvent::durationUs) + .def("correlation_id", [](const KinetoEvent& e) { return e.correlationId(); }) + .def("fwd_thread_id", [](const KinetoEvent& e) { return e.fwdThreadId(); }) + .def("shapes", [](const KinetoEvent& e) { return e.shapes(); }) + .def("sequence_nr", [](const KinetoEvent& e) { return e.sequenceNr(); }) + .def("stack", [](const KinetoEvent& e) { return e.stack(); }) + .def("scope", [](const KinetoEvent& e) { return e.scope(); }); py::class_(m, "ProfilerResult") .def("events", &ProfilerResult::events) diff --git a/torch/csrc/autograd/profiler.cpp b/torch/csrc/autograd/profiler.cpp index df13c11b7680..484e13b607eb 100644 --- a/torch/csrc/autograd/profiler.cpp +++ b/torch/csrc/autograd/profiler.cpp @@ -21,6 +21,10 @@ #include +#ifdef USE_KINETO +#include "libkineto.h" +#endif + namespace torch { namespace autograd { namespace profiler { namespace { @@ -787,6 +791,16 @@ ProfilerResult disableProfiler() { auto legacy_events = state_ptr->consolidate(); return ProfilerResult(kineto_events, legacy_events); } + +KinetoEvent& KinetoEvent::activity(const libkineto::TraceActivity& activity) { + name_ = activity.name(); + device_index_ = activity.deviceId(); + start_us_ = activity.timestamp(); + duration_us_ = activity.duration(); + correlation_id_ = activity.correlationId(); + return *this; +} + #endif void addEventList(std::vector&& profiledEvents) { diff --git a/torch/csrc/autograd/profiler.h b/torch/csrc/autograd/profiler.h index 04cab1327dd0..7ecf3a45f05b 100644 --- a/torch/csrc/autograd/profiler.h +++ b/torch/csrc/autograd/profiler.h @@ -21,13 +21,15 @@ #include #include -#ifdef USE_KINETO -#include "libkineto.h" -#endif - struct CUevent_st; typedef std::shared_ptr CUDAEventStub; +#ifdef USE_KINETO +namespace libkineto { +class TraceActivity; +} +#endif + namespace torch { namespace autograd { struct Node; @@ -507,14 +509,7 @@ struct TORCH_API KinetoEvent { // Kineto 
fields - KinetoEvent& activity(const libkineto::TraceActivity& activity) { - name_ = activity.name(); - device_index_ = activity.deviceId(); - start_us_ = activity.timestamp(); - duration_us_ = activity.duration(); - correlation_id_ = activity.correlationId(); - return *this; - } + KinetoEvent& activity(const libkineto::TraceActivity& activity); std::string name() const { return name_; From 76ee80c437969f3e10fb28239cd5fa85a8ca5e18 Mon Sep 17 00:00:00 2001 From: ilia-cher Date: Tue, 3 Nov 2020 08:48:28 -0800 Subject: [PATCH 25/59] Update on "Use libkineto in profiler" Summary: Adding ability to use Kineto (CUPTI) to profile CUDA kernels Test Plan: USE_KINETO=1 USE_CUDA=1 USE_MKLDNN=1 BLAS=MKL BUILD_BINARY=1 python setup.py develop install python test/test_profiler.py ``` ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ sgemm_32x32x32_NN 0.00% 0.000us 0.00% 0.000us 0.000us 12.000us 63.16% 12.000us 12.000us 1 void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 2.750us 14.47% 2.750us 2.750us 1 Memcpy HtoD (Pagable -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 2.250us 11.84% 2.250us 2.250us 1 Memcpy DtoH (Device -> Pagable) 0.00% 0.000us 0.00% 0.000us 0.000us 2.000us 10.53% 2.000us 2.000us 1 aten::mm 25.87% 364.400ms 25.87% 364.426ms 364.426ms 0.000us 0.00% 0.000us 0.000us 1 aten::empty 0.00% 39.585us 0.00% 39.585us 19.792us 0.000us 0.00% 0.000us 0.000us 2 aten::stride 0.00% 3.363us 0.00% 3.363us 1.121us 0.000us 0.00% 0.000us 0.000us 3 aten::add 74.12% 1.044s 74.12% 1.044s 1.044s 0.000us 0.00% 0.000us 0.000us 1 aten::to 0.00% 13.155us 0.01% 116.398us 116.398us 0.000us 0.00% 0.000us 0.000us 1 aten::empty_strided 0.00% 30.365us 0.00% 30.365us 30.365us 0.000us 0.00% 0.000us 0.000us 1 aten::copy_ 0.01% 72.878us 0.01% 72.878us 72.878us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ``` [ghstack-poisoned] --- torch/csrc/autograd/init.cpp | 2 +- torch/csrc/autograd/profiler.cpp | 705 ++++++++---------------- torch/csrc/autograd/profiler.h | 331 ++++------- torch/csrc/autograd/profiler_kineto.cpp | 206 +++++++ torch/csrc/autograd/profiler_kineto.h | 186 +++++++ 5 files changed, 737 insertions(+), 693 deletions(-) create mode 100644 torch/csrc/autograd/profiler_kineto.cpp create mode 100644 torch/csrc/autograd/profiler_kineto.h diff --git a/torch/csrc/autograd/init.cpp b/torch/csrc/autograd/init.cpp index b844c4349fc6..56fefb103c37 100644 --- a/torch/csrc/autograd/init.cpp +++ b/torch/csrc/autograd/init.cpp @@ -4,7 +4,7 @@ #include #include #include -#include +#include #include #include diff --git a/torch/csrc/autograd/profiler.cpp b/torch/csrc/autograd/profiler.cpp index 484e13b607eb..6eb6b37d11f6 100644 --- a/torch/csrc/autograd/profiler.cpp +++ b/torch/csrc/autograd/profiler.cpp @@ -21,60 +21,35 @@ #include -#ifdef USE_KINETO -#include "libkineto.h" -#endif - namespace torch { namespace autograd { namespace profiler { -namespace { - -enum EventIValueIdx { - 
KIND = 0, - NAME, - THREAD_ID, - HANDLE, - NODE_ID, - CPU_MEM_USAGE, - CPU_NS, - CUDA_RECORDED, - CUDA_MEM_USAGE, - CUDA_DEVICE, - CUDA_US, - SHAPES, - NUM_EVENT_IVALUE_IDX // must be last in list -}; - -enum ProfilerIValueIdx { - STATE = 0, - REPORT_INPUT_SHAPES, - PROFILE_MEMORY, - NUM_PROFILER_CFG_IVALUE_IDX // must be last in list -}; - -const std::unordered_set disable_cuda_profiling = { - "aten::view", - "aten::t", - "aten::transpose", - "aten::stride", - "aten::empty", - "aten::empty_like", - "aten::empty_strided", - "aten::as_strided", - "aten::expand", - "aten::resize_", - "aten::squeeze", - "aten::unsqueeze", - "aten::slice", - "aten::_unsafe_view", - "aten::size" -}; +std::vector prepareCallstack(const std::vector& cs) { + std::vector entries; + entries.reserve(cs.size()); + for (const auto& entry : cs) { + auto& range = entry.range; + if (range.source()) { + auto& src = range.source(); + if (src && src->filename()) { + auto line = src->starting_line_no() + + src->lineno_for_offset(range.start()); + entries.emplace_back(FileLineFunc{*(src->filename()), line, entry.filename}); + } + } + } + return entries; +} -CUDAStubs default_stubs; -constexpr CUDAStubs* default_stubs_addr = &default_stubs; -// Constant initialization, so it is guaranteed to be initialized before -// static initialization calls which may invoke registerCUDAMethods -static CUDAStubs* cuda_stubs = default_stubs_addr; +std::vector callstackStr(const std::vector& cs) { + std::vector cs_str; + cs_str.reserve(cs.size()); + for (const auto& entry : cs) { + std::stringstream loc; + loc << entry.filename << "(" << entry.line << "): " << entry.funcname; + cs_str.push_back(loc.str()); + } + return cs_str; +} // We decompose the profiler logic into the following components: // @@ -167,313 +142,254 @@ static CUDAStubs* cuda_stubs = default_stubs_addr; // - save profiling events into the profiling state // -struct FileLineFunc { - std::string filename; - size_t line; - std::string funcname; -}; - -// TODO: figure if we can use TLS -std::atomic corr_id_ {1}; -uint64_t next_correlation_id() { - return corr_id_++; -} - -#ifdef USE_KINETO -struct KinetoObserverContext : public at::ObserverContext { - int64_t startUs; - uint64_t correlationId; - uint64_t startThreadId; - uint64_t endThreadId; - c10::optional>> shapes; - int64_t sequenceNr; - uint64_t fwdThreadId; - uint8_t recFunScope; - c10::optional> stack; -}; -#endif - -std::vector prepareCallstack(const std::vector& cs) { - std::vector entries; - entries.reserve(cs.size()); - for (const auto& entry : cs) { - auto& range = entry.range; - if (range.source()) { - auto& src = range.source(); - if (src && src->filename()) { - auto line = src->starting_line_no() + - src->lineno_for_offset(range.start()); - entries.emplace_back(FileLineFunc{*(src->filename()), line, entry.filename}); - } - } - } - return entries; +namespace { +CUDAStubs default_stubs; +constexpr CUDAStubs* default_stubs_addr = &default_stubs; +// Constant initialization, so it is guaranteed to be initialized before +// static initialization calls which may invoke registerCUDAMethods +static CUDAStubs* cuda_stubs = default_stubs_addr; } -std::vector callstackStr(const std::vector& cs) { - std::vector cs_str; - cs_str.reserve(cs.size()); - for (const auto& entry : cs) { - std::stringstream loc; - loc << entry.filename << "(" << entry.line << "): " << entry.funcname; - cs_str.push_back(loc.str()); - } - return cs_str; +// Profiler state +inline const ProfilerConfig& ProfilerThreadLocalState::config() const { + return 
config_;
}

thread_event_lists ProfilerThreadLocalState::consolidate() {
  std::lock_guard<std::mutex> g(state_mutex_);
  thread_event_lists result;
  for (auto& kv : event_lists_map_) {
    auto& list = kv.second;
    result.emplace_back(list->consolidate());
  }
  // Consolidate remote events if applicable as well.
  if (remoteProfiledEvents_) {
    result.insert(
        result.end(),
        std::make_move_iterator(remoteProfiledEvents_->begin()),
        std::make_move_iterator(remoteProfiledEvents_->end()));
  }
  return result;
}

void ProfilerThreadLocalState::mark(std::string name, bool include_cuda) {
  if (config_.state == ProfilerState::Disabled) {
    return;
  }
  if (config_.state == ProfilerState::NVTX) {
    cuda_stubs->nvtxMarkA(name.c_str());
  } else {
    LegacyEvent evt(
        EventKind::Mark,
        at::StringView(std::move(name)),
        at::RecordFunction::currentThreadId(),
        include_cuda && config_.state == ProfilerState::CUDA);
    evt.setNodeId(at::RecordFunction::getDefaultNodeId());
    getEventList().record(std::move(evt));
  }
}

void ProfilerThreadLocalState::setOrAddRemoteProfiledEvents(
    std::vector<LegacyEvent>&& remoteProfiledEvents) {
  // Lock to serialize access from multiple callback threads.
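  // remoteProfiledEvents_ accumulates event lists shipped back over RPC from
  // other nodes (see the request_callback_no_python.cpp hunk earlier);
  // consolidate() above appends them after the local per-thread lists so
  // remote work lands in the same report.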
- std::lock_guard guard(state_mutex_); - if (remoteProfiledEvents_) { - (*remoteProfiledEvents_).emplace_back(remoteProfiledEvents); - } else { - remoteProfiledEvents_ = {std::move(remoteProfiledEvents)}; - } +void ProfilerThreadLocalState::pushRange( + const at::RecordFunction& fn, + const bool record_cuda, + const char* msg, + std::vector>&& shapes) { + if (config_.state == ProfilerState::Disabled) { + return; } - - void pushRange( - const at::RecordFunction& fn, - const bool record_cuda, - const char* msg = "", - std::vector>&& shapes = {}) { - if (config_.state == ProfilerState::Disabled) { - return; - } - if (config_.state == ProfilerState::NVTX) { - cuda_stubs->nvtxRangePushA(getNvtxStr( - fn.name(), msg, fn.seqNr(), shapes).c_str()); - } else { - LegacyEvent evt( - EventKind::PushRange, - fn.name(), - at::RecordFunction::currentThreadId(), - record_cuda, - fn.handle(), - std::move(shapes), - at::RecordFunction::getDefaultNodeId()); - evt.setSequenceNr(fn.seqNr()); - evt.setFwdThreadId(fn.forwardThreadId()); - evt.setScope((uint8_t)fn.scope()); + if (config_.state == ProfilerState::NVTX) { + cuda_stubs->nvtxRangePushA(getNvtxStr( + fn.name(), msg, fn.seqNr(), shapes).c_str()); + } else { + LegacyEvent evt( + EventKind::PushRange, + fn.name(), + at::RecordFunction::currentThreadId(), + record_cuda, + fn.handle(), + std::move(shapes), + at::RecordFunction::getDefaultNodeId()); + evt.setSequenceNr(fn.seqNr()); + evt.setFwdThreadId(fn.forwardThreadId()); + evt.setScope((uint8_t)fn.scope()); #ifndef C10_MOBILE - // backward nodes source range corresponds to the forward node - // TODO: consider using C++ stack trace - if (config_.with_stack && fn.scope() != at::RecordScope::BACKWARD_FUNCTION) { - auto cs = prepareCallstack(jit::currentCallstack()); - if (cs.empty()) { - cs = prepareCallstack(jit::tracer::pythonCallstack()); - } - evt.setStack(callstackStr(cs)); + // backward nodes source range corresponds to the forward node + // TODO: consider using C++ stack trace + if (config_.with_stack && fn.scope() != at::RecordScope::BACKWARD_FUNCTION) { + auto cs = prepareCallstack(jit::currentCallstack()); + if (cs.empty()) { + cs = prepareCallstack(jit::tracer::pythonCallstack()); } -#endif - getEventList().record(std::move(evt)); + evt.setStack(callstackStr(cs)); } +#endif + getEventList().record(std::move(evt)); } +} - void popRange(const at::RecordFunction& fn, const bool record_cuda) { - if (config_.state == ProfilerState::Disabled) { - return; - } - if (config_.state == ProfilerState::NVTX) { - cuda_stubs->nvtxRangePop(); - } else { - // In some cases RecordFunction (and popRange) may be - // called on a different thread than pushRange - // As a convention, we put the async pop on the original - // thread and save current thread id in pop event - LegacyEvent evt( - EventKind::PopRange, - at::StringView(""), - at::RecordFunction::currentThreadId(), - record_cuda, - fn.handle()); - evt.setNodeId(at::RecordFunction::getDefaultNodeId()); - getEventList(fn.threadId()).record(std::move(evt)); - } +void ProfilerThreadLocalState::popRange(const at::RecordFunction& fn, const bool record_cuda) { + if (config_.state == ProfilerState::Disabled) { + return; } - - void setCallbackHandle(at::CallbackHandle handle) { - handle_ = handle; + if (config_.state == ProfilerState::NVTX) { + cuda_stubs->nvtxRangePop(); + } else { + // In some cases RecordFunction (and popRange) may be + // called on a different thread than pushRange + // As a convention, we put the async pop on the original + // thread and save 
current thread id in pop event + LegacyEvent evt( + EventKind::PopRange, + at::StringView(""), + at::RecordFunction::currentThreadId(), + record_cuda, + fn.handle()); + evt.setNodeId(at::RecordFunction::getDefaultNodeId()); + getEventList(fn.threadId()).record(std::move(evt)); } +} - at::CallbackHandle callbackHandle() const { - return handle_; - } +void ProfilerThreadLocalState::setCallbackHandle(at::CallbackHandle handle) { + handle_ = handle; +} - void reportMemoryUsage( - void* /* unused */, - int64_t alloc_size, - c10::Device device) override { - if (config_.profile_memory && config_.state != ProfilerState::Disabled) { - uint64_t thread_id = at::RecordFunction::currentThreadId(); - LegacyEvent evt( - EventKind::MemoryAlloc, - at::StringView(""), - thread_id, - config_.state == ProfilerState::CUDA); - evt.updateMemoryStats(alloc_size, device); - getEventList(thread_id).record(std::move(evt)); - } - } +at::CallbackHandle ProfilerThreadLocalState::callbackHandle() const { + return handle_; +} - bool memoryProfilingEnabled() const override { - return config_.profile_memory; +void ProfilerThreadLocalState::reportMemoryUsage( + void* /* unused */, + int64_t alloc_size, + c10::Device device) { + if (config_.profile_memory && config_.state != ProfilerState::Disabled) { + uint64_t thread_id = at::RecordFunction::currentThreadId(); + LegacyEvent evt( + EventKind::MemoryAlloc, + at::StringView(""), + thread_id, + config_.state == ProfilerState::CUDA); + evt.updateMemoryStats(alloc_size, device); + getEventList(thread_id).record(std::move(evt)); } +} - void reportKinetoClientActivity( - const at::RecordFunction& fn, - const KinetoObserverContext& ctx) { -#ifdef USE_KINETO - if (config_.state == ProfilerState::KINETO) { - libkineto::ClientTraceActivity op; - op.startTime = ctx.startUs; - op.endTime = (getTime() / 1000); - op.opType = std::string(fn.name().str()); - op.device = 0; // CPU - op.correlation = ctx.correlationId; - if (ctx.shapes && !ctx.shapes->empty()) { - //op.inputDims = toStr(*ctx.shapes); // - } - //op.threadId = pthread_self(); - - { - std::lock_guard guard(state_mutex_); - kineto_client_activities_.emplace_back(std::move(op)); - kineto_events_.emplace_back(); - kineto_events_.back() - .startThreadId(ctx.startThreadId) - .endThreadId(ctx.endThreadId) - .sequenceNr(ctx.sequenceNr) - .fwdThreadId(ctx.fwdThreadId) - .scope(ctx.recFunScope); - if (ctx.stack && !ctx.stack->empty()) { - kineto_events_.back().stack(*ctx.stack); - } - } - return; - } -#endif - TORCH_CHECK(false, "Supported only in Kineto profiler"); - } +bool ProfilerThreadLocalState::memoryProfilingEnabled() const { + return config_.profile_memory; +} - private: - std::string getNvtxStr( - const at::StringView& name, - const char* msg, - int64_t sequence_nr, - const std::vector>& shapes) const { - if (sequence_nr >= 0 || shapes.size() > 0) { - std::stringstream s; +std::string ProfilerThreadLocalState::getNvtxStr( + const at::StringView& name, + const char* msg, + int64_t sequence_nr, + const std::vector>& shapes) const { + if (sequence_nr >= 0 || shapes.size() > 0) { + std::stringstream s; #ifdef __HIP_PLATFORM_HCC__ - s << name.str(); + s << name.str(); #endif - if (sequence_nr >= 0) { + if (sequence_nr >= 0) { #ifdef __HIP_PLATFORM_HCC__ - s << msg << sequence_nr; + s << msg << sequence_nr; #else - s << name.str() << msg << sequence_nr; + s << name.str() << msg << sequence_nr; #endif - } - if (shapes.size() > 0) { - s << ", sizes = ["; - for (size_t idx = 0; idx < shapes.size(); ++idx) { - if (shapes[idx].size() > 0) 
{ - s << "["; - for (size_t dim = 0; dim < shapes[idx].size(); ++dim) { - s << shapes[idx][dim]; - if (dim < shapes[idx].size() - 1) { - s << ", "; - } + } + if (shapes.size() > 0) { + s << ", sizes = ["; + for (size_t idx = 0; idx < shapes.size(); ++idx) { + if (shapes[idx].size() > 0) { + s << "["; + for (size_t dim = 0; dim < shapes[idx].size(); ++dim) { + s << shapes[idx][dim]; + if (dim < shapes[idx].size() - 1) { + s << ", "; } - s << "]"; - } else { - s << "[]"; - } - if (idx < shapes.size() - 1) { - s << ", "; } + s << "]"; + } else { + s << "[]"; + } + if (idx < shapes.size() - 1) { + s << ", "; } - s << "]"; } - return s.str(); - } else { - return name.str(); + s << "]"; } + return s.str(); + } else { + return name.str(); } +} - RangeEventList& getEventList(int64_t thread_id = -1) { - if (thread_id < 0) { - thread_id = at::RecordFunction::currentThreadId(); - } - RangeEventList* list_ptr = nullptr; - std::lock_guard guard(state_mutex_); - auto it = event_lists_map_.find(thread_id); - if (it != event_lists_map_.end()) { - list_ptr = it->second.get(); - } else { - auto event_list = std::make_shared(); - event_lists_map_[thread_id] = event_list; - list_ptr = event_list.get(); - } - return *list_ptr; +RangeEventList& ProfilerThreadLocalState::getEventList(int64_t thread_id) { + if (thread_id < 0) { + thread_id = at::RecordFunction::currentThreadId(); } + RangeEventList* list_ptr = nullptr; + std::lock_guard guard(state_mutex_); + auto it = event_lists_map_.find(thread_id); + if (it != event_lists_map_.end()) { + list_ptr = it->second.get(); + } else { + auto event_list = std::make_shared(); + event_lists_map_[thread_id] = event_list; + list_ptr = event_list.get(); + } + return *list_ptr; +} + +namespace { - std::mutex state_mutex_; - std::unordered_map> - event_lists_map_; +enum EventIValueIdx { + KIND = 0, + NAME, + THREAD_ID, + HANDLE, + NODE_ID, + CPU_MEM_USAGE, + CPU_NS, + CUDA_RECORDED, + CUDA_MEM_USAGE, + CUDA_DEVICE, + CUDA_US, + SHAPES, + NUM_EVENT_IVALUE_IDX // must be last in list +}; - ProfilerConfig config_ = ProfilerConfig(ProfilerState::Disabled); - at::CallbackHandle handle_ = 0; - c10::optional>> remoteProfiledEvents_; +enum ProfilerIValueIdx { + STATE = 0, + REPORT_INPUT_SHAPES, + PROFILE_MEMORY, + NUM_PROFILER_CFG_IVALUE_IDX // must be last in list +}; -#ifdef USE_KINETO - std::vector kineto_client_activities_; - std::vector kineto_events_; -#endif +const std::unordered_set disable_cuda_profiling = { + "aten::view", + "aten::t", + "aten::transpose", + "aten::stride", + "aten::empty", + "aten::empty_like", + "aten::empty_strided", + "aten::as_strided", + "aten::expand", + "aten::resize_", + "aten::squeeze", + "aten::unsqueeze", + "aten::slice", + "aten::_unsafe_view", + "aten::size" }; ProfilerThreadLocalState* getProfilerTLSState() { @@ -539,66 +455,6 @@ void pushProfilingCallbacksLegacy() { state_ptr->setCallbackHandle(handle); } -#ifdef USE_KINETO -void pushProfilingCallbacks() { - auto state_ptr = getProfilerTLSState(); - TORCH_INTERNAL_ASSERT(state_ptr, "Expected profiler state set"); - auto handle = at::addThreadLocalCallback(at::RecordFunctionCallback( - [](const at::RecordFunction& fn) { - auto state_ptr = getProfilerTLSState(); - if (!state_ptr || state_ptr->config().state != ProfilerState::KINETO) { - return std::make_unique(); - } - - auto corr_id = next_correlation_id(); - libkineto::api().pushCorrelationId(corr_id); - - auto ctx_ptr = std::make_unique(); - ctx_ptr->startUs = getTime() / 1000; - ctx_ptr->correlationId = corr_id; - ctx_ptr->startThreadId = 
at::RecordFunction::currentThreadId(); - - if (state_ptr->config().report_input_shapes) { - ctx_ptr->shapes = inputSizes(fn); - } - - ctx_ptr->sequenceNr = fn.seqNr(); - ctx_ptr->fwdThreadId = fn.forwardThreadId(); - ctx_ptr->recFunScope = (uint8_t)fn.scope(); - -#ifndef C10_MOBILE - // backward nodes source range corresponds to the forward node - // TODO: consider using C++ stack trace - if (state_ptr->config().with_stack && - fn.scope() != at::RecordScope::BACKWARD_FUNCTION) { - auto cs = prepareCallstack(jit::currentCallstack()); - if (cs.empty()) { - cs = prepareCallstack(jit::tracer::pythonCallstack()); - } - ctx_ptr->stack = callstackStr(cs); - } -#endif - return ctx_ptr; - }, - [](const at::RecordFunction& fn, at::ObserverContext* ctx_ptr) { - auto state_ptr = getProfilerTLSState(); - if (!state_ptr || state_ptr->config().state != ProfilerState::KINETO) { - return; - } - auto kineto_ctx_ptr = dynamic_cast(ctx_ptr); - TORCH_INTERNAL_ASSERT(kineto_ctx_ptr != nullptr); - - kineto_ctx_ptr->endThreadId = at::RecordFunction::currentThreadId(); - - state_ptr->reportKinetoClientActivity(fn, *kineto_ctx_ptr); - libkineto::api().popCorrelationId(); - }) - .needsInputs(state_ptr->config().report_input_shapes) - .needsIds(true)); - state_ptr->setCallbackHandle(handle); -} -#endif - const int kCUDAWarmupStart = 5; } // namespace @@ -649,14 +505,6 @@ bool profilerEnabled() { return state_ptr && state_ptr->config().state != ProfilerState::Disabled; } -bool kinetoAvailable() { -#ifdef USE_KINETO - return true; -#else - return false; -#endif -} - void enableProfilerLegacy(const ProfilerConfig& new_config) { TORCH_CHECK(new_config.state != ProfilerState::NVTX || cuda_stubs->enabled(), "Can't use NVTX profiler - PyTorch was compiled without CUDA"); @@ -718,91 +566,6 @@ thread_event_lists disableProfilerLegacy(c10::optional p return state_ptr->consolidate(); } -#ifdef USE_KINETO - -void prepareProfiler( - const ProfilerConfig& config, - const std::set& activities) { - TORCH_CHECK(config.state == ProfilerState::KINETO, - "Supported only in Kineto profiler"); - - std::set k_activities; - if (activities.count(ActivityType::CPU)) { - k_activities.insert(libkineto::ActivityType::EXTERNAL_CORRELATION); - k_activities.insert(libkineto::ActivityType::CUDA_RUNTIME); - } - //if (activities.count(ActivityType::CUDA_RUNTIME)) { - // k_activities.insert(libkineto::ActivityType::CUDA_RUNTIME); - //} - if (activities.count(ActivityType::CUDA)) { - k_activities.insert(libkineto::ActivityType::GPU_MEMCPY); - k_activities.insert(libkineto::ActivityType::GPU_MEMSET); - k_activities.insert(libkineto::ActivityType::CONCURRENT_KERNEL); - } - - //if (!libkineto::api().hasProfilerRegistered()) { - // libkineto::api().registerProfiler( - // std::make_unique(false)); - //} - - //libkineto::api().initProfilerIfRegistered(); - libkineto::api().prepareTrace(k_activities); -} - -void enableProfiler( - const ProfilerConfig& config, - const std::set& activities) { - TORCH_CHECK(config.state == ProfilerState::KINETO); - TORCH_CHECK(!activities.empty(), "No activities specified for Kineto profiler"); - - auto state_ptr = getProfilerTLSState(); - TORCH_CHECK(!state_ptr, "Profiler is already enabled on this thread"); - auto state = std::make_shared(config); - c10::ThreadLocalDebugInfo::_push(c10::DebugInfoKind::PROFILER_STATE, state); - - if (activities.count(ActivityType::CPU)) { - pushProfilingCallbacks(); - } - - if (!libkineto::api().traceActive()) { - libkineto::api().startTrace(); - } - - state->mark("__start_profile", false); -} - 
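
The Kineto entry points removed above reappear in profiler_kineto.cpp later in this series. The lifecycle they implement has three phases: prepareTrace() selects activity types before any collection starts, startTrace() begins collection, and stopTrace() flushes it. As a rough illustration of the pairing, here is a minimal RAII sketch in plain C++; TraceHooks and its members are hypothetical stand-ins, not the libkineto API:

```cpp
#include <functional>
#include <utility>

// Hypothetical stand-ins for prepareTrace/startTrace/stopTrace.
struct TraceHooks {
  std::function<void()> prepare;
  std::function<void()> start;
  std::function<void()> stop;
};

// Scope guard mirroring the enableProfiler()/disableProfiler() pairing
// above: collection is configured and started on entry and always
// stopped on scope exit, even if the profiled code throws.
class TraceScope {
 public:
  explicit TraceScope(TraceHooks hooks) : hooks_(std::move(hooks)) {
    if (hooks_.prepare) hooks_.prepare();
    if (hooks_.start) hooks_.start();
  }
  ~TraceScope() {
    if (hooks_.stop) hooks_.stop();
  }
  TraceScope(const TraceScope&) = delete;
  TraceScope& operator=(const TraceScope&) = delete;

 private:
  TraceHooks hooks_;
};

// Usage sketch:
//   TraceScope scope({prepareFn, startFn, stopFn});
//   ... run the workload to be profiled ...
```

The Python-side profile context manager gives the same guarantee at the user level through its __enter__/__exit__ pair.
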
-ProfilerResult disableProfiler() {
-  // all the DebugInfoBase objects are scope based and supposed to use DebugInfoGuard
-  auto state = c10::ThreadLocalDebugInfo::_pop(c10::DebugInfoKind::PROFILER_STATE);
-
-  auto state_ptr = static_cast<ProfilerThreadLocalState*>(state.get());
-  TORCH_CHECK(state_ptr && state_ptr->config().state == ProfilerState::KINETO,
-      "Can't disable Kineto profiler when it's not running");
-
-  if (state_ptr->callbackHandle() > 0) {
-    at::removeCallback(state_ptr->callbackHandle());
-  }
-
-  state_ptr->mark("__stop_profile");
-
-  //auto trace = std::move(libkineto::api().stopTrace());
-  libkineto::api().stopTrace();
-  std::vector<std::shared_ptr<KinetoEvent>> kineto_events; // = filterTrace(trace);
-  auto legacy_events = state_ptr->consolidate();
-  return ProfilerResult(kineto_events, legacy_events);
-}
-
-KinetoEvent& KinetoEvent::activity(const libkineto::TraceActivity& activity) {
-  name_ = activity.name();
-  device_index_ = activity.deviceId();
-  start_us_ = activity.timestamp();
-  duration_us_ = activity.duration();
-  correlation_id_ = activity.correlationId();
-  return *this;
-}
-
-#endif
-
 void addEventList(std::vector<LegacyEvent>&& profiledEvents) {
   auto state_ptr = getProfilerTLSState();
   TORCH_CHECK(state_ptr, "Profiler must be enabled.");
diff --git a/torch/csrc/autograd/profiler.h b/torch/csrc/autograd/profiler.h
index 7ecf3a45f05b..b2468b158694 100644
--- a/torch/csrc/autograd/profiler.h
+++ b/torch/csrc/autograd/profiler.h
@@ -19,17 +19,12 @@
 #endif
 #include
-#include
+
+#include

 struct CUevent_st;
 typedef std::shared_ptr<CUevent_st> CUDAEventStub;

-#ifdef USE_KINETO
-namespace libkineto {
-class TraceActivity;
-}
-#endif
-
 namespace torch { namespace autograd {

 struct Node;

@@ -95,61 +90,6 @@ inline int64_t getTime() {
 #endif
 }

-// A struct to control settings of disableProfiler options.
-struct TORCH_API ProfilerDisableOptions {
-  ProfilerDisableOptions() = default;
-  ProfilerDisableOptions(bool shouldCleanupTLSState, bool shouldConsolidate)
-      : cleanupTLSState(shouldCleanupTLSState),
-        consolidate(shouldConsolidate) {}
-  // Whether we should clean up profiler states that are thread local, such as
-  // ThreadLocalDebugInfo and thread local RecordFunction callbacks.
-  bool cleanupTLSState = true;
-  // Whether we should consolidate all currently recorded profiled events. If
-  // false, will not consolidate and other threads can continue to write to the
-  // event lists.
-  bool consolidate = true;
-};
-
-enum class C10_API_ENUM ProfilerState {
-  Disabled = 0,
-  CPU, // CPU-only profiling
-  CUDA, // CPU + CUDA events
-  NVTX, // only emit NVTX markers
-  KINETO, // use libkineto
-  NUM_PROFILER_STATES, // must be the last one
-};
-
-enum class C10_API_ENUM ActivityType {
-  CPU = 0,
-  // CUDA_RUNTIME, // CUDA host events
-  CUDA, // CUDA kernels
-  NUM_KINETO_ACTIVITIES, // must be the last one
-};
-
-struct TORCH_API ProfilerConfig {
-  ProfilerConfig(
-      ProfilerState state,
-      bool report_input_shapes = false,
-      bool profile_memory = false,
-      bool with_stack = false)
-      : state(state),
-        report_input_shapes(report_input_shapes),
-        profile_memory(profile_memory),
-        with_stack(with_stack) {}
-  ~ProfilerConfig();
-  ProfilerState state;
-  bool report_input_shapes;
-  bool profile_memory;
-  bool with_stack;
-
-  // Returns IValues corresponding to ProfilerConfig struct, to be used for
-  // serialization.
-  at::IValue toIValue() const;
-
-  // Reconstructs a ProfilerConfig from IValues given by toIValue.
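-  // (The serialized form is an IValue list indexed by the ProfilerIValueIdx
-  // enum in profiler.cpp: STATE, REPORT_INPUT_SHAPES, PROFILE_MEMORY;
-  // presumably this is what lets a config cross a process boundary for
-  // remote profiling, cf. setOrAddRemoteProfiledEvents above.)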
- static ProfilerConfig fromIValue(const at::IValue& profilerConfigIValue); -}; - enum class C10_API_ENUM EventKind : uint16_t { Mark, PushRange, @@ -409,6 +349,54 @@ struct RangeEventList { static const size_t kReservedCapacity = 1024; }; +enum class C10_API_ENUM ProfilerState { + Disabled = 0, + CPU, // CPU-only profiling + CUDA, // CPU + CUDA events + NVTX, // only emit NVTX markers + KINETO, // use libkineto + NUM_PROFILER_STATES, // must be the last one +}; + +struct TORCH_API ProfilerConfig { + ProfilerConfig( + ProfilerState state, + bool report_input_shapes = false, + bool profile_memory = false, + bool with_stack = false) + : state(state), + report_input_shapes(report_input_shapes), + profile_memory(profile_memory), + with_stack(with_stack) {} + ~ProfilerConfig(); + ProfilerState state; + bool report_input_shapes; + bool profile_memory; + bool with_stack; + + // Returns IValues corresponding to ProfilerConfig struct, to be used for + // serialization. + at::IValue toIValue() const; + + // Reconstructs a ProfilerConfig from IValues given by toIValue. + static ProfilerConfig fromIValue(const at::IValue& profilerConfigIValue); +}; + +// A struct to control settings of disableProfiler options. +struct TORCH_API ProfilerDisableOptions { + ProfilerDisableOptions() = default; + ProfilerDisableOptions(bool shouldCleanupTLSState, bool shouldConsolidate) + : cleanupTLSState(shouldCleanupTLSState), + consolidate(shouldConsolidate) {} + // Whether we should clean up profiler states that are thread local, such as + // ThreadLocalDebugInfo and thread local RecordFunction callbacks. + bool cleanupTLSState = true; + // Whether we should consolidate all currently recorded profiled events. If + // false, will not consolidate and other threads can continue to write to the + // event lists. + bool consolidate = true; +}; + // NOTE: profiler mode is thread local, with automatic propagation // across thread boundary (e.g. at::launch tasks) TORCH_API void enableProfilerLegacy(const ProfilerConfig&); @@ -425,165 +413,6 @@ TORCH_API ProfilerConfig getProfilerConfig(); // Writes profiled events to a stream. 
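 // (See the RecordProfile usage note further below; that guard appears to be
 // the main consumer of this hook, dumping collected events when it goes out
 // of scope.)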
TORCH_API void writeProfilerEventsToStream(std::ostream& out, const std::vector& events); -#ifdef USE_KINETO -struct TORCH_API KinetoEvent { - uint64_t startThreadId() const { - return start_thread_id_; - } - - uint64_t endThreadId() const { - return end_thread_id_; - } - - c10::DeviceType deviceType() const { - return device_type_; - } - - uint64_t fwdThreadId() const { - return fwd_thread_id_; - } - - bool hasShapes() const { - return shapes_ != c10::nullopt; - } - - const std::vector>& shapes() const { - return *shapes_; - } - - int64_t sequenceNr() const { - return sequence_nr_; - } - - bool hasStack() const { - return stack_ != c10::nullopt; - } - - const std::vector& stack() const { - return *stack_; - } - - uint8_t scope() const { - return scope_; - } - - KinetoEvent& startThreadId(uint64_t start_thread_id) { - start_thread_id_ = start_thread_id; - return *this; - } - - KinetoEvent& endThreadId(uint64_t end_thread_id) { - end_thread_id_ = end_thread_id; - return *this; - } - - KinetoEvent& deviceType(c10::DeviceType device_type) { - device_type_ = device_type; - return *this; - } - - KinetoEvent& fwdThreadId(uint64_t fwd_thread_id) { - fwd_thread_id_ = fwd_thread_id; - return *this; - } - - KinetoEvent& shapes(const std::vector>& shapes) { - *shapes_ = shapes; - return *this; - } - - KinetoEvent& sequenceNr(int64_t sequence_nr) { - sequence_nr_ = sequence_nr_; - return *this; - } - - KinetoEvent& stack(const std::vector& st) { - *stack_ = st; - return *this; - } - - KinetoEvent& scope(uint8_t scope) { - scope_ = scope; - return *this; - } - - // Kineto fields - - KinetoEvent& activity(const libkineto::TraceActivity& activity); - - std::string name() const { - return name_; - } - - uint64_t deviceIndex() const { - return device_index_; - } - - uint64_t startUs() const { - return start_us_; - } - - uint64_t durationUs() const { - return duration_us_; - } - - uint64_t correlationId() const { - return correlation_id_; - } - - KinetoEvent& correlationId(uint64_t correlation_id) { - correlation_id_ = correlation_id; - return *this; - } - - private: - uint64_t start_thread_id_ = 0; - uint64_t end_thread_id_ = 0; - uint64_t fwd_thread_id_ = 0; - int64_t sequence_nr_ = 0; - uint8_t scope_ = 0; - - c10::DeviceType device_type_ = c10::DeviceType::CPU; - c10::optional>> shapes_; - c10::optional> stack_; - - std::string name_; - uint64_t device_index_ = 0; - uint64_t start_us_ = 0; - uint64_t duration_us_ = 0; - uint64_t correlation_id_ = 0; -}; - -struct TORCH_API ProfilerResult { - ProfilerResult( - const std::vector>& events, - const thread_event_lists& legacy_events) - : events_(events), legacy_events_(legacy_events) {} - - const std::vector> events() const { - return events_; - } - - const thread_event_lists& legacy_events() const { - return legacy_events_; - } - - private: - std::vector> events_; - thread_event_lists legacy_events_; // tensor mem alloc, start/stop -}; -TORCH_API void enableProfiler( - const ProfilerConfig& config, - const std::set& activities); -TORCH_API ProfilerResult disableProfiler(); - -TORCH_API void prepareProfiler( - const ProfilerConfig& config, - const std::set& activities); -#endif // USE_KINETO - -TORCH_API bool kinetoAvailable(); - // Usage: // { // RecordProfile guard("filename.trace"); @@ -638,5 +467,65 @@ struct TORCH_API TLSProfilerGuard { const c10::optional profilerDisableOptions_; }; +struct FileLineFunc { + std::string filename; + size_t line; + std::string funcname; +}; +std::vector prepareCallstack(const std::vector& cs); +std::vector 
callstackStr(const std::vector& cs); +std::vector> inputSizes(const at::RecordFunction& fn); + +struct ProfilerThreadLocalState : public c10::MemoryReportingInfoBase { + explicit ProfilerThreadLocalState(const ProfilerConfig& config) + : config_(config), remoteProfiledEvents_{c10::nullopt} {} + ~ProfilerThreadLocalState() override = default; + + inline const ProfilerConfig& config() const; + + thread_event_lists consolidate(); + + void mark(std::string name, bool include_cuda = true); + + void setOrAddRemoteProfiledEvents( + std::vector&& remoteProfiledEvents); + + void pushRange( + const at::RecordFunction& fn, + const bool record_cuda, + const char* msg = "", + std::vector>&& shapes = {}); + + void popRange(const at::RecordFunction& fn, const bool record_cuda); + + void setCallbackHandle(at::CallbackHandle handle); + + at::CallbackHandle callbackHandle() const; + + void reportMemoryUsage( + void* /* unused */, + int64_t alloc_size, + c10::Device device) override; + + bool memoryProfilingEnabled() const override; + private: + std::string getNvtxStr( + const at::StringView& name, + const char* msg, + int64_t sequence_nr, + const std::vector>& shapes) const; + + RangeEventList& getEventList(int64_t thread_id = -1); + + std::mutex state_mutex_; + std::unordered_map> + event_lists_map_; + + ProfilerConfig config_ = ProfilerConfig(ProfilerState::Disabled); + at::CallbackHandle handle_ = 0; + c10::optional>> remoteProfiledEvents_; +}; + + } // namespace profiler }} // namespace torch::autograd diff --git a/torch/csrc/autograd/profiler_kineto.cpp b/torch/csrc/autograd/profiler_kineto.cpp new file mode 100644 index 000000000000..171cea0d3496 --- /dev/null +++ b/torch/csrc/autograd/profiler_kineto.cpp @@ -0,0 +1,206 @@ +#include + +#ifdef USE_KINETO +#include "libkineto.h" + +namespace torch { namespace autograd { namespace profiler { + +namespace { +// TODO: TLS +std::atomic corr_id_ {1}; +uint64_t next_correlation_id() { + return corr_id_++; +} + +struct KinetoObserverContext : public at::ObserverContext { + int64_t startUs; + uint64_t correlationId; + uint64_t startThreadId; + uint64_t endThreadId; + c10::optional>> shapes; + int64_t sequenceNr; + uint64_t fwdThreadId; + uint8_t recFunScope; + c10::optional> stack; +}; + +void reportKinetoClientActivity( + const at::RecordFunction& fn, + const KinetoObserverContext& ctx) { + /*TORCH_CHECK((config_.state == ProfilerState::KINETO, + "Supported only in Kineto profiler"); + op.startTime = ctx.startUs; + op.endTime = (getTime() / 1000); + op.opType = std::string(fn.name().str()); + op.device = 0; // CPU + op.correlation = ctx.correlationId; + if (ctx.shapes && !ctx.shapes->empty()) { + //op.inputDims = toStr(*ctx.shapes); // + } + //op.threadId = pthread_self(); + + { + std::lock_guard guard(state_mutex_); + kineto_client_activities_.emplace_back(std::move(op)); + kineto_events_.emplace_back(); + kineto_events_.back() + .startThreadId(ctx.startThreadId) + .endThreadId(ctx.endThreadId) + .sequenceNr(ctx.sequenceNr) + .fwdThreadId(ctx.fwdThreadId) + .scope(ctx.recFunScope); + if (ctx.stack && !ctx.stack->empty()) { + kineto_events_.back().stack(*ctx.stack); + } + }*/ +} + +void pushProfilingCallbacks() { + auto state_ptr = getProfilerTLSState(); + TORCH_INTERNAL_ASSERT(state_ptr, "Expected profiler state set"); + auto handle = at::addThreadLocalCallback(at::RecordFunctionCallback( + [](const at::RecordFunction& fn) { + auto state_ptr = getProfilerTLSState(); + if (!state_ptr || state_ptr->config().state != ProfilerState::KINETO) { + return 
std::make_unique(); + } + + auto corr_id = next_correlation_id(); + libkineto::api().pushCorrelationId(corr_id); + + auto ctx_ptr = std::make_unique(); + ctx_ptr->startUs = getTime() / 1000; + ctx_ptr->correlationId = corr_id; + ctx_ptr->startThreadId = at::RecordFunction::currentThreadId(); + + if (state_ptr->config().report_input_shapes) { + ctx_ptr->shapes = inputSizes(fn); + } + + ctx_ptr->sequenceNr = fn.seqNr(); + ctx_ptr->fwdThreadId = fn.forwardThreadId(); + ctx_ptr->recFunScope = (uint8_t)fn.scope(); + +#ifndef C10_MOBILE + // backward nodes source range corresponds to the forward node + // TODO: consider using C++ stack trace + if (state_ptr->config().with_stack && + fn.scope() != at::RecordScope::BACKWARD_FUNCTION) { + auto cs = prepareCallstack(jit::currentCallstack()); + if (cs.empty()) { + cs = prepareCallstack(jit::tracer::pythonCallstack()); + } + ctx_ptr->stack = callstackStr(cs); + } +#endif + return ctx_ptr; + }, + [](const at::RecordFunction& fn, at::ObserverContext* ctx_ptr) { + auto state_ptr = getProfilerTLSState(); + if (!state_ptr || state_ptr->config().state != ProfilerState::KINETO) { + return; + } + auto kineto_ctx_ptr = dynamic_cast(ctx_ptr); + TORCH_INTERNAL_ASSERT(kineto_ctx_ptr != nullptr); + + kineto_ctx_ptr->endThreadId = at::RecordFunction::currentThreadId(); + + state_ptr->reportKinetoClientActivity(fn, *kineto_ctx_ptr); + libkineto::api().popCorrelationId(); + }) + .needsInputs(state_ptr->config().report_input_shapes) + .needsIds(true)); + state_ptr->setCallbackHandle(handle); +} + +} // namespace + +void prepareProfiler( + const ProfilerConfig& config, + const std::set& activities) { + TORCH_CHECK(config.state == ProfilerState::KINETO, + "Supported only in Kineto profiler"); + + std::set k_activities; + if (activities.count(ActivityType::CPU)) { + k_activities.insert(libkineto::ActivityType::EXTERNAL_CORRELATION); + k_activities.insert(libkineto::ActivityType::CUDA_RUNTIME); + } + //if (activities.count(ActivityType::CUDA_RUNTIME)) { + // k_activities.insert(libkineto::ActivityType::CUDA_RUNTIME); + //} + if (activities.count(ActivityType::CUDA)) { + k_activities.insert(libkineto::ActivityType::GPU_MEMCPY); + k_activities.insert(libkineto::ActivityType::GPU_MEMSET); + k_activities.insert(libkineto::ActivityType::CONCURRENT_KERNEL); + } + + //if (!libkineto::api().hasProfilerRegistered()) { + // libkineto::api().registerProfiler( + // std::make_unique(false)); + //} + //libkineto::api().initProfilerIfRegistered(); + libkineto::api().prepareTrace(k_activities); +} + +void enableProfiler( + const ProfilerConfig& config, + const std::set& activities) { + TORCH_CHECK(config.state == ProfilerState::KINETO); + TORCH_CHECK(!activities.empty(), "No activities specified for Kineto profiler"); + + auto state_ptr = getProfilerTLSState(); + TORCH_CHECK(!state_ptr, "Profiler is already enabled on this thread"); + auto state = std::make_shared(config); + c10::ThreadLocalDebugInfo::_push(c10::DebugInfoKind::PROFILER_STATE, state); + + if (activities.count(ActivityType::CPU)) { + pushProfilingCallbacks(); + } + + if (!libkineto::api().traceActive()) { + libkineto::api().startTrace(); + } + + state->mark("__start_profile", false); +} + +ProfilerResult disableProfiler() { + // all the DebugInfoBase objects are scope based and supposed to use DebugInfoGuard + auto state = c10::ThreadLocalDebugInfo::_pop(c10::DebugInfoKind::PROFILER_STATE); + + auto state_ptr = static_cast(state.get()); + TORCH_CHECK(state_ptr && state_ptr->config().state == ProfilerState::KINETO, + "Can't 
disable Kineto profiler when it's not running"); + + if (state_ptr->callbackHandle() > 0) { + at::removeCallback(state_ptr->callbackHandle()); + } + + state_ptr->mark("__stop_profile"); + + //auto trace = std::move(libkineto::api().stopTrace()); + libkineto::api().stopTrace(); + std::vector> kineto_events; // = filterTrace(trace); + auto legacy_events = state_ptr->consolidate(); + return ProfilerResult(kineto_events, legacy_events); +} + +KinetoEvent& KinetoEvent::activity(const libkineto::TraceActivity& activity) { + name_ = activity.name(); + device_index_ = activity.deviceId(); + start_us_ = activity.timestamp(); + duration_us_ = activity.duration(); + correlation_id_ = activity.correlationId(); + return *this; +} + +bool kinetoAvailable() { +#ifdef USE_KINETO + return true; +#else + return false; +#endif +} + +}}} diff --git a/torch/csrc/autograd/profiler_kineto.h b/torch/csrc/autograd/profiler_kineto.h new file mode 100644 index 000000000000..671a2dc5cc0e --- /dev/null +++ b/torch/csrc/autograd/profiler_kineto.h @@ -0,0 +1,186 @@ +#pragma once + +#include + +#include + +#ifdef USE_KINETO +namespace libkineto { +class TraceActivity; +} +#endif + +namespace torch { +namespace autograd { +namespace profiler { + +enum class C10_API_ENUM ActivityType { + CPU = 0, + // CUDA_RUNTIME, // CUDA host events + CUDA, // CUDA kernels + NUM_KINETO_ACTIVITIES, // must be the last one +}; + +#ifdef USE_KINETO +struct TORCH_API KinetoEvent { + uint64_t startThreadId() const { + return start_thread_id_; + } + + uint64_t endThreadId() const { + return end_thread_id_; + } + + c10::DeviceType deviceType() const { + return device_type_; + } + + uint64_t fwdThreadId() const { + return fwd_thread_id_; + } + + bool hasShapes() const { + return shapes_ != c10::nullopt; + } + + const std::vector>& shapes() const { + return *shapes_; + } + + int64_t sequenceNr() const { + return sequence_nr_; + } + + bool hasStack() const { + return stack_ != c10::nullopt; + } + + const std::vector& stack() const { + return *stack_; + } + + uint8_t scope() const { + return scope_; + } + + KinetoEvent& startThreadId(uint64_t start_thread_id) { + start_thread_id_ = start_thread_id; + return *this; + } + + KinetoEvent& endThreadId(uint64_t end_thread_id) { + end_thread_id_ = end_thread_id; + return *this; + } + + KinetoEvent& deviceType(c10::DeviceType device_type) { + device_type_ = device_type; + return *this; + } + + KinetoEvent& fwdThreadId(uint64_t fwd_thread_id) { + fwd_thread_id_ = fwd_thread_id; + return *this; + } + + KinetoEvent& shapes(const std::vector>& shapes) { + *shapes_ = shapes; + return *this; + } + + KinetoEvent& sequenceNr(int64_t sequence_nr) { + sequence_nr_ = sequence_nr_; + return *this; + } + + KinetoEvent& stack(const std::vector& st) { + *stack_ = st; + return *this; + } + + KinetoEvent& scope(uint8_t scope) { + scope_ = scope; + return *this; + } + + // Kineto fields + + KinetoEvent& activity(const libkineto::TraceActivity& activity); + + std::string name() const { + return name_; + } + + uint64_t deviceIndex() const { + return device_index_; + } + + uint64_t startUs() const { + return start_us_; + } + + uint64_t durationUs() const { + return duration_us_; + } + + uint64_t correlationId() const { + return correlation_id_; + } + + KinetoEvent& correlationId(uint64_t correlation_id) { + correlation_id_ = correlation_id; + return *this; + } + + private: + uint64_t start_thread_id_ = 0; + uint64_t end_thread_id_ = 0; + uint64_t fwd_thread_id_ = 0; + int64_t sequence_nr_ = 0; + uint8_t scope_ = 0; + + 
c10::DeviceType device_type_ = c10::DeviceType::CPU; + c10::optional>> shapes_; + c10::optional> stack_; + + std::string name_; + uint64_t device_index_ = 0; + uint64_t start_us_ = 0; + uint64_t duration_us_ = 0; + uint64_t correlation_id_ = 0; +}; + +struct TORCH_API ProfilerResult { + ProfilerResult( + const std::vector>& events, + const thread_event_lists& legacy_events) + : events_(events), legacy_events_(legacy_events) {} + + const std::vector> events() const { + return events_; + } + + const thread_event_lists& legacy_events() const { + return legacy_events_; + } + + private: + std::vector> events_; + thread_event_lists legacy_events_; // tensor mem alloc, start/stop +}; + +TORCH_API void enableProfiler( + const ProfilerConfig& config, + const std::set& activities); + +TORCH_API ProfilerResult disableProfiler(); + +TORCH_API void prepareProfiler( + const ProfilerConfig& config, + const std::set& activities); +#endif // USE_KINETO + +TORCH_API bool kinetoAvailable(); + +} // namespace profiler +}} // namespace torch::autograd From 5761ea2617caec4f7a0181827e61b156ed99fbf6 Mon Sep 17 00:00:00 2001 From: ilia-cher Date: Tue, 3 Nov 2020 09:07:37 -0800 Subject: [PATCH 26/59] Update on "Use libkineto in profiler" Summary: Adding ability to use Kineto (CUPTI) to profile CUDA kernels Test Plan: USE_KINETO=1 USE_CUDA=1 USE_MKLDNN=1 BLAS=MKL BUILD_BINARY=1 python setup.py develop install python test/test_profiler.py ``` ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ sgemm_32x32x32_NN 0.00% 0.000us 0.00% 0.000us 0.000us 12.000us 63.16% 12.000us 12.000us 1 void at::native::vectorized_elementwise_kernel<4, at... 
0.00% 0.000us 0.00% 0.000us 0.000us 2.750us 14.47% 2.750us 2.750us 1 Memcpy HtoD (Pagable -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 2.250us 11.84% 2.250us 2.250us 1 Memcpy DtoH (Device -> Pagable) 0.00% 0.000us 0.00% 0.000us 0.000us 2.000us 10.53% 2.000us 2.000us 1 aten::mm 25.87% 364.400ms 25.87% 364.426ms 364.426ms 0.000us 0.00% 0.000us 0.000us 1 aten::empty 0.00% 39.585us 0.00% 39.585us 19.792us 0.000us 0.00% 0.000us 0.000us 2 aten::stride 0.00% 3.363us 0.00% 3.363us 1.121us 0.000us 0.00% 0.000us 0.000us 3 aten::add 74.12% 1.044s 74.12% 1.044s 1.044s 0.000us 0.00% 0.000us 0.000us 1 aten::to 0.00% 13.155us 0.01% 116.398us 116.398us 0.000us 0.00% 0.000us 0.000us 1 aten::empty_strided 0.00% 30.365us 0.00% 30.365us 30.365us 0.000us 0.00% 0.000us 0.000us 1 aten::copy_ 0.01% 72.878us 0.01% 72.878us 72.878us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ``` [ghstack-poisoned] --- tools/build_variables.bzl | 1 + torch/csrc/autograd/profiler.h | 15 ++++++++++----- torch/csrc/autograd/profiler_kineto.cpp | 11 ++++++++++- 3 files changed, 21 insertions(+), 6 deletions(-) diff --git a/tools/build_variables.bzl b/tools/build_variables.bzl index 65f5ec1c6903..fd04c1326263 100644 --- a/tools/build_variables.bzl +++ b/tools/build_variables.bzl @@ -75,6 +75,7 @@ jit_core_sources = [ core_sources_common = [ "torch/csrc/autograd/profiler.cpp", + "torch/csrc/autograd/profiler_kineto.cpp", "torch/csrc/jit/frontend/edit_distance.cpp", "torch/csrc/jit/frontend/string_to_type.cpp", "torch/csrc/jit/mobile/type_parser.cpp", diff --git a/torch/csrc/autograd/profiler.h b/torch/csrc/autograd/profiler.h index b2468b158694..ca0a42f823a1 100644 --- a/torch/csrc/autograd/profiler.h +++ b/torch/csrc/autograd/profiler.h @@ -467,16 +467,16 @@ struct TORCH_API TLSProfilerGuard { const c10::optional profilerDisableOptions_; }; -struct FileLineFunc { +struct TORCH_API FileLineFunc { std::string filename; size_t line; std::string funcname; }; -std::vector prepareCallstack(const std::vector& cs); -std::vector callstackStr(const std::vector& cs); -std::vector> inputSizes(const at::RecordFunction& fn); +TORCH_API std::vector prepareCallstack(const std::vector& cs); +TORCH_API std::vector callstackStr(const std::vector& cs); +TORCH_API std::vector> inputSizes(const at::RecordFunction& fn); -struct ProfilerThreadLocalState : public c10::MemoryReportingInfoBase { +struct TORCH_API ProfilerThreadLocalState : public c10::MemoryReportingInfoBase { explicit ProfilerThreadLocalState(const ProfilerConfig& config) : config_(config), remoteProfiledEvents_{c10::nullopt} {} ~ProfilerThreadLocalState() override = default; @@ -508,6 +508,11 @@ struct ProfilerThreadLocalState : public c10::MemoryReportingInfoBase { c10::Device device) override; bool memoryProfilingEnabled() const override; + + virtual void reportClientActivity( + const at::RecordFunction& fn, + const at::ObserverContext& ctx) {} + private: std::string getNvtxStr( const at::StringView& name, diff --git a/torch/csrc/autograd/profiler_kineto.cpp b/torch/csrc/autograd/profiler_kineto.cpp index 171cea0d3496..244f3a851340 100644 --- a/torch/csrc/autograd/profiler_kineto.cpp +++ b/torch/csrc/autograd/profiler_kineto.cpp @@ -1,5 +1,8 @@ #include +#include +#include + #ifdef USE_KINETO #include "libkineto.h" @@ -55,6 +58,11 @@ void reportKinetoClientActivity( }*/ } +ProfilerThreadLocalState* getProfilerTLSState() 
{ + const auto& state = c10::ThreadLocalDebugInfo::get(c10::DebugInfoKind::PROFILER_STATE); + return dynamic_cast(state.get()); +} + void pushProfilingCallbacks() { auto state_ptr = getProfilerTLSState(); TORCH_INTERNAL_ASSERT(state_ptr, "Expected profiler state set"); @@ -105,7 +113,7 @@ void pushProfilingCallbacks() { kineto_ctx_ptr->endThreadId = at::RecordFunction::currentThreadId(); - state_ptr->reportKinetoClientActivity(fn, *kineto_ctx_ptr); + //state_ptr->reportKinetoClientActivity(fn, *kineto_ctx_ptr); libkineto::api().popCorrelationId(); }) .needsInputs(state_ptr->config().report_input_shapes) @@ -194,6 +202,7 @@ KinetoEvent& KinetoEvent::activity(const libkineto::TraceActivity& activity) { correlation_id_ = activity.correlationId(); return *this; } +#endif bool kinetoAvailable() { #ifdef USE_KINETO From dde5ec3c7684893530774ec04469077a829ec7d5 Mon Sep 17 00:00:00 2001 From: ilia-cher Date: Tue, 3 Nov 2020 09:20:11 -0800 Subject: [PATCH 27/59] Update on "Use libkineto in profiler" Summary: Adding ability to use Kineto (CUPTI) to profile CUDA kernels Test Plan: USE_KINETO=1 USE_CUDA=1 USE_MKLDNN=1 BLAS=MKL BUILD_BINARY=1 python setup.py develop install python test/test_profiler.py ``` ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ sgemm_32x32x32_NN 0.00% 0.000us 0.00% 0.000us 0.000us 12.000us 63.16% 12.000us 12.000us 1 void at::native::vectorized_elementwise_kernel<4, at... 
0.00% 0.000us 0.00% 0.000us 0.000us 2.750us 14.47% 2.750us 2.750us 1 Memcpy HtoD (Pagable -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 2.250us 11.84% 2.250us 2.250us 1 Memcpy DtoH (Device -> Pagable) 0.00% 0.000us 0.00% 0.000us 0.000us 2.000us 10.53% 2.000us 2.000us 1 aten::mm 25.87% 364.400ms 25.87% 364.426ms 364.426ms 0.000us 0.00% 0.000us 0.000us 1 aten::empty 0.00% 39.585us 0.00% 39.585us 19.792us 0.000us 0.00% 0.000us 0.000us 2 aten::stride 0.00% 3.363us 0.00% 3.363us 1.121us 0.000us 0.00% 0.000us 0.000us 3 aten::add 74.12% 1.044s 74.12% 1.044s 1.044s 0.000us 0.00% 0.000us 0.000us 1 aten::to 0.00% 13.155us 0.01% 116.398us 116.398us 0.000us 0.00% 0.000us 0.000us 1 aten::empty_strided 0.00% 30.365us 0.00% 30.365us 30.365us 0.000us 0.00% 0.000us 0.000us 1 aten::copy_ 0.01% 72.878us 0.01% 72.878us 72.878us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ``` [ghstack-poisoned] --- torch/csrc/autograd/profiler.cpp | 38 ++++++++++++++++---------------- torch/csrc/autograd/profiler.h | 2 +- 2 files changed, 20 insertions(+), 20 deletions(-) diff --git a/torch/csrc/autograd/profiler.cpp b/torch/csrc/autograd/profiler.cpp index 6eb6b37d11f6..2fe3bdb451cc 100644 --- a/torch/csrc/autograd/profiler.cpp +++ b/torch/csrc/autograd/profiler.cpp @@ -151,7 +151,7 @@ static CUDAStubs* cuda_stubs = default_stubs_addr; } // Profiler state -inline const ProfilerConfig& ProfilerThreadLocalState::config() const { +const ProfilerConfig& ProfilerThreadLocalState::config() const { return config_; } @@ -349,6 +349,24 @@ RangeEventList& ProfilerThreadLocalState::getEventList(int64_t thread_id) { return *list_ptr; } +std::vector> inputSizes(const at::RecordFunction& fn) { + std::vector> sizes; + sizes.reserve(fn.inputs().size()); + for (const c10::IValue& input : fn.inputs()) { + if (!input.isTensor()) { + sizes.emplace_back(); + continue; + } + const at::Tensor& tensor = input.toTensor(); + if (tensor.defined()) { + sizes.push_back(input.toTensor().sizes().vec()); + } else { + sizes.emplace_back(); + } + } + return sizes; +} + namespace { enum EventIValueIdx { @@ -397,24 +415,6 @@ ProfilerThreadLocalState* getProfilerTLSState() { return dynamic_cast(state.get()); } -std::vector> inputSizes(const at::RecordFunction& fn) { - std::vector> sizes; - sizes.reserve(fn.inputs().size()); - for (const c10::IValue& input : fn.inputs()) { - if (!input.isTensor()) { - sizes.emplace_back(); - continue; - } - const at::Tensor& tensor = input.toTensor(); - if (tensor.defined()) { - sizes.push_back(input.toTensor().sizes().vec()); - } else { - sizes.emplace_back(); - } - } - return sizes; -} - void pushProfilingCallbacksLegacy() { auto state_ptr = getProfilerTLSState(); TORCH_INTERNAL_ASSERT(state_ptr, "Expected profiler state set"); diff --git a/torch/csrc/autograd/profiler.h b/torch/csrc/autograd/profiler.h index ca0a42f823a1..1c0694e3c8a9 100644 --- a/torch/csrc/autograd/profiler.h +++ b/torch/csrc/autograd/profiler.h @@ -481,7 +481,7 @@ struct TORCH_API ProfilerThreadLocalState : public c10::MemoryReportingInfoBase : config_(config), remoteProfiledEvents_{c10::nullopt} {} ~ProfilerThreadLocalState() override = default; - inline const ProfilerConfig& config() const; + const ProfilerConfig& config() const; thread_event_lists consolidate(); From 3a25bd2a0a81cb08dc327789cab1639042354a7e Mon Sep 17 00:00:00 2001 From: ilia-cher Date: Tue, 3 Nov 2020 09:50:54 
-0800 Subject: [PATCH 28/59] Update on "Use libkineto in profiler" Summary: Adding ability to use Kineto (CUPTI) to profile CUDA kernels Test Plan: USE_KINETO=1 USE_CUDA=1 USE_MKLDNN=1 BLAS=MKL BUILD_BINARY=1 python setup.py develop install python test/test_profiler.py ``` ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ sgemm_32x32x32_NN 0.00% 0.000us 0.00% 0.000us 0.000us 12.000us 63.16% 12.000us 12.000us 1 void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 2.750us 14.47% 2.750us 2.750us 1 Memcpy HtoD (Pagable -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 2.250us 11.84% 2.250us 2.250us 1 Memcpy DtoH (Device -> Pagable) 0.00% 0.000us 0.00% 0.000us 0.000us 2.000us 10.53% 2.000us 2.000us 1 aten::mm 25.87% 364.400ms 25.87% 364.426ms 364.426ms 0.000us 0.00% 0.000us 0.000us 1 aten::empty 0.00% 39.585us 0.00% 39.585us 19.792us 0.000us 0.00% 0.000us 0.000us 2 aten::stride 0.00% 3.363us 0.00% 3.363us 1.121us 0.000us 0.00% 0.000us 0.000us 3 aten::add 74.12% 1.044s 74.12% 1.044s 1.044s 0.000us 0.00% 0.000us 0.000us 1 aten::to 0.00% 13.155us 0.01% 116.398us 116.398us 0.000us 0.00% 0.000us 0.000us 1 aten::empty_strided 0.00% 30.365us 0.00% 30.365us 30.365us 0.000us 0.00% 0.000us 0.000us 1 aten::copy_ 0.01% 72.878us 0.01% 72.878us 72.878us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ``` [ghstack-poisoned] --- torch/csrc/autograd/profiler.h | 4 +- torch/csrc/autograd/profiler_kineto.cpp | 83 ++++++++++++------------- torch/csrc/autograd/profiler_kineto.h | 13 ++++ 3 files changed, 56 insertions(+), 44 deletions(-) diff --git a/torch/csrc/autograd/profiler.h b/torch/csrc/autograd/profiler.h index 1c0694e3c8a9..657a990019fa 100644 --- a/torch/csrc/autograd/profiler.h +++ b/torch/csrc/autograd/profiler.h @@ -511,9 +511,9 @@ struct TORCH_API ProfilerThreadLocalState : public c10::MemoryReportingInfoBase virtual void reportClientActivity( const at::RecordFunction& fn, - const at::ObserverContext& ctx) {} + const at::ObserverContext* ctx) {} - private: + protected: std::string getNvtxStr( const at::StringView& name, const char* msg, diff --git a/torch/csrc/autograd/profiler_kineto.cpp b/torch/csrc/autograd/profiler_kineto.cpp index 244f3a851340..b6d1acc49c92 100644 --- a/torch/csrc/autograd/profiler_kineto.cpp +++ b/torch/csrc/autograd/profiler_kineto.cpp @@ -15,48 +15,47 @@ uint64_t next_correlation_id() { return corr_id_++; } -struct KinetoObserverContext : public at::ObserverContext { - int64_t startUs; - uint64_t correlationId; - uint64_t startThreadId; - uint64_t endThreadId; - c10::optional>> shapes; - int64_t sequenceNr; - uint64_t fwdThreadId; - uint8_t recFunScope; - c10::optional> stack; -}; - -void reportKinetoClientActivity( - const at::RecordFunction& fn, - const KinetoObserverContext& ctx) { - /*TORCH_CHECK((config_.state == ProfilerState::KINETO, - "Supported only in Kineto profiler"); - op.startTime = ctx.startUs; - op.endTime = (getTime() / 1000); 
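-  // (Note: getTime() returns nanoseconds, so the division by 1000 here and
-  // for ctx.startUs produces microsecond timestamps, matching the "Us"
-  // suffix on the KinetoObserverContext fields.)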
- op.opType = std::string(fn.name().str()); - op.device = 0; // CPU - op.correlation = ctx.correlationId; - if (ctx.shapes && !ctx.shapes->empty()) { - //op.inputDims = toStr(*ctx.shapes); // - } - //op.threadId = pthread_self(); - - { - std::lock_guard guard(state_mutex_); - kineto_client_activities_.emplace_back(std::move(op)); - kineto_events_.emplace_back(); - kineto_events_.back() - .startThreadId(ctx.startThreadId) - .endThreadId(ctx.endThreadId) - .sequenceNr(ctx.sequenceNr) - .fwdThreadId(ctx.fwdThreadId) - .scope(ctx.recFunScope); - if (ctx.stack && !ctx.stack->empty()) { - kineto_events_.back().stack(*ctx.stack); +struct TORCH_API KinetoThreadLocalState : public ProfilerThreadLocalState { + using ProfilerThreadLocalState::ProfilerThreadLocalState; + virtual ~KinetoThreadLocalState() override = default; + + virtual void reportClientActivity( + const at::RecordFunction& fn, + const at::ObserverContext* observer_ctx) override { + auto ctx = dynamic_cast(observer_ctx); + TORCH_CHECK(ctx); + TORCH_CHECK(config_.state == ProfilerState::KINETO, + "Supported only in Kineto profiler"); + libkineto::ClientTraceActivity op; + op.startTime = ctx->startUs; + op.endTime = (getTime() / 1000); + op.opType = std::string(fn.name().str()); + op.device = 0; // CPU + op.correlation = ctx->correlationId; + if (ctx->shapes && !ctx->shapes->empty()) { + //op.inputDims = toStr(*ctx->shapes); // } - }*/ -} + //op.threadId = pthread_self(); + + { + std::lock_guard guard(state_mutex_); + kineto_client_activities_.emplace_back(std::move(op)); + kineto_events_.emplace_back(); + kineto_events_.back() + .startThreadId(ctx->startThreadId) + .endThreadId(ctx->endThreadId) + .sequenceNr(ctx->sequenceNr) + .fwdThreadId(ctx->fwdThreadId) + .scope(ctx->recFunScope); + if (ctx->stack && !ctx->stack->empty()) { + kineto_events_.back().stack(*ctx->stack); + } + } + } + + std::vector kineto_client_activities_; + std::vector kineto_events_; +}; ProfilerThreadLocalState* getProfilerTLSState() { const auto& state = c10::ThreadLocalDebugInfo::get(c10::DebugInfoKind::PROFILER_STATE); @@ -113,7 +112,7 @@ void pushProfilingCallbacks() { kineto_ctx_ptr->endThreadId = at::RecordFunction::currentThreadId(); - //state_ptr->reportKinetoClientActivity(fn, *kineto_ctx_ptr); + state_ptr->reportClientActivity(fn, kineto_ctx_ptr); libkineto::api().popCorrelationId(); }) .needsInputs(state_ptr->config().report_input_shapes) diff --git a/torch/csrc/autograd/profiler_kineto.h b/torch/csrc/autograd/profiler_kineto.h index 671a2dc5cc0e..090069ea00ee 100644 --- a/torch/csrc/autograd/profiler_kineto.h +++ b/torch/csrc/autograd/profiler_kineto.h @@ -22,6 +22,19 @@ enum class C10_API_ENUM ActivityType { }; #ifdef USE_KINETO + +struct KinetoObserverContext : public at::ObserverContext { + int64_t startUs; + uint64_t correlationId; + uint64_t startThreadId; + uint64_t endThreadId; + c10::optional>> shapes; + int64_t sequenceNr; + uint64_t fwdThreadId; + uint8_t recFunScope; + c10::optional> stack; +}; + struct TORCH_API KinetoEvent { uint64_t startThreadId() const { return start_thread_id_; From 6023998d80e2d6c13fb674614aaf6c8ab41e0479 Mon Sep 17 00:00:00 2001 From: ilia-cher Date: Tue, 3 Nov 2020 09:54:08 -0800 Subject: [PATCH 29/59] Update on "Use libkineto in profiler" Summary: Adding ability to use Kineto (CUPTI) to profile CUDA kernels Test Plan: USE_KINETO=1 USE_CUDA=1 USE_MKLDNN=1 BLAS=MKL BUILD_BINARY=1 python setup.py develop install python test/test_profiler.py ``` ------------------------------------------------------- ------------ 
------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ sgemm_32x32x32_NN 0.00% 0.000us 0.00% 0.000us 0.000us 12.000us 63.16% 12.000us 12.000us 1 void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 2.750us 14.47% 2.750us 2.750us 1 Memcpy HtoD (Pagable -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 2.250us 11.84% 2.250us 2.250us 1 Memcpy DtoH (Device -> Pagable) 0.00% 0.000us 0.00% 0.000us 0.000us 2.000us 10.53% 2.000us 2.000us 1 aten::mm 25.87% 364.400ms 25.87% 364.426ms 364.426ms 0.000us 0.00% 0.000us 0.000us 1 aten::empty 0.00% 39.585us 0.00% 39.585us 19.792us 0.000us 0.00% 0.000us 0.000us 2 aten::stride 0.00% 3.363us 0.00% 3.363us 1.121us 0.000us 0.00% 0.000us 0.000us 3 aten::add 74.12% 1.044s 74.12% 1.044s 1.044s 0.000us 0.00% 0.000us 0.000us 1 aten::to 0.00% 13.155us 0.01% 116.398us 116.398us 0.000us 0.00% 0.000us 0.000us 1 aten::empty_strided 0.00% 30.365us 0.00% 30.365us 30.365us 0.000us 0.00% 0.000us 0.000us 1 aten::copy_ 0.01% 72.878us 0.01% 72.878us 72.878us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ``` [ghstack-poisoned] --- torch/csrc/autograd/profiler_kineto.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/torch/csrc/autograd/profiler_kineto.cpp b/torch/csrc/autograd/profiler_kineto.cpp index b6d1acc49c92..1c8993218164 100644 --- a/torch/csrc/autograd/profiler_kineto.cpp +++ b/torch/csrc/autograd/profiler_kineto.cpp @@ -57,9 +57,9 @@ struct TORCH_API KinetoThreadLocalState : public ProfilerThreadLocalState { std::vector kineto_events_; }; -ProfilerThreadLocalState* getProfilerTLSState() { +KinetoThreadLocalState* getProfilerTLSState() { const auto& state = c10::ThreadLocalDebugInfo::get(c10::DebugInfoKind::PROFILER_STATE); - return dynamic_cast(state.get()); + return dynamic_cast(state.get()); } void pushProfilingCallbacks() { @@ -158,7 +158,7 @@ void enableProfiler( auto state_ptr = getProfilerTLSState(); TORCH_CHECK(!state_ptr, "Profiler is already enabled on this thread"); - auto state = std::make_shared(config); + auto state = std::make_shared(config); c10::ThreadLocalDebugInfo::_push(c10::DebugInfoKind::PROFILER_STATE, state); if (activities.count(ActivityType::CPU)) { @@ -176,7 +176,7 @@ ProfilerResult disableProfiler() { // all the DebugInfoBase objects are scope based and supposed to use DebugInfoGuard auto state = c10::ThreadLocalDebugInfo::_pop(c10::DebugInfoKind::PROFILER_STATE); - auto state_ptr = static_cast(state.get()); + auto state_ptr = static_cast(state.get()); TORCH_CHECK(state_ptr && state_ptr->config().state == ProfilerState::KINETO, "Can't disable Kineto profiler when it's not running"); From 0bc66a67af9771e4fbd55ccabdd9f7c0cde489c6 Mon Sep 17 00:00:00 2001 From: ilia-cher Date: Tue, 3 Nov 2020 10:08:16 -0800 Subject: [PATCH 30/59] Update on "Use libkineto in profiler" Summary: Adding ability to use Kineto (CUPTI) to profile CUDA kernels Test Plan: USE_KINETO=1 USE_CUDA=1 USE_MKLDNN=1 BLAS=MKL BUILD_BINARY=1 python setup.py develop install python 
test/test_profiler.py ``` ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ sgemm_32x32x32_NN 0.00% 0.000us 0.00% 0.000us 0.000us 12.000us 63.16% 12.000us 12.000us 1 void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 2.750us 14.47% 2.750us 2.750us 1 Memcpy HtoD (Pagable -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 2.250us 11.84% 2.250us 2.250us 1 Memcpy DtoH (Device -> Pagable) 0.00% 0.000us 0.00% 0.000us 0.000us 2.000us 10.53% 2.000us 2.000us 1 aten::mm 25.87% 364.400ms 25.87% 364.426ms 364.426ms 0.000us 0.00% 0.000us 0.000us 1 aten::empty 0.00% 39.585us 0.00% 39.585us 19.792us 0.000us 0.00% 0.000us 0.000us 2 aten::stride 0.00% 3.363us 0.00% 3.363us 1.121us 0.000us 0.00% 0.000us 0.000us 3 aten::add 74.12% 1.044s 74.12% 1.044s 1.044s 0.000us 0.00% 0.000us 0.000us 1 aten::to 0.00% 13.155us 0.01% 116.398us 116.398us 0.000us 0.00% 0.000us 0.000us 1 aten::empty_strided 0.00% 30.365us 0.00% 30.365us 30.365us 0.000us 0.00% 0.000us 0.000us 1 aten::copy_ 0.01% 72.878us 0.01% 72.878us 72.878us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ``` [ghstack-poisoned] --- torch/csrc/autograd/profiler_kineto.cpp | 11 +++++++++++ torch/csrc/autograd/profiler_kineto.h | 1 - 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/torch/csrc/autograd/profiler_kineto.cpp b/torch/csrc/autograd/profiler_kineto.cpp index 1c8993218164..74eba5bcd272 100644 --- a/torch/csrc/autograd/profiler_kineto.cpp +++ b/torch/csrc/autograd/profiler_kineto.cpp @@ -55,6 +55,8 @@ struct TORCH_API KinetoThreadLocalState : public ProfilerThreadLocalState { std::vector kineto_client_activities_; std::vector kineto_events_; + + std::unique_ptr cpu_trace; }; KinetoThreadLocalState* getProfilerTLSState() { @@ -161,6 +163,11 @@ void enableProfiler( auto state = std::make_shared(config); c10::ThreadLocalDebugInfo::_push(c10::DebugInfoKind::PROFILER_STATE, state); + state->cpu_trace = std::make_unique(); + state->cpu_trace->span.startTime = getTime() / 1000; + state->cpu_trace->gpuOpCount = -1; + state->cpu_trace->span.name = "PyTorch Profiler"; + if (activities.count(ActivityType::CPU)) { pushProfilingCallbacks(); } @@ -186,6 +193,10 @@ ProfilerResult disableProfiler() { state_ptr->mark("__stop_profile"); + state_ptr->cpu_trace->span.endTime = getTime() / 1000; + + libkineto::api().transferCpuTrace(std::move(state_ptr->cpu_trace)); + //auto trace = std::move(libkineto::api().stopTrace()); libkineto::api().stopTrace(); std::vector> kineto_events; // = filterTrace(trace); diff --git a/torch/csrc/autograd/profiler_kineto.h b/torch/csrc/autograd/profiler_kineto.h index 090069ea00ee..4f429afb72c3 100644 --- a/torch/csrc/autograd/profiler_kineto.h +++ b/torch/csrc/autograd/profiler_kineto.h @@ -145,7 +145,6 @@ struct TORCH_API KinetoEvent { return *this; } - private: uint64_t start_thread_id_ = 0; uint64_t end_thread_id_ = 0; uint64_t fwd_thread_id_ = 0; From 
aa2d09e753953777feda2a8100602c87e3c4d668 Mon Sep 17 00:00:00 2001 From: ilia-cher Date: Tue, 3 Nov 2020 10:28:13 -0800 Subject: [PATCH 31/59] Update on "Use libkineto in profiler" Summary: Adding ability to use Kineto (CUPTI) to profile CUDA kernels Test Plan: USE_KINETO=1 USE_CUDA=1 USE_MKLDNN=1 BLAS=MKL BUILD_BINARY=1 python setup.py develop install python test/test_profiler.py ``` ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ sgemm_32x32x32_NN 0.00% 0.000us 0.00% 0.000us 0.000us 12.000us 63.16% 12.000us 12.000us 1 void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 2.750us 14.47% 2.750us 2.750us 1 Memcpy HtoD (Pagable -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 2.250us 11.84% 2.250us 2.250us 1 Memcpy DtoH (Device -> Pagable) 0.00% 0.000us 0.00% 0.000us 0.000us 2.000us 10.53% 2.000us 2.000us 1 aten::mm 25.87% 364.400ms 25.87% 364.426ms 364.426ms 0.000us 0.00% 0.000us 0.000us 1 aten::empty 0.00% 39.585us 0.00% 39.585us 19.792us 0.000us 0.00% 0.000us 0.000us 2 aten::stride 0.00% 3.363us 0.00% 3.363us 1.121us 0.000us 0.00% 0.000us 0.000us 3 aten::add 74.12% 1.044s 74.12% 1.044s 1.044s 0.000us 0.00% 0.000us 0.000us 1 aten::to 0.00% 13.155us 0.01% 116.398us 116.398us 0.000us 0.00% 0.000us 0.000us 1 aten::empty_strided 0.00% 30.365us 0.00% 30.365us 30.365us 0.000us 0.00% 0.000us 0.000us 1 aten::copy_ 0.01% 72.878us 0.01% 72.878us 72.878us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ``` [ghstack-poisoned] --- torch/csrc/autograd/profiler_kineto.cpp | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/torch/csrc/autograd/profiler_kineto.cpp b/torch/csrc/autograd/profiler_kineto.cpp index 74eba5bcd272..f1108247b27d 100644 --- a/torch/csrc/autograd/profiler_kineto.cpp +++ b/torch/csrc/autograd/profiler_kineto.cpp @@ -39,7 +39,7 @@ struct TORCH_API KinetoThreadLocalState : public ProfilerThreadLocalState { { std::lock_guard guard(state_mutex_); - kineto_client_activities_.emplace_back(std::move(op)); + cpu_trace->ops.emplace_back(std::move(op)); kineto_events_.emplace_back(); kineto_events_.back() .startThreadId(ctx->startThreadId) @@ -53,7 +53,6 @@ struct TORCH_API KinetoThreadLocalState : public ProfilerThreadLocalState { } } - std::vector kineto_client_activities_; std::vector kineto_events_; std::unique_ptr cpu_trace; @@ -197,7 +196,7 @@ ProfilerResult disableProfiler() { libkineto::api().transferCpuTrace(std::move(state_ptr->cpu_trace)); - //auto trace = std::move(libkineto::api().stopTrace()); + //auto trace = libkineto::api().stopTrace(); libkineto::api().stopTrace(); std::vector> kineto_events; // = filterTrace(trace); auto legacy_events = state_ptr->consolidate(); From 1556a7c21ebd795e323352805534ff7af76f170e Mon Sep 17 00:00:00 2001 From: ilia-cher Date: Tue, 3 Nov 2020 12:22:01 -0800 Subject: [PATCH 32/59] Update on "Use libkineto in profiler" Summary: Adding ability to use Kineto (CUPTI) to profile CUDA kernels Test 
From 1556a7c21ebd795e323352805534ff7af76f170e Mon Sep 17 00:00:00 2001
From: ilia-cher
Date: Tue, 3 Nov 2020 12:22:01 -0800
Subject: [PATCH 32/59] Update on "Use libkineto in profiler"

Summary:
Adding ability to use Kineto (CUPTI) to profile CUDA kernels

Test Plan:
USE_KINETO=1 USE_CUDA=1 USE_MKLDNN=1 BLAS=MKL BUILD_BINARY=1 python setup.py develop install
python test/test_profiler.py

[ghstack-poisoned]
---
 tools/build_variables.bzl                                 | 2 +-
 torch/csrc/autograd/{profiler.cpp => profiler_legacy.cpp} | 0
 2 files changed, 1 insertion(+), 1 deletion(-)
 rename torch/csrc/autograd/{profiler.cpp => profiler_legacy.cpp} (100%)

diff --git a/tools/build_variables.bzl b/tools/build_variables.bzl
index fd04c1326263..08a57d0549a9 100644
--- a/tools/build_variables.bzl
+++ b/tools/build_variables.bzl
@@ -74,7 +74,7 @@ jit_core_sources = [
 # list for the shared files.
core_sources_common = [ - "torch/csrc/autograd/profiler.cpp", + "torch/csrc/autograd/profiler_legacy.cpp", "torch/csrc/autograd/profiler_kineto.cpp", "torch/csrc/jit/frontend/edit_distance.cpp", "torch/csrc/jit/frontend/string_to_type.cpp", diff --git a/torch/csrc/autograd/profiler.cpp b/torch/csrc/autograd/profiler_legacy.cpp similarity index 100% rename from torch/csrc/autograd/profiler.cpp rename to torch/csrc/autograd/profiler_legacy.cpp From 4a0fec9d9a96c607ad54eaf80c63a15be3ff7071 Mon Sep 17 00:00:00 2001 From: ilia-cher Date: Tue, 3 Nov 2020 12:33:47 -0800 Subject: [PATCH 33/59] Update on "Use libkineto in profiler" Summary: Adding ability to use Kineto (CUPTI) to profile CUDA kernels Test Plan: USE_KINETO=1 USE_CUDA=1 USE_MKLDNN=1 BLAS=MKL BUILD_BINARY=1 python setup.py develop install python test/test_profiler.py ``` ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ sgemm_32x32x32_NN 0.00% 0.000us 0.00% 0.000us 0.000us 12.000us 63.16% 12.000us 12.000us 1 void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 2.750us 14.47% 2.750us 2.750us 1 Memcpy HtoD (Pagable -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 2.250us 11.84% 2.250us 2.250us 1 Memcpy DtoH (Device -> Pagable) 0.00% 0.000us 0.00% 0.000us 0.000us 2.000us 10.53% 2.000us 2.000us 1 aten::mm 25.87% 364.400ms 25.87% 364.426ms 364.426ms 0.000us 0.00% 0.000us 0.000us 1 aten::empty 0.00% 39.585us 0.00% 39.585us 19.792us 0.000us 0.00% 0.000us 0.000us 2 aten::stride 0.00% 3.363us 0.00% 3.363us 1.121us 0.000us 0.00% 0.000us 0.000us 3 aten::add 74.12% 1.044s 74.12% 1.044s 1.044s 0.000us 0.00% 0.000us 0.000us 1 aten::to 0.00% 13.155us 0.01% 116.398us 116.398us 0.000us 0.00% 0.000us 0.000us 1 aten::empty_strided 0.00% 30.365us 0.00% 30.365us 30.365us 0.000us 0.00% 0.000us 0.000us 1 aten::copy_ 0.01% 72.878us 0.01% 72.878us 72.878us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ``` [ghstack-poisoned] --- torch/csrc/autograd/profiler.h | 536 +------------------------- torch/csrc/autograd/profiler_kineto.h | 2 +- torch/csrc/autograd/profiler_legacy.h | 536 ++++++++++++++++++++++++++ 3 files changed, 539 insertions(+), 535 deletions(-) create mode 100644 torch/csrc/autograd/profiler_legacy.h diff --git a/torch/csrc/autograd/profiler.h b/torch/csrc/autograd/profiler.h index 657a990019fa..7ac44096cda7 100644 --- a/torch/csrc/autograd/profiler.h +++ b/torch/csrc/autograd/profiler.h @@ -1,536 +1,4 @@ #pragma once -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#ifndef _WIN32 -#include -#endif -#if defined(C10_IOS) && defined(C10_MOBILE) -#include // for gettimeofday() -#endif - -#include - -#include - -struct CUevent_st; -typedef std::shared_ptr CUDAEventStub; - -namespace torch { namespace autograd { - -struct Node; - -namespace profiler { - -struct TORCH_API CUDAStubs { - virtual void record(int* device, CUDAEventStub* event, int64_t* cpu_ns) 
{ - fail(); - } - virtual float elapsed(const CUDAEventStub* event, const CUDAEventStub* event2) { - fail(); - return 0.f; - } - virtual void nvtxMarkA(const char* name) { - fail(); - } - virtual void nvtxRangePushA(const char* name) { - fail(); - } - virtual void nvtxRangePop() { - fail(); - } - virtual bool enabled() { - return false; - } - virtual void onEachDevice(std::function op) { - fail(); - } - virtual void synchronize() { - fail(); - } - virtual ~CUDAStubs(); - -private: - void fail() { - AT_ERROR("CUDA used in profiler but not enabled."); - } -}; - -TORCH_API void registerCUDAMethods(CUDAStubs* stubs); - -constexpr inline size_t ceilToMultiple(size_t a, size_t b) { - return ((a + b - 1) / b) * b; -} - -inline int64_t getTime() { -#if defined(C10_IOS) && defined(C10_MOBILE) -// clock_gettime is only available on iOS 10.0 or newer. Unlike OS X, iOS can't rely on -// CLOCK_REALTIME, as it is defined no matter if clock_gettime is implemented or not - struct timeval now; - gettimeofday(&now, NULL); - return static_cast(now.tv_sec) * 1000000000 + static_cast(now.tv_usec) * 1000; -#elif defined(_WIN32) || defined(__MACH__) - using namespace std::chrono; - using clock = std::conditional::type; - return duration_cast(clock::now().time_since_epoch()).count(); -#else - // clock_gettime is *much* faster than std::chrono implementation on Linux - struct timespec t{}; - clock_gettime(CLOCK_MONOTONIC, &t); - return static_cast(t.tv_sec) * 1000000000 + static_cast(t.tv_nsec); -#endif -} - -enum class C10_API_ENUM EventKind : uint16_t { - Mark, - PushRange, - PopRange, - MemoryAlloc, -}; - -// To be deprecated, once we switch to Kineto profiling -struct TORCH_API LegacyEvent { - LegacyEvent( - EventKind kind, - at::StringView name, - uint16_t thread_id, - bool record_cuda, - at::RecordFunctionHandle handle = 0, - std::vector>&& shapes = {}, - int node_id = -1) - : name_(std::move(name)), - kind_(kind), - thread_id_(thread_id), - handle_(handle), - shapes_(shapes), - node_id_(node_id) { - record(record_cuda); - } - - // Constructor to be used in conjunction with LegacyEvent::fromIValue. - LegacyEvent( - EventKind kind, - at::StringView name, - uint16_t thread_id, - at::RecordFunctionHandle handle, - std::vector>&& shapes, - int node_id, - bool is_remote, - int64_t cpu_memory_usage, - int64_t cpu_ns, - bool cuda_recorded, - int64_t cuda_memory_usage = 0, - int device = -1, - double cuda_us = -1) - : cpu_ns_(cpu_ns), - name_(std::move(name)), - kind_(kind), - thread_id_(thread_id), - handle_(handle), - shapes_(shapes), - cpu_memory_usage_(cpu_memory_usage), - cuda_memory_usage_(cuda_memory_usage), - device_(device), - node_id_(node_id), - is_remote_(is_remote), - cuda_us_(cuda_us) { - // Sanity check values that were deserialized - TORCH_INTERNAL_ASSERT(cpu_ns_ > 0); - if (cuda_recorded) { - TORCH_INTERNAL_ASSERT(device_ >= 0); - TORCH_INTERNAL_ASSERT(cuda_us_ >= 0); - } - } - - // Returns IValues corresponding to event structure, to be used for - // serialization. - at::IValue toIValue() const; - - // Reconstructs an event from IValues given by toIValue. 
- static LegacyEvent fromIValue(const at::IValue& eventIValue); - - void record(bool record_cuda); - - std::string kindStr() const { - switch (kind_) { - case EventKind::Mark: return "mark"; - case EventKind::PushRange: return "push"; - case EventKind::PopRange: return "pop"; - case EventKind::MemoryAlloc: return "memory_alloc"; - } - throw std::runtime_error("unknown event kind"); - } - - const char* name() const { - return name_.str(); - } - - uint64_t threadId() const { - return thread_id_; - } - - std::vector> shapes() const { - return shapes_; - } - - double cpuElapsedUs(const LegacyEvent& e) const { - return (e.cpu_ns_ - cpu_ns_)/(1000.0); - } - - double cpuUs() const { - return cpu_ns_ / (1000.0); - } - - double cudaElapsedUs(const LegacyEvent& e) const; - - bool hasCuda() const { - return cuda_event != nullptr || (isRemote() && device_ != -1); - } - - int device() const { - return device_; - } - - void updateMemoryStats(int64_t alloc_size, c10::Device device) { - if (device.type() == c10::DeviceType::CUDA || - device.type() == c10::DeviceType::HIP) { - cuda_memory_usage_ = alloc_size; - } else if (device.type() == c10::DeviceType::CPU || - device.type() == c10::DeviceType::MKLDNN || - device.type() == c10::DeviceType::IDEEP) { - cpu_memory_usage_ = alloc_size; - } else { - LOG(WARNING) << "Unsupported memory profiling device: " << device; - } - } - - int64_t cpuMemoryUsage() const { - return cpu_memory_usage_; - } - - int64_t cudaMemoryUsage() const { - return cuda_memory_usage_; - } - - at::RecordFunctionHandle handle() const { - return handle_; - } - - // Node ID corresponding to this event. - int nodeId( ) const { - return node_id_; - } - - // Set Node ID on this event. - void setNodeId(int node_id) { - node_id_ = node_id; - } - - void setName(at::StringView newName_) { - name_ = std::move(newName_); - } - - bool isRemote() const { - return is_remote_; - } - - void setCudaUs(int64_t cuda_us) { - cuda_us_ = cuda_us; - } - - void setSequenceNr(int64_t sequence_nr) { - sequence_nr_ = sequence_nr; - } - - int64_t sequenceNr() const { - return sequence_nr_; - } - - void setCorrelationId(uint64_t correlation_id) { - correlation_id_ = correlation_id; - } - - uint64_t correlationId() const { - return correlation_id_; - } - - const std::vector& stack() const { - return stack_; - } - - void setStack(const std::vector& stack) { - stack_ = stack; - } - - uint64_t fwdThreadId() const { - return fwd_thread_id_; - } - - void setFwdThreadId(uint64_t fwd_thread_id) { - fwd_thread_id_ = fwd_thread_id; - } - - uint8_t scope() const { - return scope_; - } - - void setScope(uint8_t scope) { - scope_ = scope; - } - - private: - // signed to allow for negative intervals, initialized for safety. - int64_t cpu_ns_ = 0; - at::StringView name_; - EventKind kind_; - uint64_t thread_id_; - uint64_t fwd_thread_id_; - at::RecordFunctionHandle handle_ {0}; - std::vector> shapes_; - int64_t cpu_memory_usage_ = 0; - int64_t cuda_memory_usage_ = 0; - int device_ = -1; - CUDAEventStub cuda_event = nullptr; - int node_id_ = 0; - bool is_remote_ = false; - int64_t cuda_us_ = -1; - int64_t sequence_nr_ = -1; - - std::vector stack_; - uint8_t scope_; - uint64_t correlation_id_; -}; - -// a linked-list of fixed sized vectors, to avoid -// a std::vector resize from taking a large amount of time inside -// a profiling event -struct RangeEventList { - RangeEventList() { - events_.reserve(kReservedCapacity); - } - - template - void record(Args&&... 
args) { - std::lock_guard guard(mutex_); - events_.emplace_back(std::forward(args)...); - } - - std::vector consolidate() { - std::lock_guard lock(mutex_); - std::vector result; - result.insert( - result.begin(), - std::make_move_iterator(events_.begin()), - std::make_move_iterator(events_.end())); - events_.erase(events_.begin(), events_.end()); - return result; - } - - size_t size() { - std::lock_guard lock(mutex_); - return events_.size(); - } - - private: - // This mutex is used to serialize access when different threads are writing - // to the same instance of RangeEventList. - std::mutex mutex_; - std::vector events_; - - static const size_t kReservedCapacity = 1024; -}; - -enum class C10_API_ENUM ProfilerState { - Disabled = 0, - CPU, // CPU-only profiling - CUDA, // CPU + CUDA events - NVTX, // only emit NVTX markers - KINETO, // use libkineto - NUM_PROFILER_STATES, // must be the last one -}; - -struct TORCH_API ProfilerConfig { - ProfilerConfig( - ProfilerState state, - bool report_input_shapes = false, - bool profile_memory = false, - bool with_stack = false) - : state(state), - report_input_shapes(report_input_shapes), - profile_memory(profile_memory), - with_stack(with_stack) {} - ~ProfilerConfig(); - ProfilerState state; - bool report_input_shapes; - bool profile_memory; - bool with_stack; - - // Returns IValues corresponding to ProfilerConfig struct, to be used for - // serialization. - at::IValue toIValue() const; - - // Reconstructs a ProfilerConfig from IValues given by toIValue. - static ProfilerConfig fromIValue(const at::IValue& profilerConfigIValue); -}; - -// A struct to control settings of disableProfiler options. -struct TORCH_API ProfilerDisableOptions { - ProfilerDisableOptions() = default; - ProfilerDisableOptions(bool shouldCleanupTLSState, bool shouldConsolidate) - : cleanupTLSState(shouldCleanupTLSState), - consolidate(shouldConsolidate) {} - // Whether we should clean up profiler states that are thread local, such as - // ThreadLocalDebugInfo and thread local RecordFunction callbacks. - bool cleanupTLSState = true; - // Whether we should consolidate all currently recorded profiled events. If - // false, will not consolidate and other threads can continue to write to the - // event lists. - bool consolidate = true; -}; - -// NOTE: profiler mode is thread local, with automatic propagation -// across thread boundary (e.g. at::launch tasks) -TORCH_API void enableProfilerLegacy(const ProfilerConfig&); -using thread_event_lists = std::vector>; -TORCH_API thread_event_lists disableProfilerLegacy(c10::optional profilerDisableOptions = c10::nullopt); - -// adds profiledEvents to the current thread local recorded events. Each event -// will be marked with node ID given by fromNodeId. -TORCH_API void addEventList(std::vector&& profiledEvents); -// Returns if the profiler is currently enabled in the current thread. -TORCH_API bool profilerEnabled(); -// Retrieve the thread_local ProfilerConfig. -TORCH_API ProfilerConfig getProfilerConfig(); -// Writes profiled events to a stream. 
-TORCH_API void writeProfilerEventsToStream(std::ostream& out, const std::vector& events); - -// Usage: -// { -// RecordProfile guard("filename.trace"); -// // code you want to profile -// } -// Then open filename.trace in chrome://tracing -struct TORCH_API RecordProfile { - RecordProfile(std::ostream& out); - RecordProfile(const std::string& filename); - - ~RecordProfile(); -private: - void init(); - std::unique_ptr file_; - std::ostream& out_; - void processEvents(const std::vector& events); -}; - -// A guard that enables the profiler, taking in an optional callback to process -// the results -// Usage: -// { -// TLSProfilerGuard g([](thread_event_lists profilerResults) { -// // process profilerResults -// }); -// Code to profile -// } -struct TORCH_API TLSProfilerGuard { - explicit TLSProfilerGuard( - const ProfilerConfig& cfg, - c10::optional> - resultCallback = c10::nullopt, - c10::optional profilerDisableOptions = - c10::nullopt) - : cb_(std::move(resultCallback)), - profilerDisableOptions_(std::move(profilerDisableOptions)) { - enableProfilerLegacy(cfg); - } - ~TLSProfilerGuard() { - thread_event_lists event_lists = disableProfilerLegacy(profilerDisableOptions_); - if (cb_) { - try { - (*cb_)(event_lists); - } catch (const std::exception& e) { - LOG(ERROR) << "Got error processing profiler events: " << e.what(); - } - } - } - - private: - c10::optional> cb_; - const c10::optional profilerDisableOptions_; -}; - -struct TORCH_API FileLineFunc { - std::string filename; - size_t line; - std::string funcname; -}; -TORCH_API std::vector prepareCallstack(const std::vector& cs); -TORCH_API std::vector callstackStr(const std::vector& cs); -TORCH_API std::vector> inputSizes(const at::RecordFunction& fn); - -struct TORCH_API ProfilerThreadLocalState : public c10::MemoryReportingInfoBase { - explicit ProfilerThreadLocalState(const ProfilerConfig& config) - : config_(config), remoteProfiledEvents_{c10::nullopt} {} - ~ProfilerThreadLocalState() override = default; - - const ProfilerConfig& config() const; - - thread_event_lists consolidate(); - - void mark(std::string name, bool include_cuda = true); - - void setOrAddRemoteProfiledEvents( - std::vector&& remoteProfiledEvents); - - void pushRange( - const at::RecordFunction& fn, - const bool record_cuda, - const char* msg = "", - std::vector>&& shapes = {}); - - void popRange(const at::RecordFunction& fn, const bool record_cuda); - - void setCallbackHandle(at::CallbackHandle handle); - - at::CallbackHandle callbackHandle() const; - - void reportMemoryUsage( - void* /* unused */, - int64_t alloc_size, - c10::Device device) override; - - bool memoryProfilingEnabled() const override; - - virtual void reportClientActivity( - const at::RecordFunction& fn, - const at::ObserverContext* ctx) {} - - protected: - std::string getNvtxStr( - const at::StringView& name, - const char* msg, - int64_t sequence_nr, - const std::vector>& shapes) const; - - RangeEventList& getEventList(int64_t thread_id = -1); - - std::mutex state_mutex_; - std::unordered_map> - event_lists_map_; - - ProfilerConfig config_ = ProfilerConfig(ProfilerState::Disabled); - at::CallbackHandle handle_ = 0; - c10::optional>> remoteProfiledEvents_; -}; - - -} // namespace profiler -}} // namespace torch::autograd +#include +#include diff --git a/torch/csrc/autograd/profiler_kineto.h b/torch/csrc/autograd/profiler_kineto.h index 4f429afb72c3..f4e5582fd6fe 100644 --- a/torch/csrc/autograd/profiler_kineto.h +++ b/torch/csrc/autograd/profiler_kineto.h @@ -1,6 +1,6 @@ #pragma once -#include 
+#include #include diff --git a/torch/csrc/autograd/profiler_legacy.h b/torch/csrc/autograd/profiler_legacy.h new file mode 100644 index 000000000000..657a990019fa --- /dev/null +++ b/torch/csrc/autograd/profiler_legacy.h @@ -0,0 +1,536 @@ +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#ifndef _WIN32 +#include +#endif +#if defined(C10_IOS) && defined(C10_MOBILE) +#include // for gettimeofday() +#endif + +#include + +#include + +struct CUevent_st; +typedef std::shared_ptr CUDAEventStub; + +namespace torch { namespace autograd { + +struct Node; + +namespace profiler { + +struct TORCH_API CUDAStubs { + virtual void record(int* device, CUDAEventStub* event, int64_t* cpu_ns) { + fail(); + } + virtual float elapsed(const CUDAEventStub* event, const CUDAEventStub* event2) { + fail(); + return 0.f; + } + virtual void nvtxMarkA(const char* name) { + fail(); + } + virtual void nvtxRangePushA(const char* name) { + fail(); + } + virtual void nvtxRangePop() { + fail(); + } + virtual bool enabled() { + return false; + } + virtual void onEachDevice(std::function op) { + fail(); + } + virtual void synchronize() { + fail(); + } + virtual ~CUDAStubs(); + +private: + void fail() { + AT_ERROR("CUDA used in profiler but not enabled."); + } +}; + +TORCH_API void registerCUDAMethods(CUDAStubs* stubs); + +constexpr inline size_t ceilToMultiple(size_t a, size_t b) { + return ((a + b - 1) / b) * b; +} + +inline int64_t getTime() { +#if defined(C10_IOS) && defined(C10_MOBILE) +// clock_gettime is only available on iOS 10.0 or newer. Unlike OS X, iOS can't rely on +// CLOCK_REALTIME, as it is defined no matter if clock_gettime is implemented or not + struct timeval now; + gettimeofday(&now, NULL); + return static_cast(now.tv_sec) * 1000000000 + static_cast(now.tv_usec) * 1000; +#elif defined(_WIN32) || defined(__MACH__) + using namespace std::chrono; + using clock = std::conditional::type; + return duration_cast(clock::now().time_since_epoch()).count(); +#else + // clock_gettime is *much* faster than std::chrono implementation on Linux + struct timespec t{}; + clock_gettime(CLOCK_MONOTONIC, &t); + return static_cast(t.tv_sec) * 1000000000 + static_cast(t.tv_nsec); +#endif +} + +enum class C10_API_ENUM EventKind : uint16_t { + Mark, + PushRange, + PopRange, + MemoryAlloc, +}; + +// To be deprecated, once we switch to Kineto profiling +struct TORCH_API LegacyEvent { + LegacyEvent( + EventKind kind, + at::StringView name, + uint16_t thread_id, + bool record_cuda, + at::RecordFunctionHandle handle = 0, + std::vector>&& shapes = {}, + int node_id = -1) + : name_(std::move(name)), + kind_(kind), + thread_id_(thread_id), + handle_(handle), + shapes_(shapes), + node_id_(node_id) { + record(record_cuda); + } + + // Constructor to be used in conjunction with LegacyEvent::fromIValue. 
+ LegacyEvent( + EventKind kind, + at::StringView name, + uint16_t thread_id, + at::RecordFunctionHandle handle, + std::vector>&& shapes, + int node_id, + bool is_remote, + int64_t cpu_memory_usage, + int64_t cpu_ns, + bool cuda_recorded, + int64_t cuda_memory_usage = 0, + int device = -1, + double cuda_us = -1) + : cpu_ns_(cpu_ns), + name_(std::move(name)), + kind_(kind), + thread_id_(thread_id), + handle_(handle), + shapes_(shapes), + cpu_memory_usage_(cpu_memory_usage), + cuda_memory_usage_(cuda_memory_usage), + device_(device), + node_id_(node_id), + is_remote_(is_remote), + cuda_us_(cuda_us) { + // Sanity check values that were deserialized + TORCH_INTERNAL_ASSERT(cpu_ns_ > 0); + if (cuda_recorded) { + TORCH_INTERNAL_ASSERT(device_ >= 0); + TORCH_INTERNAL_ASSERT(cuda_us_ >= 0); + } + } + + // Returns IValues corresponding to event structure, to be used for + // serialization. + at::IValue toIValue() const; + + // Reconstructs an event from IValues given by toIValue. + static LegacyEvent fromIValue(const at::IValue& eventIValue); + + void record(bool record_cuda); + + std::string kindStr() const { + switch (kind_) { + case EventKind::Mark: return "mark"; + case EventKind::PushRange: return "push"; + case EventKind::PopRange: return "pop"; + case EventKind::MemoryAlloc: return "memory_alloc"; + } + throw std::runtime_error("unknown event kind"); + } + + const char* name() const { + return name_.str(); + } + + uint64_t threadId() const { + return thread_id_; + } + + std::vector> shapes() const { + return shapes_; + } + + double cpuElapsedUs(const LegacyEvent& e) const { + return (e.cpu_ns_ - cpu_ns_)/(1000.0); + } + + double cpuUs() const { + return cpu_ns_ / (1000.0); + } + + double cudaElapsedUs(const LegacyEvent& e) const; + + bool hasCuda() const { + return cuda_event != nullptr || (isRemote() && device_ != -1); + } + + int device() const { + return device_; + } + + void updateMemoryStats(int64_t alloc_size, c10::Device device) { + if (device.type() == c10::DeviceType::CUDA || + device.type() == c10::DeviceType::HIP) { + cuda_memory_usage_ = alloc_size; + } else if (device.type() == c10::DeviceType::CPU || + device.type() == c10::DeviceType::MKLDNN || + device.type() == c10::DeviceType::IDEEP) { + cpu_memory_usage_ = alloc_size; + } else { + LOG(WARNING) << "Unsupported memory profiling device: " << device; + } + } + + int64_t cpuMemoryUsage() const { + return cpu_memory_usage_; + } + + int64_t cudaMemoryUsage() const { + return cuda_memory_usage_; + } + + at::RecordFunctionHandle handle() const { + return handle_; + } + + // Node ID corresponding to this event. + int nodeId( ) const { + return node_id_; + } + + // Set Node ID on this event. 
+ void setNodeId(int node_id) { + node_id_ = node_id; + } + + void setName(at::StringView newName_) { + name_ = std::move(newName_); + } + + bool isRemote() const { + return is_remote_; + } + + void setCudaUs(int64_t cuda_us) { + cuda_us_ = cuda_us; + } + + void setSequenceNr(int64_t sequence_nr) { + sequence_nr_ = sequence_nr; + } + + int64_t sequenceNr() const { + return sequence_nr_; + } + + void setCorrelationId(uint64_t correlation_id) { + correlation_id_ = correlation_id; + } + + uint64_t correlationId() const { + return correlation_id_; + } + + const std::vector& stack() const { + return stack_; + } + + void setStack(const std::vector& stack) { + stack_ = stack; + } + + uint64_t fwdThreadId() const { + return fwd_thread_id_; + } + + void setFwdThreadId(uint64_t fwd_thread_id) { + fwd_thread_id_ = fwd_thread_id; + } + + uint8_t scope() const { + return scope_; + } + + void setScope(uint8_t scope) { + scope_ = scope; + } + + private: + // signed to allow for negative intervals, initialized for safety. + int64_t cpu_ns_ = 0; + at::StringView name_; + EventKind kind_; + uint64_t thread_id_; + uint64_t fwd_thread_id_; + at::RecordFunctionHandle handle_ {0}; + std::vector> shapes_; + int64_t cpu_memory_usage_ = 0; + int64_t cuda_memory_usage_ = 0; + int device_ = -1; + CUDAEventStub cuda_event = nullptr; + int node_id_ = 0; + bool is_remote_ = false; + int64_t cuda_us_ = -1; + int64_t sequence_nr_ = -1; + + std::vector stack_; + uint8_t scope_; + uint64_t correlation_id_; +}; + +// a linked-list of fixed sized vectors, to avoid +// a std::vector resize from taking a large amount of time inside +// a profiling event +struct RangeEventList { + RangeEventList() { + events_.reserve(kReservedCapacity); + } + + template + void record(Args&&... args) { + std::lock_guard guard(mutex_); + events_.emplace_back(std::forward(args)...); + } + + std::vector consolidate() { + std::lock_guard lock(mutex_); + std::vector result; + result.insert( + result.begin(), + std::make_move_iterator(events_.begin()), + std::make_move_iterator(events_.end())); + events_.erase(events_.begin(), events_.end()); + return result; + } + + size_t size() { + std::lock_guard lock(mutex_); + return events_.size(); + } + + private: + // This mutex is used to serialize access when different threads are writing + // to the same instance of RangeEventList. + std::mutex mutex_; + std::vector events_; + + static const size_t kReservedCapacity = 1024; +}; + +enum class C10_API_ENUM ProfilerState { + Disabled = 0, + CPU, // CPU-only profiling + CUDA, // CPU + CUDA events + NVTX, // only emit NVTX markers + KINETO, // use libkineto + NUM_PROFILER_STATES, // must be the last one +}; + +struct TORCH_API ProfilerConfig { + ProfilerConfig( + ProfilerState state, + bool report_input_shapes = false, + bool profile_memory = false, + bool with_stack = false) + : state(state), + report_input_shapes(report_input_shapes), + profile_memory(profile_memory), + with_stack(with_stack) {} + ~ProfilerConfig(); + ProfilerState state; + bool report_input_shapes; + bool profile_memory; + bool with_stack; + + // Returns IValues corresponding to ProfilerConfig struct, to be used for + // serialization. + at::IValue toIValue() const; + + // Reconstructs a ProfilerConfig from IValues given by toIValue. + static ProfilerConfig fromIValue(const at::IValue& profilerConfigIValue); +}; + +// A struct to control settings of disableProfiler options. 
+struct TORCH_API ProfilerDisableOptions { + ProfilerDisableOptions() = default; + ProfilerDisableOptions(bool shouldCleanupTLSState, bool shouldConsolidate) + : cleanupTLSState(shouldCleanupTLSState), + consolidate(shouldConsolidate) {} + // Whether we should clean up profiler states that are thread local, such as + // ThreadLocalDebugInfo and thread local RecordFunction callbacks. + bool cleanupTLSState = true; + // Whether we should consolidate all currently recorded profiled events. If + // false, will not consolidate and other threads can continue to write to the + // event lists. + bool consolidate = true; +}; + +// NOTE: profiler mode is thread local, with automatic propagation +// across thread boundary (e.g. at::launch tasks) +TORCH_API void enableProfilerLegacy(const ProfilerConfig&); +using thread_event_lists = std::vector>; +TORCH_API thread_event_lists disableProfilerLegacy(c10::optional profilerDisableOptions = c10::nullopt); + +// adds profiledEvents to the current thread local recorded events. Each event +// will be marked with node ID given by fromNodeId. +TORCH_API void addEventList(std::vector&& profiledEvents); +// Returns if the profiler is currently enabled in the current thread. +TORCH_API bool profilerEnabled(); +// Retrieve the thread_local ProfilerConfig. +TORCH_API ProfilerConfig getProfilerConfig(); +// Writes profiled events to a stream. +TORCH_API void writeProfilerEventsToStream(std::ostream& out, const std::vector& events); + +// Usage: +// { +// RecordProfile guard("filename.trace"); +// // code you want to profile +// } +// Then open filename.trace in chrome://tracing +struct TORCH_API RecordProfile { + RecordProfile(std::ostream& out); + RecordProfile(const std::string& filename); + + ~RecordProfile(); +private: + void init(); + std::unique_ptr file_; + std::ostream& out_; + void processEvents(const std::vector& events); +}; + +// A guard that enables the profiler, taking in an optional callback to process +// the results +// Usage: +// { +// TLSProfilerGuard g([](thread_event_lists profilerResults) { +// // process profilerResults +// }); +// Code to profile +// } +struct TORCH_API TLSProfilerGuard { + explicit TLSProfilerGuard( + const ProfilerConfig& cfg, + c10::optional> + resultCallback = c10::nullopt, + c10::optional profilerDisableOptions = + c10::nullopt) + : cb_(std::move(resultCallback)), + profilerDisableOptions_(std::move(profilerDisableOptions)) { + enableProfilerLegacy(cfg); + } + ~TLSProfilerGuard() { + thread_event_lists event_lists = disableProfilerLegacy(profilerDisableOptions_); + if (cb_) { + try { + (*cb_)(event_lists); + } catch (const std::exception& e) { + LOG(ERROR) << "Got error processing profiler events: " << e.what(); + } + } + } + + private: + c10::optional> cb_; + const c10::optional profilerDisableOptions_; +}; + +struct TORCH_API FileLineFunc { + std::string filename; + size_t line; + std::string funcname; +}; +TORCH_API std::vector prepareCallstack(const std::vector& cs); +TORCH_API std::vector callstackStr(const std::vector& cs); +TORCH_API std::vector> inputSizes(const at::RecordFunction& fn); + +struct TORCH_API ProfilerThreadLocalState : public c10::MemoryReportingInfoBase { + explicit ProfilerThreadLocalState(const ProfilerConfig& config) + : config_(config), remoteProfiledEvents_{c10::nullopt} {} + ~ProfilerThreadLocalState() override = default; + + const ProfilerConfig& config() const; + + thread_event_lists consolidate(); + + void mark(std::string name, bool include_cuda = true); + + void 
setOrAddRemoteProfiledEvents( + std::vector&& remoteProfiledEvents); + + void pushRange( + const at::RecordFunction& fn, + const bool record_cuda, + const char* msg = "", + std::vector>&& shapes = {}); + + void popRange(const at::RecordFunction& fn, const bool record_cuda); + + void setCallbackHandle(at::CallbackHandle handle); + + at::CallbackHandle callbackHandle() const; + + void reportMemoryUsage( + void* /* unused */, + int64_t alloc_size, + c10::Device device) override; + + bool memoryProfilingEnabled() const override; + + virtual void reportClientActivity( + const at::RecordFunction& fn, + const at::ObserverContext* ctx) {} + + protected: + std::string getNvtxStr( + const at::StringView& name, + const char* msg, + int64_t sequence_nr, + const std::vector>& shapes) const; + + RangeEventList& getEventList(int64_t thread_id = -1); + + std::mutex state_mutex_; + std::unordered_map> + event_lists_map_; + + ProfilerConfig config_ = ProfilerConfig(ProfilerState::Disabled); + at::CallbackHandle handle_ = 0; + c10::optional>> remoteProfiledEvents_; +}; + + +} // namespace profiler +}} // namespace torch::autograd From bb6396a74b54a6babc6082dcafa00f0549af0beb Mon Sep 17 00:00:00 2001 From: ilia-cher Date: Tue, 3 Nov 2020 13:15:11 -0800 Subject: [PATCH 34/59] Update on "Use libkineto in profiler" Summary: Adding ability to use Kineto (CUPTI) to profile CUDA kernels Test Plan: USE_KINETO=1 USE_CUDA=1 USE_MKLDNN=1 BLAS=MKL BUILD_BINARY=1 python setup.py develop install python test/test_profiler.py ``` ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ sgemm_32x32x32_NN 0.00% 0.000us 0.00% 0.000us 0.000us 12.000us 63.16% 12.000us 12.000us 1 void at::native::vectorized_elementwise_kernel<4, at... 
0.00%       0.000us         0.00%       0.000us       0.000us       2.750us        14.47%       2.750us       2.750us             1
Memcpy HtoD (Pagable -> Device)                                  0.00%       0.000us         0.00%       0.000us       0.000us       2.250us        11.84%       2.250us       2.250us             1
Memcpy DtoH (Device -> Pagable)                                  0.00%       0.000us         0.00%       0.000us       0.000us       2.000us        10.53%       2.000us       2.000us             1
aten::mm                                                        25.87%     364.400ms        25.87%     364.426ms     364.426ms       0.000us         0.00%       0.000us       0.000us             1
aten::empty                                                      0.00%      39.585us         0.00%      39.585us      19.792us       0.000us         0.00%       0.000us       0.000us             2
aten::stride                                                     0.00%       3.363us         0.00%       3.363us       1.121us       0.000us         0.00%       0.000us       0.000us             3
aten::add                                                       74.12%        1.044s        74.12%        1.044s        1.044s       0.000us         0.00%       0.000us       0.000us             1
aten::to                                                         0.00%      13.155us         0.01%     116.398us     116.398us       0.000us         0.00%       0.000us       0.000us             1
aten::empty_strided                                              0.00%      30.365us         0.00%      30.365us      30.365us       0.000us         0.00%       0.000us       0.000us             1
aten::copy_                                                      0.01%      72.878us         0.01%      72.878us      72.878us       0.000us         0.00%       0.000us       0.000us             1
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------
```

[ghstack-poisoned]
---
 torch/csrc/autograd/init.cpp            |  2 +-
 torch/csrc/autograd/profiler_kineto.cpp | 37 +++++++++++++++++++++----
 torch/csrc/autograd/profiler_legacy.h   |  4 +++
 3 files changed, 37 insertions(+), 6 deletions(-)

diff --git a/torch/csrc/autograd/init.cpp b/torch/csrc/autograd/init.cpp
index 56fefb103c37..b844c4349fc6 100644
--- a/torch/csrc/autograd/init.cpp
+++ b/torch/csrc/autograd/init.cpp
@@ -4,7 +4,7 @@
 #include
 #include
 #include
-#include
+#include
 #include
 #include
diff --git a/torch/csrc/autograd/profiler_kineto.cpp b/torch/csrc/autograd/profiler_kineto.cpp
index d9864ec515c3..cbb52a06d446 100644
--- a/torch/csrc/autograd/profiler_kineto.cpp
+++ b/torch/csrc/autograd/profiler_kineto.cpp
@@ -3,18 +3,44 @@
 #include
 #include
 
+#include <sstream>
+
+#ifndef _WIN32
+#include <pthread.h>
+#endif
+
 #ifdef USE_KINETO
 #include "libkineto.h"
 
 namespace torch { namespace autograd { namespace profiler {
 namespace {
-// TODO: TLS
+// TODO: consider TLS
 std::atomic<uint64_t> corr_id_ {1};
 uint64_t next_correlation_id() {
   return corr_id_++;
 }
 
+std::string shapesToStr(const std::vector<std::vector<int64_t>>& shapes) {
+  std::ostringstream oss;
+  oss << "[";
+  for (auto t_idx = 0; t_idx < shapes.size(); ++t_idx) {
+    if (t_idx > 0) {
+      oss << ", ";
+    }
+    oss << "[";
+    for (auto s_idx = 0; s_idx < shapes[t_idx].size(); ++s_idx) {
+      if (s_idx > 0) {
+        oss << ", ";
+      }
+      oss << shapes[t_idx][s_idx];
+    }
+    oss << "]";
+  }
+  oss << "]";
+  return oss.str();
+}
+
 struct TORCH_API KinetoThreadLocalState : public ProfilerThreadLocalState {
   using ProfilerThreadLocalState::ProfilerThreadLocalState;
   virtual ~KinetoThreadLocalState() override = default;
@@ -28,15 +54,16 @@ struct TORCH_API KinetoThreadLocalState : public ProfilerThreadLocalState {
       "Supported only in Kineto profiler");
     libkineto::ClientTraceActivity op;
     op.startTime = ctx->startUs;
-    op.endTime = (getTime() / 1000);
+    op.endTime = getTimeUs();
     op.opType = std::string(fn.name().str());
     op.device = 0; // CPU
     op.correlation = ctx->correlationId;
     if (ctx->shapes && !ctx->shapes->empty()) {
-      //op.inputDims = toStr(*ctx->shapes); //
+      op.inputDims = shapesToStr(*ctx->shapes);
     }
-    //op.threadId = pthread_self();
-
+#ifndef _WIN32
+    op.threadId = pthread_self();
+#endif
     {
       std::lock_guard<std::mutex> guard(state_mutex_);
       cpu_trace->ops.emplace_back(std::move(op));
diff --git a/torch/csrc/autograd/profiler_legacy.h b/torch/csrc/autograd/profiler_legacy.h
index 657a990019fa..c3efd63a16c4 100644
--- a/torch/csrc/autograd/profiler_legacy.h
+++ b/torch/csrc/autograd/profiler_legacy.h
@@ -90,6 +90,10 @@ inline int64_t getTime() {
 #endif
 }
 
+inline int64_t getTimeUs() {
+  return getTime() / 1000;
+}
+
 enum class C10_API_ENUM EventKind : uint16_t {
   Mark,
   PushRange,
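getTimeUs() above is just getTime() / 1000, truncating the nanosecond clock to microseconds to match libkineto's timestamps. An equivalent self-contained helper using std::chrono (a sketch; the in-tree getTime() prefers clock_gettime(CLOCK_MONOTONIC) on Linux for speed):

```cpp
#include <chrono>
#include <cstdint>

// Microseconds from a monotonic clock; integer division truncates toward
// zero exactly like the getTime() / 1000 in the hunk above.
inline int64_t getTimeUsSketch() {
  using namespace std::chrono;
  return duration_cast<microseconds>(
             steady_clock::now().time_since_epoch())
      .count();
}
```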
From 60b5dee47a0dede585b266b3c9b00ddb84cc3a44 Mon Sep 17 00:00:00 2001
From: ilia-cher
Date: Tue, 3 Nov 2020 13:21:59 -0800
Subject: [PATCH 35/59] Update on "Use libkineto in profiler"

Summary:
Adding ability to use Kineto (CUPTI) to profile CUDA kernels

Test Plan:
USE_KINETO=1 USE_CUDA=1 USE_MKLDNN=1 BLAS=MKL BUILD_BINARY=1 python setup.py develop install
python test/test_profiler.py

[ghstack-poisoned]
---
 torch/csrc/autograd/profiler_kineto.cpp | 44 ++++++++++++++-----------
 1 file changed, 24 insertions(+), 20 deletions(-)

diff --git a/torch/csrc/autograd/profiler_kineto.cpp b/torch/csrc/autograd/profiler_kineto.cpp
index cbb52a06d446..530ea007eed7 100644
--- a/torch/csrc/autograd/profiler_kineto.cpp
+++ b/torch/csrc/autograd/profiler_kineto.cpp
@@ -11,35 +11,19 @@
 
 #ifdef USE_KINETO
 #include "libkineto.h"
+#endif
 
 namespace torch { namespace autograd { namespace profiler {
 
+#ifdef USE_KINETO
 namespace {
 // TODO: consider TLS
-std::atomic<uint64_t> corr_id_ {1};
 uint64_t next_correlation_id() {
+  static std::atomic<uint64_t> corr_id_ {1};
   return corr_id_++;
 }
 
-std::string shapesToStr(const std::vector<std::vector<int64_t>>& shapes) {
-  std::ostringstream oss;
-  oss << "[";
-  for (auto t_idx = 0; t_idx < shapes.size(); ++t_idx) {
-    if (t_idx > 0) {
-      oss << ", ";
-    }
-    oss << "[";
-    for (auto s_idx = 0; s_idx < shapes[t_idx].size(); ++s_idx) {
-      if (s_idx > 0) {
-        oss << ", ";
-      }
-      oss << shapes[t_idx][s_idx];
-    }
-    oss << "]";
-  }
-  oss << "]";
-  return oss.str();
-}
+std::string shapesToStr(const std::vector<std::vector<int64_t>>& shapes);
 
 struct TORCH_API KinetoThreadLocalState : public ProfilerThreadLocalState {
   using ProfilerThreadLocalState::ProfilerThreadLocalState;
@@ -148,6 +132,26 @@ void pushProfilingCallbacks() {
   state_ptr->setCallbackHandle(handle);
 }
 
+std::string shapesToStr(const std::vector<std::vector<int64_t>>& shapes) {
+  std::ostringstream oss;
+  oss << "[";
+  for (auto t_idx = 0; t_idx < shapes.size(); ++t_idx) {
+    if (t_idx > 0) {
+      oss << ", ";
+    }
+    oss << "[";
+    for (auto s_idx = 0; s_idx < shapes[t_idx].size(); ++s_idx) {
+      if (s_idx > 0) {
+        oss << ", ";
+      }
+      oss << shapes[t_idx][s_idx];
+    }
+    oss << "]";
+  }
+  oss << "]";
+  return oss.str();
+}
+
 } // namespace
 
 void prepareProfiler(
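shapesToStr(), whose definition the diff above moves below its callers, renders each op's input shapes as a nested list string. A standalone copy with a small driver (size_t indices substituted for the original's int-typed auto indices, which would draw a signed/unsigned comparison warning):

```cpp
#include <cstdint>
#include <iostream>
#include <sstream>
#include <string>
#include <vector>

std::string shapesToStr(const std::vector<std::vector<int64_t>>& shapes) {
  std::ostringstream oss;
  oss << "[";
  for (size_t t_idx = 0; t_idx < shapes.size(); ++t_idx) {
    if (t_idx > 0) {
      oss << ", ";
    }
    oss << "[";
    for (size_t s_idx = 0; s_idx < shapes[t_idx].size(); ++s_idx) {
      if (s_idx > 0) {
        oss << ", ";
      }
      oss << shapes[t_idx][s_idx];
    }
    oss << "]";
  }
  oss << "]";
  return oss.str();
}

int main() {
  // Shapes for z = torch.mm(x, y) with two 10x10 inputs, as in the test plan:
  std::cout << shapesToStr({{10, 10}, {10, 10}}) << "\n";  // [[10, 10], [10, 10]]
}
```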
From e1a5480012e27741c52d6806692cec99cbd71624 Mon Sep 17 00:00:00 2001
From: ilia-cher
Date: Tue, 3 Nov 2020 13:24:40 -0800
Subject: [PATCH 36/59] Update on "Use libkineto in profiler"

Summary:
Adding ability to use Kineto (CUPTI) to profile CUDA kernels

Test Plan:
USE_KINETO=1 USE_CUDA=1 USE_MKLDNN=1 BLAS=MKL BUILD_BINARY=1 python setup.py develop install
python test/test_profiler.py

[ghstack-poisoned]
---
 torch/csrc/autograd/profiler_kineto.cpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/torch/csrc/autograd/profiler_kineto.cpp b/torch/csrc/autograd/profiler_kineto.cpp
index 530ea007eed7..33ce86603280 100644
--- a/torch/csrc/autograd/profiler_kineto.cpp
+++ b/torch/csrc/autograd/profiler_kineto.cpp
@@ -88,7 +88,7 @@ void pushProfilingCallbacks() {
         libkineto::api().pushCorrelationId(corr_id);
 
         auto ctx_ptr = std::make_unique<KinetoObserverContext>();
-        ctx_ptr->startUs = getTime() / 1000;
+        ctx_ptr->startUs = getTimeUs();
         ctx_ptr->correlationId = corr_id;
         ctx_ptr->startThreadId = at::RecordFunction::currentThreadId();
 
@@ -194,7 +194,7 @@ void enableProfiler(
   c10::ThreadLocalDebugInfo::_push(c10::DebugInfoKind::PROFILER_STATE, state);
 
   state->cpu_trace = std::make_unique<libkineto::CpuTraceBuffer>();
-  state->cpu_trace->span.startTime = getTime() / 1000;
+  state->cpu_trace->span.startTime = getTimeUs();
   state->cpu_trace->gpuOpCount = -1;
   state->cpu_trace->span.name = "PyTorch Profiler";
 
@@ -229,7 +229,7 @@ ProfilerResult disableProfiler() {
 
   state_ptr->mark("__stop_profile");
 
-  state_ptr->cpu_trace->span.endTime = getTime() / 1000;
+  state_ptr->cpu_trace->span.endTime = getTimeUs();
 
   libkineto::api().transferCpuTrace(std::move(state_ptr->cpu_trace));

From 38a37dd6eeab50f5d577d992d9e92b4062fa05c5 Mon Sep 17 00:00:00 2001
From: ilia-cher
Date: Tue, 3 Nov 2020 13:31:16 -0800
Subject: [PATCH 37/59] Update on "Use libkineto in profiler"

Summary:
Adding ability to use Kineto (CUPTI) to profile CUDA kernels

Test Plan:
USE_KINETO=1 USE_CUDA=1 USE_MKLDNN=1 BLAS=MKL BUILD_BINARY=1 python setup.py develop install
python test/test_profiler.py

[ghstack-poisoned]
---
 torch/csrc/autograd/profiler_kineto.cpp |  2 +-
 torch/csrc/autograd/profiler_legacy.cpp |  8 --------
 torch/csrc/autograd/profiler_legacy.h   | 12 ++++++++++--
 3 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/torch/csrc/autograd/profiler_kineto.cpp b/torch/csrc/autograd/profiler_kineto.cpp
index 33ce86603280..27a395182496 100644
--- a/torch/csrc/autograd/profiler_kineto.cpp
+++ b/torch/csrc/autograd/profiler_kineto.cpp
@@ -223,7 +223,7 @@ ProfilerResult disableProfiler() {
   TORCH_CHECK(state_ptr && state_ptr->config().state == ProfilerState::KINETO,
       "Can't disable Kineto profiler when it's not running");
 
-  if (state_ptr->callbackHandle() > 0) {
+  if (state_ptr->hasCallbackHandle()) {
     at::removeCallback(state_ptr->callbackHandle());
   }
 
diff --git a/torch/csrc/autograd/profiler_legacy.cpp b/torch/csrc/autograd/profiler_legacy.cpp
index 2fe3bdb451cc..a8e37d45ee7e 100644
--- a/torch/csrc/autograd/profiler_legacy.cpp
+++ b/torch/csrc/autograd/profiler_legacy.cpp
@@ -260,14 +260,6 @@ void ProfilerThreadLocalState::popRange(const at::RecordFunction& fn, const bool
   }
 }
 
-void ProfilerThreadLocalState::setCallbackHandle(at::CallbackHandle handle) {
-  handle_ = handle;
-}
-
-at::CallbackHandle ProfilerThreadLocalState::callbackHandle() const {
-  return handle_;
-}
-
 void ProfilerThreadLocalState::reportMemoryUsage(
     void* /* unused */,
     int64_t alloc_size,
diff --git a/torch/csrc/autograd/profiler_legacy.h b/torch/csrc/autograd/profiler_legacy.h
index c3efd63a16c4..4ea5c0e830ce 100644
--- a/torch/csrc/autograd/profiler_legacy.h
+++ b/torch/csrc/autograd/profiler_legacy.h
@@ -502,9 +502,17 @@ struct TORCH_API ProfilerThreadLocalState : public c10::MemoryReportingInfoBase
 
   void popRange(const at::RecordFunction& fn, const bool record_cuda);
 
-  void setCallbackHandle(at::CallbackHandle handle);
+  void setCallbackHandle(at::CallbackHandle handle) {
+    handle_ = handle;
+  }
 
-  at::CallbackHandle callbackHandle() const;
+  at::CallbackHandle callbackHandle() const {
+    return handle_;
+  }
+
+  bool hasCallbackHandle() {
+    return handle_ > 0;
+  }
 
   void reportMemoryUsage(
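hasCallbackHandle() treats 0 as the "never registered" sentinel, which is what lets disableProfiler() skip at::removeCallback() when CPU callbacks were never pushed. The guard in miniature (stand-in types, not the real at::CallbackHandle machinery):

```cpp
#include <cstdint>

using CallbackHandle = uint64_t;  // 0 doubles as "no callback registered"

void removeCallback(CallbackHandle /*handle*/) { /* unregister */ }

struct StateSketch {
  CallbackHandle handle_ = 0;

  void setCallbackHandle(CallbackHandle handle) { handle_ = handle; }
  CallbackHandle callbackHandle() const { return handle_; }
  bool hasCallbackHandle() const { return handle_ > 0; }
};

void disableSketch(StateSketch& state) {
  // CPU callbacks are only pushed when ActivityType::CPU was requested, so
  // the handle may legitimately still be the 0 sentinel here.
  if (state.hasCallbackHandle()) {
    removeCallback(state.callbackHandle());
  }
}
```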
From c6c603972ac71d0c2793eb136176879740dc8a4a Mon Sep 17 00:00:00 2001
From: ilia-cher
Date: Tue, 3 Nov 2020 14:16:52 -0800
Subject: [PATCH 38/59] Update on "Use libkineto in profiler"

Summary:
Adding ability to use Kineto (CUPTI) to profile CUDA kernels

Test Plan:
USE_KINETO=1 USE_CUDA=1 USE_MKLDNN=1 BLAS=MKL BUILD_BINARY=1 python setup.py develop install
python test/test_profiler.py

[ghstack-poisoned]
---
 torch/csrc/autograd/profiler_kineto.cpp | 20 ++++++++------------
 torch/csrc/autograd/profiler_kineto.h   |  9 ++++++++-
 2 files changed, 16 insertions(+), 13 deletions(-)

diff --git a/torch/csrc/autograd/profiler_kineto.cpp b/torch/csrc/autograd/profiler_kineto.cpp
index 27a395182496..d730ac6e7f8d 100644
--- a/torch/csrc/autograd/profiler_kineto.cpp
+++ b/torch/csrc/autograd/profiler_kineto.cpp
@@ -50,27 +50,30 @@ struct TORCH_API KinetoThreadLocalState : public ProfilerThreadLocalState {
 #endif
     {
       std::lock_guard<std::mutex> guard(state_mutex_);
-      cpu_trace->ops.emplace_back(std::move(op));
       kineto_events_.emplace_back();
       kineto_events_.back()
+          .activity(op)
           .startThreadId(ctx->startThreadId)
          .endThreadId(ctx->endThreadId)
           .sequenceNr(ctx->sequenceNr)
           .fwdThreadId(ctx->fwdThreadId)
-          .scope(ctx->recFunScope);
+          .scope(ctx->recFunScope)
+          .deviceType(c10::DeviceType::CPU)
+          .shapes(*ctx->shapes);
       if (ctx->stack && !ctx->stack->empty()) {
         kineto_events_.back().stack(*ctx->stack);
       }
+      cpu_trace->ops.emplace_back(std::move(op));
     }
   }
 
   std::vector<KinetoEvent> kineto_events_;
   std::unique_ptr<libkineto::CpuTraceBuffer> cpu_trace;
 };
 
 KinetoThreadLocalState* getProfilerTLSState() {
-  const auto& state = c10::ThreadLocalDebugInfo::get(c10::DebugInfoKind::PROFILER_STATE);
+  const auto& state = c10::ThreadLocalDebugInfo::get(
+      c10::DebugInfoKind::PROFILER_STATE);
   return dynamic_cast<KinetoThreadLocalState*>(state.get());
 }
 
@@ -172,6 +175,7 @@ void prepareProfiler(
     k_activities.insert(libkineto::ActivityType::GPU_MEMCPY);
     k_activities.insert(libkineto::ActivityType::GPU_MEMSET);
     k_activities.insert(libkineto::ActivityType::CONCURRENT_KERNEL);
+    k_activities.insert(libkineto::ActivityType::CUDA_RUNTIME);
   }
 
   //if (!libkineto::api().hasProfilerRegistered()) {
@@ -239,14 +243,6 @@ ProfilerResult disableProfiler() {
   return ProfilerResult(kineto_events, legacy_events);
 }
 
-KinetoEvent& KinetoEvent::activity(const libkineto::TraceActivity& activity) {
-  name_ = activity.name();
-  device_index_ = activity.deviceId();
-  start_us_ = activity.timestamp();
-  duration_us_ = activity.duration();
-  correlation_id_ = activity.correlationId();
-  return *this;
-}
 #endif
 
 bool kinetoAvailable() {
diff --git a/torch/csrc/autograd/profiler_kineto.h b/torch/csrc/autograd/profiler_kineto.h
index 4f429afb72c3..889be6456473 100644
--- a/torch/csrc/autograd/profiler_kineto.h
+++ b/torch/csrc/autograd/profiler_kineto.h
@@ -118,7 +118,14 @@ struct TORCH_API KinetoEvent {
 
   // Kineto fields
 
-  KinetoEvent& activity(const libkineto::TraceActivity& activity);
+  KinetoEvent& activity(const libkineto::TraceActivity& activity) {
+    name_ = activity.name();
+    device_index_ = activity.deviceId();
+    start_us_ = activity.timestamp();
+    duration_us_ = activity.duration();
+    correlation_id_ = activity.correlationId();
+    return *this;
+  }
 
   std::string name() const {
     return name_;
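KinetoEvent::activity(), now inlined above, copies the identifying fields off a libkineto activity and returns *this, so the recording code can chain setters in a single expression. The builder pattern in isolation (stand-in types; the real TraceActivity interface lives in libkineto):

```cpp
#include <cstdint>
#include <string>

// Illustrative stand-ins for libkineto::TraceActivity and KinetoEvent.
struct TraceActivitySketch {
  std::string name;
  int64_t deviceId = 0;
  int64_t timestamp = 0;
  int64_t duration = 0;
  int64_t correlationId = 0;
};

class KinetoEventSketch {
 public:
  // Copy the identifying fields and return *this so further setters chain,
  // e.g. event.activity(op).startThreadId(tid) as in the diff above.
  KinetoEventSketch& activity(const TraceActivitySketch& a) {
    name_ = a.name;
    device_index_ = a.deviceId;
    start_us_ = a.timestamp;
    duration_us_ = a.duration;
    correlation_id_ = a.correlationId;
    return *this;
  }
  KinetoEventSketch& startThreadId(uint64_t tid) {
    start_thread_id_ = tid;
    return *this;
  }

 private:
  std::string name_;
  int64_t device_index_ = 0;
  int64_t start_us_ = 0;
  int64_t duration_us_ = 0;
  int64_t correlation_id_ = 0;
  uint64_t start_thread_id_ = 0;
};
```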
From 17767d1a5a1791c2d245e627d4983c9a1e669a81 Mon Sep 17 00:00:00 2001
From: ilia-cher
Date: Tue, 3 Nov 2020 14:32:09 -0800
Subject: [PATCH 39/59] Update on "Use libkineto in profiler"

Summary:
Adding ability to use Kineto (CUPTI) to profile CUDA kernels

Test Plan:
USE_KINETO=1 USE_CUDA=1 USE_MKLDNN=1 BLAS=MKL BUILD_BINARY=1 python setup.py develop install
python test/test_profiler.py

[ghstack-poisoned]
---
 torch/csrc/autograd/profiler_kineto.cpp | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/torch/csrc/autograd/profiler_kineto.cpp b/torch/csrc/autograd/profiler_kineto.cpp
index d730ac6e7f8d..48749c820156 100644
--- a/torch/csrc/autograd/profiler_kineto.cpp
+++ b/torch/csrc/autograd/profiler_kineto.cpp
@@ -59,7 +59,9 @@ struct TORCH_API KinetoThreadLocalState : public ProfilerThreadLocalState {
           .fwdThreadId(ctx->fwdThreadId)
           .scope(ctx->recFunScope)
-          .deviceType(c10::DeviceType::CPU)
-          .shapes(*ctx->shapes);
+          .deviceType(c10::DeviceType::CPU);
+      if (ctx->shapes && !ctx->shapes->empty()) {
+        kineto_events_.back().shapes(*ctx->shapes);
+      }
       if (ctx->stack && !ctx->stack->empty()) {
         kineto_events_.back().stack(*ctx->stack);
       }
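The fix above defers shape recording until the optional has been checked: dereferencing an empty c10::optional is undefined behavior. The same guard with std::optional standing in for c10::optional:

```cpp
#include <cstdint>
#include <optional>
#include <vector>

using Shapes = std::vector<std::vector<int64_t>>;

struct EventSketch {
  Shapes shapes_;
  EventSketch& shapes(const Shapes& s) {
    shapes_ = s;
    return *this;
  }
};

void recordShapes(EventSketch& ev, const std::optional<Shapes>& shapes) {
  // Test the optional itself first, then reach the inner empty() check
  // through operator-> on the optional, as in the hunk above.
  if (shapes && !shapes->empty()) {
    ev.shapes(*shapes);
  }
}
```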
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name  Self CPU %  Self CPU  CPU total %  CPU total  CPU time avg  Self CUDA  Self CUDA %  CUDA total  CUDA time avg  # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
sgemm_32x32x32_NN  0.00%  0.000us  0.00%  0.000us  0.000us  12.000us  63.16%  12.000us  12.000us  1
void at::native::vectorized_elementwise_kernel<4, at...  0.00%  0.000us  0.00%  0.000us  0.000us  2.750us  14.47%  2.750us  2.750us  1
Memcpy HtoD (Pagable -> Device)  0.00%  0.000us  0.00%  0.000us  0.000us  2.250us  11.84%  2.250us  2.250us  1
Memcpy DtoH (Device -> Pagable)  0.00%  0.000us  0.00%  0.000us  0.000us  2.000us  10.53%  2.000us  2.000us  1
aten::mm  25.87%  364.400ms  25.87%  364.426ms  364.426ms  0.000us  0.00%  0.000us  0.000us  1
aten::empty  0.00%  39.585us  0.00%  39.585us  19.792us  0.000us  0.00%  0.000us  0.000us  2
aten::stride  0.00%  3.363us  0.00%  3.363us  1.121us  0.000us  0.00%  0.000us  0.000us  3
aten::add  74.12%  1.044s  74.12%  1.044s  1.044s  0.000us  0.00%  0.000us  0.000us  1
aten::to  0.00%  13.155us  0.01%  116.398us  116.398us  0.000us  0.00%  0.000us  0.000us  1
aten::empty_strided  0.00%  30.365us  0.00%  30.365us  30.365us  0.000us  0.00%  0.000us  0.000us  1
aten::copy_  0.01%  72.878us  0.01%  72.878us  72.878us  0.000us  0.00%  0.000us  0.000us  1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
```

[ghstack-poisoned]
---
 torch/csrc/autograd/init.cpp            | 44 ++++++++++++++------
 torch/csrc/autograd/profiler_kineto.cpp | 54 ++++++++++++++++++-------
 torch/csrc/autograd/profiler_kineto.h   | 29 +++++++------
 3 files changed, 87 insertions(+), 40 deletions(-)

diff --git a/torch/csrc/autograd/init.cpp b/torch/csrc/autograd/init.cpp
index b844c4349fc6..94d5476f6080 100644
--- a/torch/csrc/autograd/init.cpp
+++ b/torch/csrc/autograd/init.cpp
@@ -73,21 +73,41 @@ PyObject* THPAutograd_initExtension(PyObject* _unused, PyObject *unused) {
 #ifdef USE_KINETO
   py::class_<KinetoEvent>(m, "KinetoEvent")
     .def("name", &KinetoEvent::name)
-    .def("start_thread_id", [](const KinetoEvent& e) { return e.startThreadId(); })
-    .def("end_thread_id", [](const KinetoEvent& e) { return e.endThreadId(); })
+    .def("start_thread_id", [](const KinetoEvent& e) {
+      return e.startThreadId();
+    })
+    .def("end_thread_id", [](const KinetoEvent& e) {
+      return e.endThreadId();
+    })
     .def("device_index", &KinetoEvent::deviceIndex)
     .def("start_us", &KinetoEvent::startUs)
     .def("duration_us", &KinetoEvent::durationUs)
-    .def("correlation_id", [](const KinetoEvent& e) { return e.correlationId(); })
-    .def("fwd_thread_id", [](const KinetoEvent& e) { return e.fwdThreadId(); })
-    .def("shapes", [](const KinetoEvent& e) { return e.shapes(); })
-    .def("sequence_nr", [](const KinetoEvent& e) { return e.sequenceNr(); })
-    .def("stack", [](const KinetoEvent& e) { return e.stack(); })
-    .def("scope", [](const KinetoEvent& e) { return e.scope(); });
-
-  py::class_<ProfilerResult>(m, "ProfilerResult")
-    .def("events", &ProfilerResult::events)
-    .def("legacy_events", &ProfilerResult::legacy_events);
+    .def("correlation_id", [](const KinetoEvent& e) {
+      return e.correlationId();
+    })
+    .def("fwd_thread_id", [](const KinetoEvent& e) {
+      return e.fwdThreadId();
+    })
+    .def("shapes", [](const KinetoEvent& e) {
+      return e.shapes();
+    })
+    .def("sequence_nr", [](const KinetoEvent& e) {
+      return e.sequenceNr();
+    })
+    .def("stack", [](const KinetoEvent& e) {
+      return e.stack();
+    })
+    .def("scope", [](const KinetoEvent& e) {
+      return e.scope();
+    });
+
+  py::class_<ProfilerResultWrapper>(m, "ProfilerResult")
+    .def("events", [](const ProfilerResultWrapper& r) {
+      return r.result_->events();
+    })
+    .def("legacy_events", [](const ProfilerResultWrapper& r) {
+      return r.result_->legacy_events();
+    });

   m.def("_enable_profiler", enableProfiler);
   m.def("_disable_profiler", disableProfiler);
diff --git a/torch/csrc/autograd/profiler_kineto.cpp b/torch/csrc/autograd/profiler_kineto.cpp
index 48749c820156..09f2563d4b63 100644
--- a/torch/csrc/autograd/profiler_kineto.cpp
+++ b/torch/csrc/autograd/profiler_kineto.cpp
@@ -34,8 +34,9 @@ struct TORCH_API KinetoThreadLocalState : public ProfilerThreadLocalState {
       const at::ObserverContext* observer_ctx) override {
     auto ctx = dynamic_cast<const KinetoObserverContext*>(observer_ctx);
     TORCH_CHECK(ctx);
-    TORCH_CHECK(config_.state == ProfilerState::KINETO,
-        "Supported only in Kineto profiler");
+    if (!ctx) {
+      return;
+    }
     libkineto::ClientTraceActivity op;
     op.startTime = ctx->startUs;
     op.endTime = getTimeUs();
@@ -58,8 +59,8 @@ struct TORCH_API KinetoThreadLocalState : public ProfilerThreadLocalState {
           .sequenceNr(ctx->sequenceNr)
           .fwdThreadId(ctx->fwdThreadId)
           .scope(ctx->recFunScope)
-          .deviceType(c10::DeviceType::CPU)
-      if (ctx->shapes && !ctx->shapes.empty()) {
+          .deviceType(c10::DeviceType::CPU);
+      if (ctx->shapes && !ctx->shapes->empty()) {
         kineto_events_.back().shapes(*ctx->shapes);
       }
       if (ctx->stack && !ctx->stack->empty()) {
@@ -69,6 +70,15 @@ struct TORCH_API KinetoThreadLocalState : public ProfilerThreadLocalState {
     }
   }

+  void addTraceEvents(libkineto::ActivityTraceInterface& trace) {
+    // tbd
+  }
+
+  std::vector<std::vector<KinetoEvent>> events() {
+    // tbd
+    return std::vector<std::vector<KinetoEvent>>();
+  }
+
   std::vector<KinetoEvent> kineto_events_;
   std::unique_ptr<libkineto::CpuTraceBuffer> cpu_trace;
 };
@@ -215,13 +225,7 @@ void enableProfiler(
   state->mark("__start_profile", false);
 }

-std::vector<std::vector<KinetoEvent>> filterTrace(
-    std::unique_ptr<libkineto::ActivityTraceInterface>&& trace) {
-  // tbd
-  return std::vector<std::vector<KinetoEvent>>();
-}
-
-ProfilerResult disableProfiler() {
+ProfilerResultWrapper disableProfiler() {
   // all the DebugInfoBase objects are scope based and supposed to use DebugInfoGuard
   auto state = c10::ThreadLocalDebugInfo::_pop(c10::DebugInfoKind::PROFILER_STATE);
@@ -239,12 +243,32 @@ ProfilerResultWrapper disableProfiler() {

   libkineto::api().transferCpuTrace(std::move(state_ptr->cpu_trace));

-  std::vector<std::vector<KinetoEvent>> kineto_events = filterTrace(
-      std::move(libkineto::api().stopTrace()));
-  auto legacy_events = state_ptr->consolidate();
-  return ProfilerResult(kineto_events, legacy_events);
+  auto trace = std::move(libkineto::api().stopTrace());
+  TORCH_CHECK(trace);
+  state_ptr->addTraceEvents(*trace);
+  return ProfilerResultWrapper(std::make_shared<ProfilerResult>(
+      std::move(state_ptr->events()),
+      std::move(state_ptr->consolidate()),
+      std::move(trace)));
 }

+KinetoEvent& KinetoEvent::activity(const libkineto::TraceActivity& activity) {
+  name_ = activity.name();
+  device_index_ = activity.deviceId();
+  start_us_ = activity.timestamp();
+  duration_us_ = activity.duration();
+  correlation_id_ = activity.correlationId();
+  return *this;
+}
+
+ProfilerResult::ProfilerResult(
+    std::vector<std::vector<KinetoEvent>> events,
+    thread_event_lists legacy_events,
+    std::unique_ptr<libkineto::ActivityTraceInterface> trace)
+  : events_(std::move(events)),
+    legacy_events_(std::move(legacy_events)),
+    trace_(std::move(trace)) {}
+
 #endif

 bool kinetoAvailable() {
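(Editorial aside, not part of the patch: with the entry points registered above, the Python-side call sequence this series converges on looks roughly as follows. This is a minimal sketch under stated assumptions, not an excerpt: `config` stands for a torch.autograd.ProfilerConfig built the way torch/autograd/profiler.py builds it, and `run_workload` is a hypothetical placeholder for user code.)

```
# Sketch of the intended flow, assuming the bindings registered above:
# _enable_profiler/_disable_profiler plus the wrapped ProfilerResult
# exposing events() and legacy_events().
torch.autograd._enable_profiler(config)       # start collecting
run_workload()                                # hypothetical user code
result = torch.autograd._disable_profiler()   # ProfilerResult wrapper
kineto_events = result.events()               # Kineto-sourced events
legacy_events = result.legacy_events()        # events not yet ported to Kineto
```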
diff --git a/torch/csrc/autograd/profiler_kineto.h b/torch/csrc/autograd/profiler_kineto.h
index 889be6456473..916cba70a1ca 100644
--- a/torch/csrc/autograd/profiler_kineto.h
+++ b/torch/csrc/autograd/profiler_kineto.h
@@ -3,10 +3,12 @@

 #include
 #include
+//#include

 #ifdef USE_KINETO
 namespace libkineto {
 class TraceActivity;
+class ActivityTraceInterface;
 }
 #endif
@@ -118,14 +120,7 @@ struct TORCH_API KinetoEvent {

   // Kineto fields

-  KinetoEvent& activity(const libkineto::TraceActivity& activity) {
-    name_ = activity.name();
-    device_index_ = activity.deviceId();
-    start_us_ = activity.timestamp();
-    duration_us_ = activity.duration();
-    correlation_id_ = activity.correlationId();
-    return *this;
-  }
+  KinetoEvent& activity(const libkineto::TraceActivity& activity);

   std::string name() const {
     return name_;
@@ -171,11 +166,11 @@ struct TORCH_API KinetoEvent {

 struct TORCH_API ProfilerResult {
   ProfilerResult(
-      const std::vector<std::vector<KinetoEvent>>& events,
-      const thread_event_lists& legacy_events)
-    : events_(events), legacy_events_(legacy_events) {}
+      std::vector<std::vector<KinetoEvent>> events,
+      thread_event_lists legacy_events,
+      std::unique_ptr<libkineto::ActivityTraceInterface> trace);

-  const std::vector<std::vector<KinetoEvent>> events() const {
+  const std::vector<std::vector<KinetoEvent>>& events() const {
     return events_;
   }

@@ -186,13 +181,21 @@ struct TORCH_API ProfilerResult {
 private:
   std::vector<std::vector<KinetoEvent>> events_;
   thread_event_lists legacy_events_; // tensor mem alloc, start/stop
+  std::unique_ptr<libkineto::ActivityTraceInterface> trace_;
+};
+
+// avoid unique_ptr copy issues when using pybind
+struct TORCH_API ProfilerResultWrapper {
+  ProfilerResultWrapper(const std::shared_ptr<ProfilerResult>& result)
+    : result_(result) {}
+  std::shared_ptr<ProfilerResult> result_;
 };

 TORCH_API void enableProfiler(
     const ProfilerConfig& config,
     const std::set<ActivityType>& activities);

-TORCH_API ProfilerResult disableProfiler();
+TORCH_API ProfilerResultWrapper disableProfiler();

 TORCH_API void prepareProfiler(
     const ProfilerConfig& config,

From 043dcd2ac4e6f6e3b86c0c04546ca2bc6e376363 Mon Sep 17 00:00:00 2001
From: ilia-cher
Date: Tue, 3 Nov 2020 16:52:37 -0800
Subject: [PATCH 41/59] Update on "Use libkineto in profiler"

Summary:
Adding ability to use Kineto (CUPTI) to profile CUDA kernels

Test Plan:
USE_KINETO=1 USE_CUDA=1 USE_MKLDNN=1 BLAS=MKL BUILD_BINARY=1 python setup.py develop install
python test/test_profiler.py
```
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name  Self CPU %  Self CPU  CPU total %  CPU total  CPU time avg  Self CUDA  Self CUDA %  CUDA total  CUDA time avg  # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
sgemm_32x32x32_NN  0.00%  0.000us  0.00%  0.000us  0.000us  12.000us  63.16%  12.000us  12.000us  1
void at::native::vectorized_elementwise_kernel<4, at...
0.00% 0.000us 0.00% 0.000us 0.000us 2.750us 14.47% 2.750us 2.750us 1 Memcpy HtoD (Pagable -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 2.250us 11.84% 2.250us 2.250us 1 Memcpy DtoH (Device -> Pagable) 0.00% 0.000us 0.00% 0.000us 0.000us 2.000us 10.53% 2.000us 2.000us 1 aten::mm 25.87% 364.400ms 25.87% 364.426ms 364.426ms 0.000us 0.00% 0.000us 0.000us 1 aten::empty 0.00% 39.585us 0.00% 39.585us 19.792us 0.000us 0.00% 0.000us 0.000us 2 aten::stride 0.00% 3.363us 0.00% 3.363us 1.121us 0.000us 0.00% 0.000us 0.000us 3 aten::add 74.12% 1.044s 74.12% 1.044s 1.044s 0.000us 0.00% 0.000us 0.000us 1 aten::to 0.00% 13.155us 0.01% 116.398us 116.398us 0.000us 0.00% 0.000us 0.000us 1 aten::empty_strided 0.00% 30.365us 0.00% 30.365us 30.365us 0.000us 0.00% 0.000us 0.000us 1 aten::copy_ 0.01% 72.878us 0.01% 72.878us 72.878us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ``` [ghstack-poisoned] --- torch/csrc/autograd/profiler_kineto.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torch/csrc/autograd/profiler_kineto.cpp b/torch/csrc/autograd/profiler_kineto.cpp index 09f2563d4b63..f812f0b83d38 100644 --- a/torch/csrc/autograd/profiler_kineto.cpp +++ b/torch/csrc/autograd/profiler_kineto.cpp @@ -194,7 +194,7 @@ void prepareProfiler( // libkineto::api().registerProfiler( // std::make_unique(false)); //} - //libkineto::api().initProfilerIfRegistered(); + libkineto::api().initProfilerIfRegistered(); libkineto::api().prepareTrace(k_activities); } From 67d4acb9ad6d173d07edbccfc6bb46fe08aecf1e Mon Sep 17 00:00:00 2001 From: ilia-cher Date: Wed, 11 Nov 2020 12:29:17 -0800 Subject: [PATCH 42/59] Update on "Use libkineto in profiler" Summary: Adding ability to use Kineto (CUPTI) to profile CUDA kernels Test Plan: USE_KINETO=1 USE_CUDA=1 USE_MKLDNN=1 BLAS=MKL BUILD_BINARY=1 python setup.py develop install python test/test_profiler.py ``` ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ sgemm_32x32x32_NN 0.00% 0.000us 0.00% 0.000us 0.000us 12.000us 63.16% 12.000us 12.000us 1 void at::native::vectorized_elementwise_kernel<4, at... 
0.00% 0.000us 0.00% 0.000us 0.000us 2.750us 14.47% 2.750us 2.750us 1 Memcpy HtoD (Pagable -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 2.250us 11.84% 2.250us 2.250us 1 Memcpy DtoH (Device -> Pagable) 0.00% 0.000us 0.00% 0.000us 0.000us 2.000us 10.53% 2.000us 2.000us 1 aten::mm 25.87% 364.400ms 25.87% 364.426ms 364.426ms 0.000us 0.00% 0.000us 0.000us 1 aten::empty 0.00% 39.585us 0.00% 39.585us 19.792us 0.000us 0.00% 0.000us 0.000us 2 aten::stride 0.00% 3.363us 0.00% 3.363us 1.121us 0.000us 0.00% 0.000us 0.000us 3 aten::add 74.12% 1.044s 74.12% 1.044s 1.044s 0.000us 0.00% 0.000us 0.000us 1 aten::to 0.00% 13.155us 0.01% 116.398us 116.398us 0.000us 0.00% 0.000us 0.000us 1 aten::empty_strided 0.00% 30.365us 0.00% 30.365us 30.365us 0.000us 0.00% 0.000us 0.000us 1 aten::copy_ 0.01% 72.878us 0.01% 72.878us 72.878us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ``` [ghstack-poisoned] --- torch/autograd/profiler.py | 77 +++++++++++++++++++++----------------- 1 file changed, 43 insertions(+), 34 deletions(-) diff --git a/torch/autograd/profiler.py b/torch/autograd/profiler.py index bcf10dfb8699..380325464575 100644 --- a/torch/autograd/profiler.py +++ b/torch/autograd/profiler.py @@ -89,15 +89,15 @@ def populate_cpu_children(self): for thread_id, thread_events in threads: thread_events_ = sorted( thread_events, - key=lambda event: [event.cpu_interval.start, -event.cpu_interval.end], + key=lambda event: [event.time_range.start, -event.time_range.end], ) current_events: List[FunctionEvent] = [] cur_end = 0 for event in thread_events_: while len(current_events) > 0: parent = current_events[-1] - if event.cpu_interval.start >= parent.cpu_interval.end or \ - event.cpu_interval.end > parent.cpu_interval.end: + if event.time_range.start >= parent.time_range.end or \ + event.time_range.end > parent.time_range.end: # this can't be a parent current_events.pop() else: @@ -205,8 +205,8 @@ def export_chrome_trace(self, path): '"args": {}}, ' % ( evt.name, - evt.cpu_interval.start, - evt.cpu_interval.elapsed_us(), + evt.time_range.start, + evt.time_range.elapsed_us(), evt.thread if not evt.is_remote else f'" node_id:{evt.node_id}, thread_id:{evt.thread} "', @@ -222,7 +222,7 @@ def export_chrome_trace(self, path): '"pid": "CPU functions", ' '"id": %s, ' '"cat": "cpu_to_cuda", ' - '"args": {}}, ' % (evt.name, evt.cpu_interval.start, + '"args": {}}, ' % (evt.name, evt.time_range.start, evt.thread, next_id)) f.write('{"name": "%s", ' '"ph": "f", ' @@ -433,22 +433,15 @@ def __exit__(self, exc_type, exc_val, exc_tb): if not self.enabled: return if self.kineto_activities: - result = torch.autograd._disable_profiler() - # - for evt_list in result.legacy_events(): - for evt in evt_list: - print(evt, evt.kind(), flush=True) - print() - for evt in result.events(): - print(" ", evt.name(), evt.start_thread_id(), evt.end_thread_id(), evt.device_index(), evt.device_resource_id(), evt.start_us(), evt.duration_us(), evt.correlation_id(), evt.fwd_thread_id()) - # - self.function_events = parse_profiler_result(result) + results = torch.autograd._disable_profiler() + parsed_results = parse_kineto_results(results) else: records = torch.autograd._disable_profiler_legacy() - self.function_events = EventList( - parse_event_records(records), - use_cuda=self.use_cuda, - profile_memory=self.profile_memory) + parsed_results = parse_legacy_records(records) + 
self.function_events = EventList( + parsed_results, + use_cuda=self.use_cuda, + profile_memory=self.profile_memory) if self.with_stack: self.function_events.set_backward_stacktraces() return False @@ -779,13 +772,13 @@ def elapsed_us(self): class FunctionEvent(FormattedTimesMixin): """Profiling information about a single function.""" def __init__( - self, id, node_id, name, thread, cpu_start, cpu_end, fwd_thread=None, input_shapes=None, + self, id, node_id, name, thread, start_us, end_us, fwd_thread=None, input_shapes=None, stack=None, scope=0, cpu_memory_usage=0, cuda_memory_usage=0, is_async=False, is_remote=True, sequence_nr=-1): self.id: int = id self.node_id: int = node_id self.name: str = name - self.cpu_interval: Interval = Interval(cpu_start, cpu_end) + self.time_range: Interval = Interval(start_us, end_us) self.thread: int = thread self.fwd_thread: Optional[int] = fwd_thread self.kernels: List[Kernel] = [] @@ -860,7 +853,7 @@ def self_cuda_time_total(self): @property def cpu_time_total(self): - return self.cpu_interval.elapsed_us() + return self.time_range.elapsed_us() @property def key(self): @@ -868,14 +861,14 @@ def key(self): def __repr__(self): return ( - ''.format( self.id, self.node_id, self.cpu_time_str, - self.cpu_interval.start, - self.cpu_interval.end, + self.time_range.start, + self.time_range.end, str([child.id for child in self.cpu_children]), self.cuda_time_str, self.name, @@ -971,10 +964,26 @@ def __missing__(self, key): self[key] = torch._C._demangle(key) if len(key) > 1 else key return self[key] -def parse_event_records(thread_records): +# Parsing of kineto profiler events +def parse_kineto_results(result): + # + for evt_list in result.legacy_events(): + for evt in evt_list: + print(evt, evt.kind(), flush=True) + print() + for evt in result.events(): + print(" ", evt.name(), evt.start_thread_id(), evt.end_thread_id(), evt.device_index(), evt.device_resource_id(), evt.start_us(), evt.duration_us(), evt.correlation_id(), evt.fwd_thread_id()) + # + return [] + # result.events() has most of the events - PyTorch op-level and device-level events + # result.legacy_events() has events not yet ported to kineto + # (e.g. start/stop marks, tensor memory allocator events) + +# Parsing of legacy profiler events +def parse_legacy_records(thread_records): def get_record_key(record): """ - Returns a tuple to be used by parse_event_records for correlating start and + Returns a tuple to be used by parse_legacy_records for correlating start and end records. """ return (record.handle(), record.node_id()) @@ -1083,8 +1092,8 @@ def adjusted_time(cuda_record, cuda_records_map): node_id=record.node_id(), name=string_table[start.name()], thread=start.thread_id(), - cpu_start=start_record.cpu_elapsed_us(start), - cpu_end=start_record.cpu_elapsed_us(record), + start_us=start_record.cpu_elapsed_us(start), + end_us=start_record.cpu_elapsed_us(record), fwd_thread=start.fwd_thread_id(), input_shapes=start.shapes(), stack=[entry for entry in start.stack() if filter_stack_entry(entry)], @@ -1122,7 +1131,7 @@ def adjusted_time(cuda_record, cuda_records_map): # granularity of the given clock tick)--we always show # the outermost nested call first. This adds stability # in how FunctionEvents appear - functions.sort(key=lambda evt: [evt.cpu_interval.start, -evt.cpu_interval.end]) + functions.sort(key=lambda evt: [evt.time_range.start, -evt.time_range.end]) return functions @@ -1169,8 +1178,8 @@ def parse_nvprof_trace(path): node_id=0, # missing a node_id when calling FunctionEvent. 
This is just to ensure # that pytorch doesn't crash when creating a FunctionEvent() object name=strings[row['name']], - cpu_start=row['start_time'], - cpu_end=row['end_time'], + start_us=row['start_time'], + end_us=row['end_time'], thread=0) # TODO: find in sqlite database functions.append(evt) functions_map[evt.id] = evt @@ -1201,7 +1210,7 @@ def parse_nvprof_trace(path): row['kernel_start'], row['kernel_end']) - functions.sort(key=lambda evt: evt.cpu_interval.start) + functions.sort(key=lambda evt: evt.time_range.start) return functions From 9f1d24fa938d8f4ca31080b9136d398e8cc7703c Mon Sep 17 00:00:00 2001 From: ilia-cher Date: Wed, 11 Nov 2020 12:53:49 -0800 Subject: [PATCH 43/59] Update on "Use libkineto in profiler" Summary: Adding ability to use Kineto (CUPTI) to profile CUDA kernels Test Plan: USE_KINETO=1 USE_CUDA=1 USE_MKLDNN=1 BLAS=MKL BUILD_BINARY=1 python setup.py develop install python test/test_profiler.py ``` ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ sgemm_32x32x32_NN 0.00% 0.000us 0.00% 0.000us 0.000us 12.000us 63.16% 12.000us 12.000us 1 void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 2.750us 14.47% 2.750us 2.750us 1 Memcpy HtoD (Pagable -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 2.250us 11.84% 2.250us 2.250us 1 Memcpy DtoH (Device -> Pagable) 0.00% 0.000us 0.00% 0.000us 0.000us 2.000us 10.53% 2.000us 2.000us 1 aten::mm 25.87% 364.400ms 25.87% 364.426ms 364.426ms 0.000us 0.00% 0.000us 0.000us 1 aten::empty 0.00% 39.585us 0.00% 39.585us 19.792us 0.000us 0.00% 0.000us 0.000us 2 aten::stride 0.00% 3.363us 0.00% 3.363us 1.121us 0.000us 0.00% 0.000us 0.000us 3 aten::add 74.12% 1.044s 74.12% 1.044s 1.044s 0.000us 0.00% 0.000us 0.000us 1 aten::to 0.00% 13.155us 0.01% 116.398us 116.398us 0.000us 0.00% 0.000us 0.000us 1 aten::empty_strided 0.00% 30.365us 0.00% 30.365us 30.365us 0.000us 0.00% 0.000us 0.000us 1 aten::copy_ 0.01% 72.878us 0.01% 72.878us 72.878us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ``` [ghstack-poisoned] --- torch/autograd/profiler.py | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/torch/autograd/profiler.py b/torch/autograd/profiler.py index 380325464575..0d48ff1f1b41 100644 --- a/torch/autograd/profiler.py +++ b/torch/autograd/profiler.py @@ -966,19 +966,22 @@ def __missing__(self, key): # Parsing of kineto profiler events def parse_kineto_results(result): - # - for evt_list in result.legacy_events(): - for evt in evt_list: - print(evt, evt.kind(), flush=True) - print() - for evt in result.events(): - print(" ", evt.name(), evt.start_thread_id(), evt.end_thread_id(), evt.device_index(), evt.device_resource_id(), evt.start_us(), evt.duration_us(), evt.correlation_id(), evt.fwd_thread_id()) - # - return [] # result.events() has most of the events - PyTorch op-level and device-level events # result.legacy_events() has events not yet ported to kineto # (e.g. 
start/stop marks, tensor memory allocator events) + # First, find __start_profile mark to get the absolute time of the start of the trace + start_record = None + for record in itertools.chain(*result.legacy_events()): + if record.kind() == 'mark' and record.name() == '__start_profile': + assert start_record is None + start_record = record + assert start_record is not None, "Invalid profiler output, __start_profile is missing" + + # Create and return FunctionEvent list + function_events = [] + return function_events + # Parsing of legacy profiler events def parse_legacy_records(thread_records): def get_record_key(record): From e86420571e5458128a62d3b0ee11bc5b8b18f80b Mon Sep 17 00:00:00 2001 From: ilia-cher Date: Wed, 11 Nov 2020 13:04:28 -0800 Subject: [PATCH 44/59] Update on "Use libkineto in profiler" Summary: Adding ability to use Kineto (CUPTI) to profile CUDA kernels Test Plan: USE_KINETO=1 USE_CUDA=1 USE_MKLDNN=1 BLAS=MKL BUILD_BINARY=1 python setup.py develop install python test/test_profiler.py ``` ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ sgemm_32x32x32_NN 0.00% 0.000us 0.00% 0.000us 0.000us 12.000us 63.16% 12.000us 12.000us 1 void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 2.750us 14.47% 2.750us 2.750us 1 Memcpy HtoD (Pagable -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 2.250us 11.84% 2.250us 2.250us 1 Memcpy DtoH (Device -> Pagable) 0.00% 0.000us 0.00% 0.000us 0.000us 2.000us 10.53% 2.000us 2.000us 1 aten::mm 25.87% 364.400ms 25.87% 364.426ms 364.426ms 0.000us 0.00% 0.000us 0.000us 1 aten::empty 0.00% 39.585us 0.00% 39.585us 19.792us 0.000us 0.00% 0.000us 0.000us 2 aten::stride 0.00% 3.363us 0.00% 3.363us 1.121us 0.000us 0.00% 0.000us 0.000us 3 aten::add 74.12% 1.044s 74.12% 1.044s 1.044s 0.000us 0.00% 0.000us 0.000us 1 aten::to 0.00% 13.155us 0.01% 116.398us 116.398us 0.000us 0.00% 0.000us 0.000us 1 aten::empty_strided 0.00% 30.365us 0.00% 30.365us 30.365us 0.000us 0.00% 0.000us 0.000us 1 aten::copy_ 0.01% 72.878us 0.01% 72.878us 72.878us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ``` [ghstack-poisoned] --- torch/autograd/profiler.py | 80 +++++++++++++++++++++++++++----------- 1 file changed, 58 insertions(+), 22 deletions(-) diff --git a/torch/autograd/profiler.py b/torch/autograd/profiler.py index 0d48ff1f1b41..4e3c0b9ca15b 100644 --- a/torch/autograd/profiler.py +++ b/torch/autograd/profiler.py @@ -964,22 +964,78 @@ def __missing__(self, key): self[key] = torch._C._demangle(key) if len(key) > 1 else key return self[key] +def filter_stack_entry(entry): + filtered_entries = [ + ("autograd/__init__", "_make_grads"), + ("autograd/__init__", "backward"), + ("torch/tensor", "backward"), + ("_internal/common_utils", "prof_callable"), + ("_internal/common_utils", "prof_func_call"), + ("_internal/common_utils", "prof_meth_call"), + ] + return all([not (f[0] in entry and f[1] in entry) for f in filtered_entries]) + +def 
filter_name(name): + # ignoring the following utility ops + filtered_out_names = [ + "profiler::_record_function_enter", + "profiler::_record_function_exit", + "aten::is_leaf", + "aten::output_nr", + "aten::_version", + ] + return name in filtered_out_names + # Parsing of kineto profiler events def parse_kineto_results(result): # result.events() has most of the events - PyTorch op-level and device-level events # result.legacy_events() has events not yet ported to kineto # (e.g. start/stop marks, tensor memory allocator events) - # First, find __start_profile mark to get the absolute time of the start of the trace + # First, find __start_profile mark to get the absolute time of the start of the trace; + # save memory allocation records start_record = None + mem_records = [] for record in itertools.chain(*result.legacy_events()): if record.kind() == 'mark' and record.name() == '__start_profile': assert start_record is None start_record = record + if record.kind() == 'memory_alloc': + mem_records.append(record) assert start_record is not None, "Invalid profiler output, __start_profile is missing" # Create and return FunctionEvent list function_events = [] + for kineto_event in result.events(): + fe = FunctionEvent( + id=record.handle(), + node_id=record.node_id(), + name=string_table[start.name()], + thread=start.thread_id(), + start_us=start_record.cpu_elapsed_us(start), + end_us=start_record.cpu_elapsed_us(record), + fwd_thread=start.fwd_thread_id(), + input_shapes=start.shapes(), + stack=[entry for entry in start.stack() if filter_stack_entry(entry)], + scope=start.scope(), + cpu_memory_usage=cpu_memory_usage, + cuda_memory_usage=cuda_memory_usage, + is_async=is_async, + is_remote=is_remote_event, + sequence_nr=start.sequence_nr(), + ) + # note: async events have only cpu total time + if not is_async and start.has_cuda(): + cuda_start = adjusted_time(start, cuda_records) + cuda_end = adjusted_time(record, cuda_records) + if (cuda_end - cuda_start) > 0: + fe.append_kernel( + start.name(), + start.device(), + cuda_start, + cuda_end) + function_events.append(fe) + function_events.sort(key=lambda evt: [evt.time_range.start, -evt.time_range.end]) return function_events # Parsing of legacy profiler events @@ -998,26 +1054,6 @@ def get_record_key(record): record_stack = [] string_table = StringTable() - # ignoring the following utility ops - filtered_out_names = [ - "profiler::_record_function_enter", - "profiler::_record_function_exit", - "aten::is_leaf", - "aten::output_nr", - "aten::_version", - ] - - def filter_stack_entry(entry): - filtered_entries = [ - ("autograd/__init__", "_make_grads"), - ("autograd/__init__", "backward"), - ("torch/tensor", "backward"), - ("_internal/common_utils", "prof_callable"), - ("_internal/common_utils", "prof_func_call"), - ("_internal/common_utils", "prof_meth_call"), - ] - return all([not (f[0] in entry and f[1] in entry) for f in filtered_entries]) - # cuda start events and the overall profiler start event don't happen # at exactly the same time because we need to record an event on each device # and each record takes ~4us. 
So we adjust here by the difference @@ -1054,7 +1090,7 @@ def adjusted_time(cuda_record, cuda_records_map): prev_record = None for record in thread_record_list: record_key = get_record_key(record) - if (record.name() in filtered_out_names or + if (filter_name(record.name()) or record_key in filtered_handles): filtered_handles.add(record_key) continue From 380b874e628290877b75a9da955336a75e145004 Mon Sep 17 00:00:00 2001 From: ilia-cher Date: Wed, 11 Nov 2020 14:17:00 -0800 Subject: [PATCH 45/59] Update on "Use libkineto in profiler" Summary: Adding ability to use Kineto (CUPTI) to profile CUDA kernels Test Plan: USE_KINETO=1 USE_CUDA=1 USE_MKLDNN=1 BLAS=MKL BUILD_BINARY=1 python setup.py develop install python test/test_profiler.py ``` ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ sgemm_32x32x32_NN 0.00% 0.000us 0.00% 0.000us 0.000us 12.000us 63.16% 12.000us 12.000us 1 void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 2.750us 14.47% 2.750us 2.750us 1 Memcpy HtoD (Pagable -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 2.250us 11.84% 2.250us 2.250us 1 Memcpy DtoH (Device -> Pagable) 0.00% 0.000us 0.00% 0.000us 0.000us 2.000us 10.53% 2.000us 2.000us 1 aten::mm 25.87% 364.400ms 25.87% 364.426ms 364.426ms 0.000us 0.00% 0.000us 0.000us 1 aten::empty 0.00% 39.585us 0.00% 39.585us 19.792us 0.000us 0.00% 0.000us 0.000us 2 aten::stride 0.00% 3.363us 0.00% 3.363us 1.121us 0.000us 0.00% 0.000us 0.000us 3 aten::add 74.12% 1.044s 74.12% 1.044s 1.044s 0.000us 0.00% 0.000us 0.000us 1 aten::to 0.00% 13.155us 0.01% 116.398us 116.398us 0.000us 0.00% 0.000us 0.000us 1 aten::empty_strided 0.00% 30.365us 0.00% 30.365us 30.365us 0.000us 0.00% 0.000us 0.000us 1 aten::copy_ 0.01% 72.878us 0.01% 72.878us 72.878us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ``` [ghstack-poisoned] --- torch/autograd/profiler.py | 44 ++++++++------------ torch/csrc/autograd/init.cpp | 53 ++++++++++++++++++------- torch/csrc/autograd/profiler_kineto.cpp | 18 +++++++++ torch/csrc/autograd/profiler_kineto.h | 2 + 4 files changed, 76 insertions(+), 41 deletions(-) diff --git a/torch/autograd/profiler.py b/torch/autograd/profiler.py index 4e3c0b9ca15b..17d854a74f4d 100644 --- a/torch/autograd/profiler.py +++ b/torch/autograd/profiler.py @@ -772,9 +772,9 @@ def elapsed_us(self): class FunctionEvent(FormattedTimesMixin): """Profiling information about a single function.""" def __init__( - self, id, node_id, name, thread, start_us, end_us, fwd_thread=None, input_shapes=None, + self, id, name, thread, start_us, end_us, fwd_thread=None, input_shapes=None, stack=None, scope=0, cpu_memory_usage=0, cuda_memory_usage=0, is_async=False, - is_remote=True, sequence_nr=-1): + is_remote=False, sequence_nr=-1, node_id=0): self.id: int = id self.node_id: int = node_id self.name: str = name @@ -1005,35 +1005,25 @@ def parse_kineto_results(result): assert start_record is not None, "Invalid profiler output, 
__start_profile is missing"

     # Create and return FunctionEvent list
+    string_table = StringTable()
     function_events = []
     for kineto_event in result.events():
+        fe_start_us = kineto_event.start_us() - start_record.start_us()
         fe = FunctionEvent(
-            id=record.handle(),
-            node_id=record.node_id(),
-            name=string_table[start.name()],
-            thread=start.thread_id(),
-            start_us=start_record.cpu_elapsed_us(start),
-            end_us=start_record.cpu_elapsed_us(record),
-            fwd_thread=start.fwd_thread_id(),
-            input_shapes=start.shapes(),
-            stack=[entry for entry in start.stack() if filter_stack_entry(entry)],
-            scope=start.scope(),
-            cpu_memory_usage=cpu_memory_usage,
-            cuda_memory_usage=cuda_memory_usage,
-            is_async=is_async,
-            is_remote=is_remote_event,
-            sequence_nr=start.sequence_nr(),
+            id=kineto_event.correlation_id(),
+            name=string_table[kineto_event.name()],
+            thread=kineto_event.start_thread_id(),
+            start_us=fe_start_us,
+            end_us=fe_start_us + kineto_event.duration_us(),
+            fwd_thread=kineto_event.fwd_thread_id(),
+            input_shapes=kineto_event.shapes(),
+            stack=[entry for entry in kineto_event.stack() if filter_stack_entry(entry)],
+            scope=kineto_event.scope(),
+            #cpu_memory_usage=cpu_memory_usage,
+            #cuda_memory_usage=cuda_memory_usage,
+            is_async=kineto_event.start_thread_id() != kineto_event.end_thread_id(),
+            sequence_nr=kineto_event.sequence_nr(),
         )
-        # note: async events have only cpu total time
-        if not is_async and start.has_cuda():
-            cuda_start = adjusted_time(start, cuda_records)
-            cuda_end = adjusted_time(record, cuda_records)
-            if (cuda_end - cuda_start) > 0:
-                fe.append_kernel(
-                    start.name(),
-                    start.device(),
-                    cuda_start,
-                    cuda_end)
         function_events.append(fe)
     function_events.sort(key=lambda evt: [evt.time_range.start, -evt.time_range.end])
     return function_events
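(Editorial aside, not part of the patch: the loop above leans on the fact that Kineto timestamps are absolute while FunctionEvent expects times relative to the trace start, which is why the `__start_profile` mark is located first. A hedged sketch of that conversion, with `kineto_event` and `start_record` as in parse_kineto_results above:)

```
# Illustration of the timestamp conversion done above:
# KinetoEvent.start_us() is absolute (us since unix epoch), while
# FunctionEvent works in us relative to the __start_profile mark.
trace_start_us = start_record.start_us()
rel_start_us = kineto_event.start_us() - trace_start_us
rel_end_us = rel_start_us + kineto_event.duration_us()
```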
.def("shapes", [](const KinetoEvent& e) { - return e.shapes(); - }) - .def("sequence_nr", [](const KinetoEvent& e) { - return e.sequenceNr(); + if (e.hasShapes()) { + return e.shapes(); + } else { + return std::vector>(); + } }) + // stack traces of the PyTorch CPU events .def("stack", [](const KinetoEvent& e) { - return e.stack(); + if (e.hasStack()) { + return e.stack(); + } else { + return std::vector(); + } }) + // type of the RecordFunction that generated this PyTorch CPU event + // (op, torchscript function, user label, etc) .def("scope", [](const KinetoEvent& e) { return e.scope(); }) - .def("activity_type", [](const KinetoEvent& e) { - return e.activityType(); - }); + // device number, for CPU - process id + .def("device_index", &KinetoEvent::deviceIndex) + // for CUDA - stream id, for CPU - start thread id + .def("device_resource_id", &KinetoEvent::deviceResourceId) + // device type, currently: CPU or CUDA + .def("device_type", &KinetoEvent::deviceType); py::class_(m, "ProfilerResult") .def("events", [](const ProfilerResultWrapper& r) { diff --git a/torch/csrc/autograd/profiler_kineto.cpp b/torch/csrc/autograd/profiler_kineto.cpp index e855674bc860..721a698bacaf 100644 --- a/torch/csrc/autograd/profiler_kineto.cpp +++ b/torch/csrc/autograd/profiler_kineto.cpp @@ -282,6 +282,24 @@ KinetoEvent& KinetoEvent::activity(const libkineto::TraceActivity& activity) { return *this; } +c10::DeviceType KinetoEvent::deviceType() const { + switch (activity_type_) { + case (uint8_t)libkineto::ActivityType::CPU_OP: + return c10::DeviceType::CPU; + case (uint8_t)libkineto::ActivityType::GPU_MEMCPY: + return c10::DeviceType::CUDA; + case (uint8_t)libkineto::ActivityType::GPU_MEMSET: + return c10::DeviceType::CUDA; + case (uint8_t)libkineto::ActivityType::CONCURRENT_KERNEL: + return c10::DeviceType::CUDA; + case (uint8_t)libkineto::ActivityType::EXTERNAL_CORRELATION: + return c10::DeviceType::CPU; + case (uint8_t)libkineto::ActivityType::CUDA_RUNTIME: + return c10::DeviceType::CPU; + } + TORCH_CHECK(false, "Unknown activity type"); +} + KinetoEvent::KinetoEvent() : activity_type_((uint8_t)libkineto::ActivityType::CPU_OP) {} ProfilerResult::ProfilerResult( diff --git a/torch/csrc/autograd/profiler_kineto.h b/torch/csrc/autograd/profiler_kineto.h index c4e08834af0e..998667da6d97 100644 --- a/torch/csrc/autograd/profiler_kineto.h +++ b/torch/csrc/autograd/profiler_kineto.h @@ -144,6 +144,8 @@ struct TORCH_API KinetoEvent { return device_resource_id_; } + c10::DeviceType deviceType() const; + uint64_t start_thread_id_ = 0; uint64_t end_thread_id_ = 0; uint64_t fwd_thread_id_ = 0; From 165bb7c75613e25793e2cb90c483b88b0e6a3177 Mon Sep 17 00:00:00 2001 From: ilia-cher Date: Wed, 11 Nov 2020 14:19:52 -0800 Subject: [PATCH 46/59] Update on "Use libkineto in profiler" Summary: Adding ability to use Kineto (CUPTI) to profile CUDA kernels Test Plan: USE_KINETO=1 USE_CUDA=1 USE_MKLDNN=1 BLAS=MKL BUILD_BINARY=1 python setup.py develop install python test/test_profiler.py ``` ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ sgemm_32x32x32_NN 0.00% 0.000us 0.00% 0.000us 0.000us 
From 165bb7c75613e25793e2cb90c483b88b0e6a3177 Mon Sep 17 00:00:00 2001
From: ilia-cher
Date: Wed, 11 Nov 2020 14:19:52 -0800
Subject: [PATCH 46/59] Update on "Use libkineto in profiler"

Summary:
Adding ability to use Kineto (CUPTI) to profile CUDA kernels

Test Plan:
USE_KINETO=1 USE_CUDA=1 USE_MKLDNN=1 BLAS=MKL BUILD_BINARY=1 python setup.py develop install
python test/test_profiler.py
```
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name  Self CPU %  Self CPU  CPU total %  CPU total  CPU time avg  Self CUDA  Self CUDA %  CUDA total  CUDA time avg  # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
sgemm_32x32x32_NN  0.00%  0.000us  0.00%  0.000us  0.000us  12.000us  63.16%  12.000us  12.000us  1
void at::native::vectorized_elementwise_kernel<4, at...  0.00%  0.000us  0.00%  0.000us  0.000us  2.750us  14.47%  2.750us  2.750us  1
Memcpy HtoD (Pagable -> Device)  0.00%  0.000us  0.00%  0.000us  0.000us  2.250us  11.84%  2.250us  2.250us  1
Memcpy DtoH (Device -> Pagable)  0.00%  0.000us  0.00%  0.000us  0.000us  2.000us  10.53%  2.000us  2.000us  1
aten::mm  25.87%  364.400ms  25.87%  364.426ms  364.426ms  0.000us  0.00%  0.000us  0.000us  1
aten::empty  0.00%  39.585us  0.00%  39.585us  19.792us  0.000us  0.00%  0.000us  0.000us  2
aten::stride  0.00%  3.363us  0.00%  3.363us  1.121us  0.000us  0.00%  0.000us  0.000us  3
aten::add  74.12%  1.044s  74.12%  1.044s  1.044s  0.000us  0.00%  0.000us  0.000us  1
aten::to  0.00%  13.155us  0.01%  116.398us  116.398us  0.000us  0.00%  0.000us  0.000us  1
aten::empty_strided  0.00%  30.365us  0.00%  30.365us  30.365us  0.000us  0.00%  0.000us  0.000us  1
aten::copy_  0.01%  72.878us  0.01%  72.878us  72.878us  0.000us  0.00%  0.000us  0.000us  1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
```

[ghstack-poisoned]
---
 torch/csrc/autograd/init.cpp | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/torch/csrc/autograd/init.cpp b/torch/csrc/autograd/init.cpp
index 647683cc19da..6afdab010b76 100644
--- a/torch/csrc/autograd/init.cpp
+++ b/torch/csrc/autograd/init.cpp
@@ -74,11 +74,11 @@ PyObject* THPAutograd_initExtension(PyObject* _unused, PyObject *unused) {
   py::class_<KinetoEvent>(m, "KinetoEvent")
     // name of the event
     .def("name", &KinetoEvent::name)
-    // start callback PyTorch thread id
+    // PyTorch thread id of the start callback
     .def("start_thread_id", [](const KinetoEvent& e) {
       return e.startThreadId();
     })
-    // end callback PyTorch thread id
+    // PyTorch thread id of the end callback
     .def("end_thread_id", [](const KinetoEvent& e) {
       return e.endThreadId();
     })
@@ -96,7 +96,7 @@ PyObject* THPAutograd_initExtension(PyObject* _unused, PyObject *unused) {
     .def("start_us", &KinetoEvent::startUs)
     // duration in us
     .def("duration_us", &KinetoEvent::durationUs)
-    // used to correlate between high-level PyTorch events
+    // used for correlation between high-level PyTorch events
     // and low-level device events
     .def("correlation_id", [](const KinetoEvent& e) {
       return e.correlationId();
@@ -117,7 +117,7 @@ PyObject* THPAutograd_initExtension(PyObject* _unused, PyObject *unused) {
       }
     })
-    // type of the RecordFunction that generated this PyTorch CPU event
+    // type of the RecordFunction that generated a PyTorch CPU event
     // (op, torchscript function, user label, etc)
     .def("scope", [](const KinetoEvent& e) {
       return e.scope();

From 445b8c1300435b1f14501694ec769715802699ea Mon Sep 17 00:00:00 2001
From: ilia-cher
Date: Wed, 11 Nov 2020 16:06:58 -0800
Subject: [PATCH 47/59] Update on "Use libkineto in profiler"

Summary:
Adding ability to use Kineto (CUPTI) to profile CUDA kernels

Test Plan:
USE_KINETO=1 USE_CUDA=1 USE_MKLDNN=1 BLAS=MKL BUILD_BINARY=1 python setup.py develop install
python test/test_profiler.py
```
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name  Self CPU %  Self CPU  CPU total %  CPU total  CPU time avg  Self CUDA  Self CUDA %  CUDA total  CUDA time avg  # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------
------------ ------------ ------------ ------------ sgemm_32x32x32_NN 0.00% 0.000us 0.00% 0.000us 0.000us 12.000us 63.16% 12.000us 12.000us 1 void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 2.750us 14.47% 2.750us 2.750us 1 Memcpy HtoD (Pagable -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 2.250us 11.84% 2.250us 2.250us 1 Memcpy DtoH (Device -> Pagable) 0.00% 0.000us 0.00% 0.000us 0.000us 2.000us 10.53% 2.000us 2.000us 1 aten::mm 25.87% 364.400ms 25.87% 364.426ms 364.426ms 0.000us 0.00% 0.000us 0.000us 1 aten::empty 0.00% 39.585us 0.00% 39.585us 19.792us 0.000us 0.00% 0.000us 0.000us 2 aten::stride 0.00% 3.363us 0.00% 3.363us 1.121us 0.000us 0.00% 0.000us 0.000us 3 aten::add 74.12% 1.044s 74.12% 1.044s 1.044s 0.000us 0.00% 0.000us 0.000us 1 aten::to 0.00% 13.155us 0.01% 116.398us 116.398us 0.000us 0.00% 0.000us 0.000us 1 aten::empty_strided 0.00% 30.365us 0.00% 30.365us 30.365us 0.000us 0.00% 0.000us 0.000us 1 aten::copy_ 0.01% 72.878us 0.01% 72.878us 72.878us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ``` [ghstack-poisoned] --- torch/autograd/profiler.py | 80 ++++++++++++++++++++++++++++-------- torch/csrc/autograd/init.cpp | 4 +- 2 files changed, 67 insertions(+), 17 deletions(-) diff --git a/torch/autograd/profiler.py b/torch/autograd/profiler.py index 17d854a74f4d..24b88c41865e 100644 --- a/torch/autograd/profiler.py +++ b/torch/autograd/profiler.py @@ -62,7 +62,7 @@ def populate_cpu_children(self): # Some events can be async (i.e. start and end on different threads), # since it's generally undefined how to attribute children ranges to # async ranges, we do not use them when calculating nested ranges and stats - sync_events = [evt for evt in self if not evt.is_async] + sync_events = [evt for evt in self if not evt.is_async and evt.device_type == 0] events = sorted( sync_events, key=attrgetter("thread"), @@ -774,7 +774,7 @@ class FunctionEvent(FormattedTimesMixin): def __init__( self, id, name, thread, start_us, end_us, fwd_thread=None, input_shapes=None, stack=None, scope=0, cpu_memory_usage=0, cuda_memory_usage=0, is_async=False, - is_remote=False, sequence_nr=-1, node_id=0): + is_remote=False, sequence_nr=-1, node_id=0, device_type=0): self.id: int = id self.node_id: int = node_id self.name: str = name @@ -793,8 +793,10 @@ def __init__( self.is_async: bool = is_async self.is_remote: bool = is_remote self.sequence_nr: int = sequence_nr + self.device_type: int = device_type def append_kernel(self, name, device, start, end): + assert self.device_type == 0 # CPU self.kernels.append(Kernel(name, device, Interval(start, end))) def append_cpu_child(self, child): @@ -803,7 +805,9 @@ def append_cpu_child(self, child): One is supposed to append only direct children to the event to have correct self cpu time being reported. """ + assert(self.device_type == 0) # CPU assert(isinstance(child, FunctionEvent)) + assert(child.device_type == 0) self.cpu_children.append(child) def set_cpu_parent(self, parent): @@ -813,14 +817,16 @@ def set_cpu_parent(self, parent): the child's range interval is completely inside the parent's. We use this connection to determine the event is from top-level op or not. 
""" + assert(self.device_type == 0) # CPU assert(isinstance(parent, FunctionEvent)) + assert(parent.device_type == 0) self.cpu_parent = parent # Note: async events don't have children, are not used when computing 'self' # metrics of other events, have only total cpu time @property def self_cpu_memory_usage(self): - if self.is_async: + if self.is_async or self.device_type != 0: # CPU return 0 return self.cpu_memory_usage - sum( [child.cpu_memory_usage for child in self.cpu_children] @@ -828,7 +834,7 @@ def self_cpu_memory_usage(self): @property def self_cuda_memory_usage(self): - if self.is_async: + if self.is_async or self.device_type != 0: # CPU return 0 return self.cuda_memory_usage - sum( [child.cuda_memory_usage for child in self.cpu_children] @@ -836,7 +842,7 @@ def self_cuda_memory_usage(self): @property def self_cpu_time_total(self): - if self.is_async: + if self.is_async or self.device_type != 0: return 0 return self.cpu_time_total - sum( [child.cpu_time_total for child in self.cpu_children] @@ -844,16 +850,31 @@ def self_cpu_time_total(self): @property def cuda_time_total(self): - return sum(kinfo.interval.elapsed_us() for kinfo in self.kernels) + if self.is_async: + return 0 + if self.device_type == 0: # CPU + return sum(kinfo.interval.elapsed_us() for kinfo in self.kernels) + else: + assert self.device_type == 1 # CUDA + return self.time_range.elapsed_us() @property def self_cuda_time_total(self): - return sum(kinfo.interval.elapsed_us() for kinfo in self.kernels) - \ - sum([child.cuda_time_total for child in self.cpu_children]) + if self.is_async: + return 0 + if self.device_type == 0: # CPU + return self.cuda_time_total - \ + sum([child.cuda_time_total for child in self.cpu_children]) + else: + assert(self.device_type == 1) # CUDA + return self.cuda_time_total @property def cpu_time_total(self): - return self.time_range.elapsed_us() + if self.device_type == 0: # CPU + return self.time_range.elapsed_us() + else: + return 0 @property def key(self): @@ -861,10 +882,12 @@ def key(self): def __repr__(self): return ( - ''.format( self.id, + self.name, + self.device_type, self.node_id, self.cpu_time_str, self.time_range.start, @@ -1008,22 +1031,46 @@ def parse_kineto_results(result): string_table = StringTable() function_events = [] for kineto_event in result.events(): - fe_start_us = kineto_event.start_us() - start_record.start_us() + rel_start_us = kineto_event.start_us() - start_record.start_us() + rel_end_us = rel_start_us + kineto_event.duration_us() + abs_end_us = kineto_event.start_us() + kineto_event.duration_us() + + cpu_memory_usage = 0 + cuda_memory_usage = 0 + if kineto_event.device_type() == 0: # CPU + # find the corresponding memory allocation events + for mem_record in mem_records: + if (mem_record.start_us() >= kineto_event.start_us() and + mem_record.start_us() <= abs_end_us): + cpu_memory_usage += mem_record.cpu_memory_usage() + cuda_memory_usage += mem_record.cuda_memory_usage() + is_async = kineto_event.start_thread_id() != kineto_event.end_thread_id() fe = FunctionEvent( id=kineto_event.correlation_id(), name=string_table[kineto_event.name()], thread=kineto_event.start_thread_id(), - start_us=fe_start_us, - end_us=fe_start_us + kineto_event.duration_us(), + start_us=rel_start_us, + end_us=rel_end_us, fwd_thread=kineto_event.fwd_thread_id(), input_shapes=kineto_event.shapes(), stack=[entry for entry in kineto_event.stack() if filter_stack_entry(entry)], scope=kineto_event.scope(), - #cpu_memory_usage=cpu_memory_usage, - #cuda_memory_usage=cuda_memory_usage, - 
is_async=kineto_event.start_thread_id() != kineto_event.end_thread_id(), + cpu_memory_usage=cpu_memory_usage, + cuda_memory_usage=cuda_memory_usage, + is_async=is_async, sequence_nr=kineto_event.sequence_nr(), + device_type=kineto_event.device_type(), ) + # associate CUDA kernels with a CPU event + if kineto_event.device_type() == 0 and not is_async: + for evt in result.events(): + if evt.device_type == 1: # CUDA + if evt.correlation_id == kineto_event.correlation_id: + fe.append_kernel( + evt.name(), + evt.device_index(), + evt.start_us(), + evt.start_us() + evt.duration_us()) function_events.append(fe) function_events.sort(key=lambda evt: [evt.time_range.start, -evt.time_range.end]) return function_events @@ -1132,6 +1179,7 @@ def adjusted_time(cuda_record, cuda_records_map): is_async=is_async, is_remote=is_remote_event, sequence_nr=start.sequence_nr(), + device_type=0, # CPU ) # note: async events have only cpu total time if not is_async and start.has_cuda(): diff --git a/torch/csrc/autograd/init.cpp b/torch/csrc/autograd/init.cpp index 6afdab010b76..b187a4f8524c 100644 --- a/torch/csrc/autograd/init.cpp +++ b/torch/csrc/autograd/init.cpp @@ -127,7 +127,9 @@ PyObject* THPAutograd_initExtension(PyObject* _unused, PyObject *unused) { // for CUDA - stream id, for CPU - start thread id .def("device_resource_id", &KinetoEvent::deviceResourceId) // device type, currently: CPU or CUDA - .def("device_type", &KinetoEvent::deviceType); + .def("device_type", [](const KinetoEvent& e) { + return (uint8_t)e.deviceType(); + }); py::class_(m, "ProfilerResult") .def("events", [](const ProfilerResultWrapper& r) { From 7c317f5cd49b980c321ce36ec73bcc936051c316 Mon Sep 17 00:00:00 2001 From: ilia-cher Date: Wed, 11 Nov 2020 17:13:26 -0800 Subject: [PATCH 48/59] Update on "Use libkineto in profiler" Summary: Adding ability to use Kineto (CUPTI) to profile CUDA kernels Test Plan: USE_KINETO=1 USE_CUDA=1 USE_MKLDNN=1 BLAS=MKL BUILD_BINARY=1 python setup.py develop install python test/test_profiler.py ``` ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ sgemm_32x32x32_NN 0.00% 0.000us 0.00% 0.000us 0.000us 12.000us 63.16% 12.000us 12.000us 1 void at::native::vectorized_elementwise_kernel<4, at... 
0.00% 0.000us 0.00% 0.000us 0.000us 2.750us 14.47% 2.750us 2.750us 1 Memcpy HtoD (Pagable -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 2.250us 11.84% 2.250us 2.250us 1 Memcpy DtoH (Device -> Pagable) 0.00% 0.000us 0.00% 0.000us 0.000us 2.000us 10.53% 2.000us 2.000us 1 aten::mm 25.87% 364.400ms 25.87% 364.426ms 364.426ms 0.000us 0.00% 0.000us 0.000us 1 aten::empty 0.00% 39.585us 0.00% 39.585us 19.792us 0.000us 0.00% 0.000us 0.000us 2 aten::stride 0.00% 3.363us 0.00% 3.363us 1.121us 0.000us 0.00% 0.000us 0.000us 3 aten::add 74.12% 1.044s 74.12% 1.044s 1.044s 0.000us 0.00% 0.000us 0.000us 1 aten::to 0.00% 13.155us 0.01% 116.398us 116.398us 0.000us 0.00% 0.000us 0.000us 1 aten::empty_strided 0.00% 30.365us 0.00% 30.365us 30.365us 0.000us 0.00% 0.000us 0.000us 1 aten::copy_ 0.01% 72.878us 0.01% 72.878us 72.878us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ``` [ghstack-poisoned] --- test/test_autograd.py | 57 ++++++++++++------------- test/test_profiler.py | 5 ++- torch/autograd/profiler.py | 30 ++++++------- torch/csrc/autograd/profiler_kineto.cpp | 1 + 4 files changed, 46 insertions(+), 47 deletions(-) diff --git a/test/test_autograd.py b/test/test_autograd.py index 177a9b4c7805..365c72fff471 100644 --- a/test/test_autograd.py +++ b/test/test_autograd.py @@ -33,7 +33,7 @@ suppress_warnings, slowTest, load_tests, random_symmetric_matrix, IS_WINDOWS, IS_MACOS, CudaMemoryLeakCheck) -from torch.autograd import Variable, Function, detect_anomaly +from torch.autograd import Variable, Function, detect_anomaly, kineto_available from torch.autograd.function import InplaceFunction from torch.testing import randn_like from torch.testing._internal.common_methods_invocations import (method_tests, @@ -2989,7 +2989,7 @@ def gen_matrices(p): https://github.com/pytorch/pytorch/issues/34086""") def test_profiler_tracing(self): t1, t2 = torch.ones(1), torch.ones(1) - with torch.autograd.profiler.profile() as prof: + with torch.autograd.profiler.profile(use_kineto=kineto_available()) as prof: torch.add(t1, t2) with tempfile.NamedTemporaryFile(mode="w+") as f: @@ -3004,7 +3004,7 @@ def test_profiler_tracing(self): device = torch.device("cuda:0") t1, t2 = torch.ones(1, device=device), torch.ones(1, device=device) - with torch.autograd.profiler.profile(use_cuda=True) as prof: + with torch.autograd.profiler.profile(use_cuda=True, use_kineto=kineto_available()) as prof: torch.add(t1, t2) with tempfile.NamedTemporaryFile(mode="w+") as f: @@ -3015,7 +3015,7 @@ def test_profiler_tracing(self): def test_profiler(self): x = torch.randn(10, 10) - with profile() as p: + with profile(use_kineto=kineto_available()) as p: self.assertTrue(torch.autograd._profiler_enabled()) y = x * 2 + 4 @@ -3026,17 +3026,14 @@ def test_profiler(self): 'aten::empty', 'aten::add', 'aten::to', 'aten::empty_strided', 'aten::copy_', 'aten::empty'] top_level_names = ['aten::mul', 'aten::add'] - top_level_iter = iter(top_level_names) - self.assertEqual(len(p.function_events), len(names)) - for info, expected_name in zip(p.function_events, names): - if info.cpu_interval.start > last_end: - top_level_name_expected = next(top_level_iter) - self.assertEqual(info.name, top_level_name_expected) - last_end = info.cpu_interval.end - self.assertEqual(info.name, expected_name) + for evt in p.function_events: + if evt.time_range.start > last_end: + self.assertTrue(evt.name in 
+            last_end = evt.time_range.end
+            self.assertTrue(evt.name in names)
 
     def test_profiler_seq_nr(self):
-        with profile() as p:
+        with profile(use_kineto=kineto_available()) as p:
             x = torch.randn(10, 10, requires_grad=True)
             y = torch.randn(10, 10, requires_grad=True)
             z = x + y
@@ -3084,7 +3081,7 @@ def test_profiler_seq_nr(self):
 
     def test_profiler_unboxed_only(self):
         x = torch.rand(3, 4)
-        with torch.autograd.profiler.profile() as prof:
+        with torch.autograd.profiler.profile(use_kineto=kineto_available()) as prof:
             x.resize_([3, 2])
 
     def test_profiler_propagation(self):
@@ -3109,7 +3106,7 @@ def bar(x):
 
         traced_bar = torch.jit.trace(bar, x)
 
-        with profile() as p:
+        with profile(use_kineto=kineto_available()) as p:
             traced_bar(x)
 
         found_foo = False
@@ -3131,7 +3128,7 @@ def bar(x):
 
     def test_record_function_callbacks(self):
         x = torch.randn(10, 10)
-        with profile() as p:
+        with profile(use_kineto=kineto_available()) as p:
             with record_function("foo"):
                 y = x * 2 + 4
@@ -3163,8 +3160,8 @@ def get_id():
                     node_id=0,
                     name="",
                     thread=thread,
-                    cpu_start=range[0],
-                    cpu_end=range[1],
+                    start_us=range[0],
+                    end_us=range[1],
                 )
             )
 
@@ -3187,7 +3184,7 @@ def test_profiler_aggregation_table(self):
         """
         x = torch.randn(1024)
-        with torch.autograd.profiler.profile() as prof:
+        with torch.autograd.profiler.profile(use_kineto=kineto_available()) as prof:
             torch.einsum("i->", x)
 
         prof_str = str(prof)
@@ -3197,8 +3194,8 @@ def test_profiler_function_event_avg(self):
         avg = FunctionEventAvg()
-        avg.add(FunctionEvent(id=0, node_id=0, name="foo", thread=0, cpu_start=10, cpu_end=15))
-        avg.add(FunctionEvent(id=1, node_id=0, name="foo", thread=0, cpu_start=20, cpu_end=30))
+        avg.add(FunctionEvent(id=0, node_id=0, name="foo", thread=0, start_us=10, end_us=15))
+        avg.add(FunctionEvent(id=1, node_id=0, name="foo", thread=0, start_us=20, end_us=30))
         avg.add(avg)
 
         self.assertEqual(avg.key, "foo")
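The constructor rename above (cpu_start/cpu_end to start_us/end_us) feeds the same elapsed-time bookkeeping the test exercises. Roughly, in a simplified stand-in for FunctionEventAvg (a sketch, not the real class):

```
from dataclasses import dataclass
from typing import Optional

@dataclass
class EventAvgSketch:      # simplified stand-in for FunctionEventAvg
    key: Optional[str] = None
    count: int = 0
    cpu_time_total_us: int = 0

    def add(self, name: str, start_us: int, end_us: int) -> "EventAvgSketch":
        # all added events must share one key, as the assertions above expect
        assert self.key is None or self.key == name
        self.key = name
        self.count += 1
        self.cpu_time_total_us += end_us - start_us
        return self

avg = EventAvgSketch().add("foo", 10, 15).add("foo", 20, 30)
assert (avg.key, avg.count, avg.cpu_time_total_us) == ("foo", 2, 15)
```

The real FunctionEventAvg additionally merges CUDA times, memory counters, and whole FunctionEventAvg instances, which is what the avg.add(avg) line in the test exercises.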
@@ -3217,7 +3214,7 @@ def test_profiler_shapes(self):
         layer1 = torch.nn.Linear(20, 30)
         layer2 = torch.nn.Linear(30, 40)
         input = torch.randn(128, 20)
-        with profile(record_shapes=True) as prof:
+        with profile(record_shapes=True, use_kineto=kineto_available()) as prof:
             layer2(layer1(input))
 
         print(prof.function_events)
@@ -3233,18 +3230,18 @@ def test_profiler_shapes(self):
         last_end = 0
         for event in prof.function_events:
-            if event.cpu_interval.start > last_end:
+            if event.time_range.start > last_end:
                 name_expected, input_shape_expected = next(expected_iter)
                 if name_expected is not None:
                     self.assertEqual(event.name, name_expected)
                 self.assertEqual(event.input_shapes, input_shape_expected)
-                last_end = event.cpu_interval.end
+                last_end = event.time_range.end
 
     def test_profiler_no_cuda(self):
         print("")
         layer = torch.nn.Linear(20, 30)
         x = torch.randn(128, 20)
-        with profile(use_cuda=False) as prof:
+        with profile(use_cuda=False, use_kineto=kineto_available()) as prof:
             layer(x)
 
         prof_str = str(prof)
@@ -3256,7 +3253,7 @@ def test_profiler_aggregation_lstm(self):
         print("")
         rnn = torch.nn.LSTM(10, 20, 2)
         total_time_s = 0
-        with profile(record_shapes=True) as prof:
+        with profile(record_shapes=True, use_kineto=kineto_available()) as prof:
             for i in range(20):
                 input = torch.randn(5, 3, 10)
                 h = torch.randn(2, 3, 20)
@@ -3293,7 +3290,7 @@ def test_memory_profiler(self):
         def run_profiler(tensor_creation_fn, metric):
             # collecting allocs / deallocs
-            with profile(profile_memory=True, record_shapes=True) as prof:
+            with profile(profile_memory=True, record_shapes=True, use_kineto=kineto_available()) as prof:
                 x = None
                 with record_function("test_user_scope_alloc"):
                     x = tensor_creation_fn()
@@ -3385,7 +3382,7 @@ def create_mkldnn_tensor():
         # check partial overlap of tensor allocation with memory profiler
         x = torch.rand(10, 10)
-        with profile(profile_memory=True, record_shapes=True, use_kineto=kineto_available()) as prof:
+        with profile(profile_memory=True, record_shapes=True, use_kineto=kineto_available()) as prof:
             del x
             x = torch.rand(10, 10)
             del x
@@ -3411,7 +3408,7 @@ def forward(x):
 
         forward(x)
 
-        with profile() as p:
+        with profile(use_kineto=kineto_available()) as p:
             forward(x)
 
         events = p.function_events
@@ -3436,7 +3433,7 @@ def forward(x):
         def f(x, y):
             return x + y
 
-        with profile() as p:
+        with profile(use_kineto=kineto_available()) as p:
             f(1, 2)
 
         self.assertTrue('my_func' in str(p))
diff --git a/test/test_profiler.py b/test/test_profiler.py
index 44973546429e..6d7618ca8a0b 100644
--- a/test/test_profiler.py
+++ b/test/test_profiler.py
@@ -7,6 +7,7 @@
 from torch.testing._internal.common_utils import (
     TestCase, run_tests, TEST_WITH_ASAN, IS_WINDOWS)
 from torch.autograd.profiler import profile
+from torch.autograd import kineto_available
 
 try:
     import psutil
@@ -73,7 +74,7 @@ def forward(self, x):
 
         mod = DummyModule()
 
-        with profile(with_stack=True) as p:
+        with profile(with_stack=True, use_kineto=kineto_available()) as p:
             x = torch.randn(10, 10, requires_grad=True)
             y = torch.randn(10, 10, requires_grad=True)
             z = x + y
@@ -99,7 +100,7 @@ def forward(self, x):
 
         torch._C._set_graph_executor_optimize(prev_opt)
 
-    @unittest.skipIf(not torch.autograd.kineto_available(), "Kineto is required")
+    @unittest.skipIf(not kineto_available(), "Kineto is required")
     @unittest.skipIf(not torch.cuda.is_available(), "CUDA is required")
     def test_kineto(self):
         x = torch.randn(10, 10).cuda()
diff --git a/torch/autograd/profiler.py b/torch/autograd/profiler.py
index 24b88c41865e..04858f94deca 100644
--- a/torch/autograd/profiler.py
+++ b/torch/autograd/profiler.py
@@ -6,7 +6,7 @@
 from collections import defaultdict, namedtuple
 from operator import attrgetter
 
-from typing import List, Dict, Tuple, Optional
+from typing import List, Tuple, Optional
 
 try:
     # Available in Python >= 3.2
@@ -120,7 +120,7 @@ def set_backward_stacktraces(self):
         def bw_parent(evt):
             if evt is None:
                 return None
-            elif evt.scope == 1: # BACKWARD_FUNCTION
+            elif evt.scope == 1:  # BACKWARD_FUNCTION
                 return evt
             else:
                 return bw_parent(evt.cpu_parent)
@@ -671,7 +671,7 @@ def __enter__(self):
             raise RuntimeError("NVTX annotation context manager is not reentrant")
         self.entered = True
         torch.cuda.synchronize()
-        torch.autograd._enable_profiler(
+        torch.autograd._enable_profiler_legacy(
             torch.autograd.ProfilerConfig(
                 torch.autograd.ProfilerState.NVTX,
                 self.record_shapes,
@@ -684,7 +684,7 @@ def __exit__(self, exc_type, exc_val, exc_tb):
         if not self.enabled:
             return
         torch.cuda.synchronize()
-        torch.autograd._disable_profiler()
+        torch.autograd._disable_profiler_legacy()
         return False
 
 
@@ -796,7 +796,7 @@ def __init__(
         self.device_type: int = device_type
 
     def append_kernel(self, name, device, start, end):
-        assert self.device_type == 0 # CPU
+        assert self.device_type == 0  # CPU
         self.kernels.append(Kernel(name, device, Interval(start, end)))
 
     def append_cpu_child(self, child):
@@ -805,7 +805,7 @@ def append_cpu_child(self, child):
         One is supposed to append only direct children to the event to have
        correct self cpu time being reported.
""" - assert(self.device_type == 0) # CPU + assert(self.device_type == 0) # CPU assert(isinstance(child, FunctionEvent)) assert(child.device_type == 0) self.cpu_children.append(child) @@ -817,7 +817,7 @@ def set_cpu_parent(self, parent): the child's range interval is completely inside the parent's. We use this connection to determine the event is from top-level op or not. """ - assert(self.device_type == 0) # CPU + assert(self.device_type == 0) # CPU assert(isinstance(parent, FunctionEvent)) assert(parent.device_type == 0) self.cpu_parent = parent @@ -826,7 +826,7 @@ def set_cpu_parent(self, parent): # metrics of other events, have only total cpu time @property def self_cpu_memory_usage(self): - if self.is_async or self.device_type != 0: # CPU + if self.is_async or self.device_type != 0: # CPU return 0 return self.cpu_memory_usage - sum( [child.cpu_memory_usage for child in self.cpu_children] @@ -834,7 +834,7 @@ def self_cpu_memory_usage(self): @property def self_cuda_memory_usage(self): - if self.is_async or self.device_type != 0: # CPU + if self.is_async or self.device_type != 0: # CPU return 0 return self.cuda_memory_usage - sum( [child.cuda_memory_usage for child in self.cpu_children] @@ -852,26 +852,26 @@ def self_cpu_time_total(self): def cuda_time_total(self): if self.is_async: return 0 - if self.device_type == 0: # CPU + if self.device_type == 0: # CPU return sum(kinfo.interval.elapsed_us() for kinfo in self.kernels) else: - assert self.device_type == 1 # CUDA + assert self.device_type == 1 # CUDA return self.time_range.elapsed_us() @property def self_cuda_time_total(self): if self.is_async: return 0 - if self.device_type == 0: # CPU + if self.device_type == 0: # CPU return self.cuda_time_total - \ sum([child.cuda_time_total for child in self.cpu_children]) else: - assert(self.device_type == 1) # CUDA + assert(self.device_type == 1) # CUDA return self.cuda_time_total @property def cpu_time_total(self): - if self.device_type == 0: # CPU + if self.device_type == 0: # CPU return self.time_range.elapsed_us() else: return 0 @@ -1179,7 +1179,7 @@ def adjusted_time(cuda_record, cuda_records_map): is_async=is_async, is_remote=is_remote_event, sequence_nr=start.sequence_nr(), - device_type=0, # CPU + device_type=0, # CPU ) # note: async events have only cpu total time if not is_async and start.has_cuda(): diff --git a/torch/csrc/autograd/profiler_kineto.cpp b/torch/csrc/autograd/profiler_kineto.cpp index 721a698bacaf..9c1340a9a7e4 100644 --- a/torch/csrc/autograd/profiler_kineto.cpp +++ b/torch/csrc/autograd/profiler_kineto.cpp @@ -193,6 +193,7 @@ void prepareProfiler( libkineto::ActivityType::GPU_MEMCPY, libkineto::ActivityType::GPU_MEMSET, libkineto::ActivityType::CONCURRENT_KERNEL, + // also including CUDA_RUNTIME libkineto::ActivityType::CUDA_RUNTIME, }; From c904443bde4dc227e1aa0ca13b5aab6ed7d62c32 Mon Sep 17 00:00:00 2001 From: ilia-cher Date: Wed, 11 Nov 2020 19:15:38 -0800 Subject: [PATCH 49/59] Update on "Use libkineto in profiler" Summary: Adding ability to use Kineto (CUPTI) to profile CUDA kernels Test Plan: USE_KINETO=1 USE_CUDA=1 USE_MKLDNN=1 BLAS=MKL BUILD_BINARY=1 python setup.py develop install python test/test_profiler.py python test/test_autograd.py -k test_profile python test/test_autograd.py -k test_record ``` ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self 
CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ sgemm_32x32x32_NN 0.00% 0.000us 0.00% 0.000us 0.000us 12.000us 63.16% 12.000us 12.000us 1 void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 2.750us 14.47% 2.750us 2.750us 1 Memcpy HtoD (Pagable -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 2.250us 11.84% 2.250us 2.250us 1 Memcpy DtoH (Device -> Pagable) 0.00% 0.000us 0.00% 0.000us 0.000us 2.000us 10.53% 2.000us 2.000us 1 aten::mm 25.87% 364.400ms 25.87% 364.426ms 364.426ms 0.000us 0.00% 0.000us 0.000us 1 aten::empty 0.00% 39.585us 0.00% 39.585us 19.792us 0.000us 0.00% 0.000us 0.000us 2 aten::stride 0.00% 3.363us 0.00% 3.363us 1.121us 0.000us 0.00% 0.000us 0.000us 3 aten::add 74.12% 1.044s 74.12% 1.044s 1.044s 0.000us 0.00% 0.000us 0.000us 1 aten::to 0.00% 13.155us 0.01% 116.398us 116.398us 0.000us 0.00% 0.000us 0.000us 1 aten::empty_strided 0.00% 30.365us 0.00% 30.365us 30.365us 0.000us 0.00% 0.000us 0.000us 1 aten::copy_ 0.01% 72.878us 0.01% 72.878us 72.878us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls Node ID ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ sgemm_32x32x32_NN 0.00% 0.000us 0.00% 0.000us 0.000us 11.000us 64.71% 11.000us 11.000us 1 0 void at::native::vectorized_elementwise_kernel<4, at... 
0.00% 0.000us 0.00% 0.000us 0.000us 3.000us 17.65% 3.000us 3.000us 1 0 Memcpy DtoH (Device -> Pageable) 0.00% 0.000us 0.00% 0.000us 0.000us 2.000us 11.76% 2.000us 2.000us 1 0 Memcpy HtoD (Pageable -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 1.000us 5.88% 1.000us 1.000us 1 0 aten::mm 13.86% 421.014ms 27.73% 842.019ms 421.010ms 0.000us 0.00% 0.000us 0.000us 2 0 aten::empty 0.00% 25.000us 0.00% 25.000us 12.500us 0.000us 0.00% 0.000us 0.000us 2 0 aten::stride 0.00% 0.000us 0.00% 0.000us 0.000us 0.000us 0.00% 0.000us 0.000us 3 0 aten::add 36.55% 1.110s 73.11% 2.220s 1.110s 0.000us 0.00% 0.000us 0.000us 2 0 aten::to 0.00% 9.000us 0.00% 99.000us 99.000us 0.000us 0.00% 0.000us 0.000us 1 0 aten::empty_strided 0.00% 21.000us 0.00% 21.000us 21.000us 0.000us 0.00% 0.000us 0.000us 1 0 aten::copy_ 0.00% 69.000us 0.00% 133.000us 66.500us 0.000us 0.00% 0.000us 0.000us 2 0 cudaFree 13.00% 394.907ms 13.00% 394.907ms 394.907ms 0.000us 0.00% 0.000us 0.000us 1 0 cudaDeviceGetAttribute 0.00% 1.000us 0.00% 1.000us 0.091us 0.000us 0.00% 0.000us 0.000us 11 0 cudaMalloc 0.02% 632.000us 0.02% 632.000us 210.667us 0.000us 0.00% 0.000us 0.000us 3 0 cudaMemcpy 0.00% 20.000us 0.00% 20.000us 20.000us 0.000us 0.00% 0.000us 0.000us 1 0 cudaEventCreateWithFlags 0.00% 9.000us 0.00% 9.000us 0.562us 0.000us 0.00% 0.000us 0.000us 16 0 cudaLaunchKernel 36.55% 1.110s 36.55% 1.110s 555.021ms 0.000us 0.00% 0.000us 0.000us 2 0 cudaMemcpyAsync 0.00% 33.000us 0.00% 33.000us 33.000us 0.000us 0.00% 0.000us 0.000us 1 0 cudaStreamSynchronize 0.00% 4.000us 0.00% 4.000us 4.000us 0.000us 0.00% 0.000us 0.000us 1 0 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ``` [ghstack-poisoned] --- torch/autograd/profiler.py | 24 +++++++++++++++--------- 1 file changed, 15 insertions(+), 9 deletions(-) diff --git a/torch/autograd/profiler.py b/torch/autograd/profiler.py index 04858f94deca..fb6c8dfc0bb2 100644 --- a/torch/autograd/profiler.py +++ b/torch/autograd/profiler.py @@ -1027,6 +1027,13 @@ def parse_kineto_results(result): mem_records.append(record) assert start_record is not None, "Invalid profiler output, __start_profile is missing" + cuda_corr_map = {} + for kineto_event in result.events(): + if kineto_event.device_type() == 1: # CUDA + if kineto_event.correlation_id() not in cuda_corr_map: + cuda_corr_map[kineto_event.correlation_id()] = [] + cuda_corr_map[kineto_event.correlation_id()].append(kineto_event) + # Create and return FunctionEvent list string_table = StringTable() function_events = [] @@ -1062,15 +1069,14 @@ def parse_kineto_results(result): device_type=kineto_event.device_type(), ) # associate CUDA kernels with a CPU event - if kineto_event.device_type() == 0 and not is_async: - for evt in result.events(): - if evt.device_type == 1: # CUDA - if evt.correlation_id == kineto_event.correlation_id: - fe.append_kernel( - evt.name(), - evt.device_index(), - evt.start_us(), - evt.start_us() + evt.duration_us()) + if (kineto_event.device_type() == 0 and not is_async and + kineto_event.correlation_id() in cuda_corr_map): + for evt in cuda_corr_map[kineto_event.correlation_id()]: + fe.append_kernel( + evt.name(), + evt.device_index(), + evt.start_us(), + evt.start_us() + evt.duration_us()) function_events.append(fe) function_events.sort(key=lambda evt: [evt.time_range.start, -evt.time_range.end]) return function_events From 1f600f84fac69e36666fb4d04b2e06d31628c9ca Mon Sep 17 00:00:00 2001 
From: ilia-cher Date: Wed, 11 Nov 2020 23:26:04 -0800 Subject: [PATCH 50/59] Update on "Use libkineto in profiler" Summary: Adding ability to use Kineto (CUPTI) to profile CUDA kernels Test Plan: USE_KINETO=1 USE_CUDA=1 USE_MKLDNN=1 BLAS=MKL BUILD_BINARY=1 python setup.py develop install python test/test_profiler.py python test/test_autograd.py -k test_profile python test/test_autograd.py -k test_record ``` ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ sgemm_32x32x32_NN 0.00% 0.000us 0.00% 0.000us 0.000us 12.000us 63.16% 12.000us 12.000us 1 void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 2.750us 14.47% 2.750us 2.750us 1 Memcpy HtoD (Pagable -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 2.250us 11.84% 2.250us 2.250us 1 Memcpy DtoH (Device -> Pagable) 0.00% 0.000us 0.00% 0.000us 0.000us 2.000us 10.53% 2.000us 2.000us 1 aten::mm 25.87% 364.400ms 25.87% 364.426ms 364.426ms 0.000us 0.00% 0.000us 0.000us 1 aten::empty 0.00% 39.585us 0.00% 39.585us 19.792us 0.000us 0.00% 0.000us 0.000us 2 aten::stride 0.00% 3.363us 0.00% 3.363us 1.121us 0.000us 0.00% 0.000us 0.000us 3 aten::add 74.12% 1.044s 74.12% 1.044s 1.044s 0.000us 0.00% 0.000us 0.000us 1 aten::to 0.00% 13.155us 0.01% 116.398us 116.398us 0.000us 0.00% 0.000us 0.000us 1 aten::empty_strided 0.00% 30.365us 0.00% 30.365us 30.365us 0.000us 0.00% 0.000us 0.000us 1 aten::copy_ 0.01% 72.878us 0.01% 72.878us 72.878us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls Node ID ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ sgemm_32x32x32_NN 0.00% 0.000us 0.00% 0.000us 0.000us 11.000us 64.71% 11.000us 11.000us 1 0 void at::native::vectorized_elementwise_kernel<4, at... 
0.00% 0.000us 0.00% 0.000us 0.000us 3.000us 17.65% 3.000us 3.000us 1 0 Memcpy DtoH (Device -> Pageable) 0.00% 0.000us 0.00% 0.000us 0.000us 2.000us 11.76% 2.000us 2.000us 1 0 Memcpy HtoD (Pageable -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 1.000us 5.88% 1.000us 1.000us 1 0 aten::mm 13.86% 421.014ms 27.73% 842.019ms 421.010ms 0.000us 0.00% 0.000us 0.000us 2 0 aten::empty 0.00% 25.000us 0.00% 25.000us 12.500us 0.000us 0.00% 0.000us 0.000us 2 0 aten::stride 0.00% 0.000us 0.00% 0.000us 0.000us 0.000us 0.00% 0.000us 0.000us 3 0 aten::add 36.55% 1.110s 73.11% 2.220s 1.110s 0.000us 0.00% 0.000us 0.000us 2 0 aten::to 0.00% 9.000us 0.00% 99.000us 99.000us 0.000us 0.00% 0.000us 0.000us 1 0 aten::empty_strided 0.00% 21.000us 0.00% 21.000us 21.000us 0.000us 0.00% 0.000us 0.000us 1 0 aten::copy_ 0.00% 69.000us 0.00% 133.000us 66.500us 0.000us 0.00% 0.000us 0.000us 2 0 cudaFree 13.00% 394.907ms 13.00% 394.907ms 394.907ms 0.000us 0.00% 0.000us 0.000us 1 0 cudaDeviceGetAttribute 0.00% 1.000us 0.00% 1.000us 0.091us 0.000us 0.00% 0.000us 0.000us 11 0 cudaMalloc 0.02% 632.000us 0.02% 632.000us 210.667us 0.000us 0.00% 0.000us 0.000us 3 0 cudaMemcpy 0.00% 20.000us 0.00% 20.000us 20.000us 0.000us 0.00% 0.000us 0.000us 1 0 cudaEventCreateWithFlags 0.00% 9.000us 0.00% 9.000us 0.562us 0.000us 0.00% 0.000us 0.000us 16 0 cudaLaunchKernel 36.55% 1.110s 36.55% 1.110s 555.021ms 0.000us 0.00% 0.000us 0.000us 2 0 cudaMemcpyAsync 0.00% 33.000us 0.00% 33.000us 33.000us 0.000us 0.00% 0.000us 0.000us 1 0 cudaStreamSynchronize 0.00% 4.000us 0.00% 4.000us 4.000us 0.000us 0.00% 0.000us 0.000us 1 0 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ``` [ghstack-poisoned] --- test/test_autograd.py | 2 + torch/autograd/profiler.py | 90 ++++++++++++++++----------- torch/csrc/autograd/profiler_kineto.h | 4 +- 3 files changed, 57 insertions(+), 39 deletions(-) diff --git a/test/test_autograd.py b/test/test_autograd.py index 365c72fff471..e3e0c4f5fa7d 100644 --- a/test/test_autograd.py +++ b/test/test_autograd.py @@ -3039,6 +3039,8 @@ def test_profiler_seq_nr(self): z = x + y s = z.sum() s.backward() + print(p.key_averages().table( + sort_by="self_cpu_time_total", row_limit=-1)) # expecting aten::add, aten::sum to have the sequence numbers, # expecting the corresponding backward nodes to have the same numbers # as the forward ops diff --git a/torch/autograd/profiler.py b/torch/autograd/profiler.py index fb6c8dfc0bb2..db6abfabccc8 100644 --- a/torch/autograd/profiler.py +++ b/torch/autograd/profiler.py @@ -37,14 +37,57 @@ def __init__(self, *args, **kwargs): use_cuda = kwargs.pop('use_cuda', True) profile_memory = kwargs.pop('profile_memory', False) super(EventList, self).__init__(*args, **kwargs) - self._cpu_children_populated = False self._use_cuda = use_cuda self._profile_memory = profile_memory + self._tree_built = False + + def build_tree(self): + self._populate_cpu_children() + self._remove_dup_nodes() + self._set_kernels() + self._set_backward_stacktraces() + self._tree_built = True def __str__(self): return self.table() - def populate_cpu_children(self): + def _remove_dup_nodes(self): + while True: + to_delete = [] + for idx in range(len(self)): + if (self[idx].cpu_parent is not None and + self[idx].cpu_parent.name == self[idx].name and + len(self[idx].cpu_parent.cpu_children) == 1): + self[idx].cpu_parent.cpu_children += self[idx].cpu_children + for ch in 
self[idx].cpu_children:
+                        ch.cpu_parent = self[idx].cpu_parent
+                    to_delete.append(idx)
+            if len(to_delete) == 0:
+                break
+            new_evts = [ev for ind, ev in enumerate(self) if ind not in to_delete]
+            self.clear()
+            self.extend(new_evts)
+
+    def _set_kernels(self):
+        # associate CUDA kernels with CPU events
+        cuda_corr_map = {}
+        for evt in self:
+            if evt.device_type == 1:  # CUDA
+                if evt.id not in cuda_corr_map:
+                    cuda_corr_map[evt.id] = []
+                cuda_corr_map[evt.id].append(evt)
+
+        for evt in self:
+            if (evt.device_type == 0 and not evt.is_async and
+                    evt.id in cuda_corr_map):
+                for k_evt in cuda_corr_map[evt.id]:
+                    evt.append_kernel(
+                        k_evt.name(),
+                        k_evt.device_index(),
+                        k_evt.start_us(),
+                        k_evt.start_us() + k_evt.duration_us())
+
     def _populate_cpu_children(self):
         """Populates child events into each underlying FunctionEvent object.
         One event is a child of another if [s1, e1) is inside [s2, e2). Where
         s1 and e1 would be start and end of the child event's interval. And
@@ -56,8 +99,6 @@ def populate_cpu_children(self):
         If for any reason two intervals intersect only partially, this function
         will not record a parent child relationship between then.
         """
-        if self.cpu_children_populated:
-            return
 
         # Some events can be async (i.e. start and end on different threads),
         # since it's generally undefined how to attribute children ranges to
@@ -112,11 +153,7 @@ def populate_cpu_children(self):
 
                 current_events.append(event)
 
-        self._cpu_children_populated = True
-
-    def set_backward_stacktraces(self):
-        self.populate_cpu_children()
-
+    def _set_backward_stacktraces(self):
         def bw_parent(evt):
             if evt is None:
                 return None
@@ -127,7 +164,7 @@ def bw_parent(evt):
 
         fwd_stacks = {}
         for evt in self:
-            if bw_parent(evt) is None:
+            if bw_parent(evt) is None and evt.stack is not None:
                 t = (evt.sequence_nr, evt.thread)
                 if t not in fwd_stacks:
                     fwd_stacks[t] = evt.stack
@@ -142,15 +179,10 @@ def bw_parent(evt):
             else:
                 evt.stack = []
 
-    @property
     def self_cpu_time_total(self):
         return sum([event.self_cpu_time_total for event in self])
 
-    @property
-    def cpu_children_populated(self):
-        return self._cpu_children_populated
-
     def table(self, sort_by=None, row_limit=100, max_src_column_width=75, header=None, top_level_events_only=False):
         """Prints an EventList as a nicely formatted table.
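The nesting rule that _populate_cpu_children applies can be sketched independently of the profiler: sort events by (start, -end) and keep a stack of currently open intervals. Below is a minimal version assuming well-nested, same-thread intervals with unique names; the real implementation additionally buckets by thread, skips async events, and (as the docstring above says) does not record partially overlapping pairs:

```
def populate_children(events):
    """events: list of (name, start, end); returns {name: [child names]}."""
    children = {name: [] for name, _, _ in events}
    order = sorted(events, key=lambda e: (e[1], -e[2]))
    stack = []  # currently open (enclosing) intervals
    for ev in order:
        name, start, end = ev
        # close intervals that ended before this one starts
        while stack and stack[-1][2] <= start:
            stack.pop()
        if stack:
            # enclosed by the innermost open interval -> direct child
            children[stack[-1][0]].append(name)
        stack.append(ev)
    return children

evts = [("aten::add", 0, 100), ("aten::empty", 10, 20), ("aten::copy_", 30, 90)]
assert populate_children(evts) == {
    "aten::add": ["aten::empty", "aten::copy_"],
    "aten::empty": [],
    "aten::copy_": [],
}
```

Recording only direct children is what makes self CPU time (total minus children) come out right.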
""" - self.populate_cpu_children() + assert self._tree_built stats = defaultdict(FunctionEventAvg) def get_key(event, group_by_input_shapes, group_by_stack_n): @@ -442,8 +474,7 @@ def __exit__(self, exc_type, exc_val, exc_tb): parsed_results, use_cuda=self.use_cuda, profile_memory=self.profile_memory) - if self.with_stack: - self.function_events.set_backward_stacktraces() + self.function_events.build_tree() return False def __repr__(self): @@ -454,13 +485,11 @@ def __repr__(self): def __str__(self): if self.function_events is None: return '' - self.function_events.populate_cpu_children() return str(self.function_events) def _check_finish(self): if self.function_events is None: raise RuntimeError("can't export a trace that didn't finish running") - self.function_events.populate_cpu_children() def table(self, sort_by=None, row_limit=100, max_src_column_width=75, header=None, top_level_events_only=False): self._check_finish() @@ -774,7 +803,7 @@ class FunctionEvent(FormattedTimesMixin): def __init__( self, id, name, thread, start_us, end_us, fwd_thread=None, input_shapes=None, stack=None, scope=0, cpu_memory_usage=0, cuda_memory_usage=0, is_async=False, - is_remote=False, sequence_nr=-1, node_id=0, device_type=0): + is_remote=False, sequence_nr=-1, node_id=-1, device_type=0): self.id: int = id self.node_id: int = node_id self.name: str = name @@ -1027,17 +1056,12 @@ def parse_kineto_results(result): mem_records.append(record) assert start_record is not None, "Invalid profiler output, __start_profile is missing" - cuda_corr_map = {} - for kineto_event in result.events(): - if kineto_event.device_type() == 1: # CUDA - if kineto_event.correlation_id() not in cuda_corr_map: - cuda_corr_map[kineto_event.correlation_id()] = [] - cuda_corr_map[kineto_event.correlation_id()].append(kineto_event) - # Create and return FunctionEvent list string_table = StringTable() function_events = [] for kineto_event in result.events(): + if filter_name(kineto_event.name()): + continue rel_start_us = kineto_event.start_us() - start_record.start_us() rel_end_us = rel_start_us + kineto_event.duration_us() abs_end_us = kineto_event.start_us() + kineto_event.duration_us() @@ -1068,16 +1092,8 @@ def parse_kineto_results(result): sequence_nr=kineto_event.sequence_nr(), device_type=kineto_event.device_type(), ) - # associate CUDA kernels with a CPU event - if (kineto_event.device_type() == 0 and not is_async and - kineto_event.correlation_id() in cuda_corr_map): - for evt in cuda_corr_map[kineto_event.correlation_id()]: - fe.append_kernel( - evt.name(), - evt.device_index(), - evt.start_us(), - evt.start_us() + evt.duration_us()) function_events.append(fe) + function_events.sort(key=lambda evt: [evt.time_range.start, -evt.time_range.end]) return function_events diff --git a/torch/csrc/autograd/profiler_kineto.h b/torch/csrc/autograd/profiler_kineto.h index 998667da6d97..2c1c1974e9a8 100644 --- a/torch/csrc/autograd/profiler_kineto.h +++ b/torch/csrc/autograd/profiler_kineto.h @@ -97,7 +97,7 @@ struct TORCH_API KinetoEvent { } KinetoEvent& sequenceNr(int64_t sequence_nr) { - sequence_nr_ = sequence_nr_; + sequence_nr_ = sequence_nr; return *this; } @@ -149,7 +149,7 @@ struct TORCH_API KinetoEvent { uint64_t start_thread_id_ = 0; uint64_t end_thread_id_ = 0; uint64_t fwd_thread_id_ = 0; - int64_t sequence_nr_ = 0; + int64_t sequence_nr_ = -1; uint8_t scope_ = 0; uint8_t activity_type_; From 5aacc1ca50641a292b213d64ddd78a76b75a6722 Mon Sep 17 00:00:00 2001 From: ilia-cher Date: Wed, 11 Nov 2020 23:39:20 -0800 Subject: [PATCH 
51/59] Update on "Use libkineto in profiler" Summary: Adding ability to use Kineto (CUPTI) to profile CUDA kernels Test Plan: USE_KINETO=1 USE_CUDA=1 USE_MKLDNN=1 BLAS=MKL BUILD_BINARY=1 python setup.py develop install python test/test_profiler.py python test/test_autograd.py -k test_profile python test/test_autograd.py -k test_record ``` ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ sgemm_32x32x32_NN 0.00% 0.000us 0.00% 0.000us 0.000us 12.000us 63.16% 12.000us 12.000us 1 void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 2.750us 14.47% 2.750us 2.750us 1 Memcpy HtoD (Pagable -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 2.250us 11.84% 2.250us 2.250us 1 Memcpy DtoH (Device -> Pagable) 0.00% 0.000us 0.00% 0.000us 0.000us 2.000us 10.53% 2.000us 2.000us 1 aten::mm 25.87% 364.400ms 25.87% 364.426ms 364.426ms 0.000us 0.00% 0.000us 0.000us 1 aten::empty 0.00% 39.585us 0.00% 39.585us 19.792us 0.000us 0.00% 0.000us 0.000us 2 aten::stride 0.00% 3.363us 0.00% 3.363us 1.121us 0.000us 0.00% 0.000us 0.000us 3 aten::add 74.12% 1.044s 74.12% 1.044s 1.044s 0.000us 0.00% 0.000us 0.000us 1 aten::to 0.00% 13.155us 0.01% 116.398us 116.398us 0.000us 0.00% 0.000us 0.000us 1 aten::empty_strided 0.00% 30.365us 0.00% 30.365us 30.365us 0.000us 0.00% 0.000us 0.000us 1 aten::copy_ 0.01% 72.878us 0.01% 72.878us 72.878us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls Node ID ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ sgemm_32x32x32_NN 0.00% 0.000us 0.00% 0.000us 0.000us 11.000us 64.71% 11.000us 11.000us 1 0 void at::native::vectorized_elementwise_kernel<4, at... 
0.00% 0.000us 0.00% 0.000us 0.000us 3.000us 17.65% 3.000us 3.000us 1 0 Memcpy DtoH (Device -> Pageable) 0.00% 0.000us 0.00% 0.000us 0.000us 2.000us 11.76% 2.000us 2.000us 1 0 Memcpy HtoD (Pageable -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 1.000us 5.88% 1.000us 1.000us 1 0 aten::mm 13.86% 421.014ms 27.73% 842.019ms 421.010ms 0.000us 0.00% 0.000us 0.000us 2 0 aten::empty 0.00% 25.000us 0.00% 25.000us 12.500us 0.000us 0.00% 0.000us 0.000us 2 0 aten::stride 0.00% 0.000us 0.00% 0.000us 0.000us 0.000us 0.00% 0.000us 0.000us 3 0 aten::add 36.55% 1.110s 73.11% 2.220s 1.110s 0.000us 0.00% 0.000us 0.000us 2 0 aten::to 0.00% 9.000us 0.00% 99.000us 99.000us 0.000us 0.00% 0.000us 0.000us 1 0 aten::empty_strided 0.00% 21.000us 0.00% 21.000us 21.000us 0.000us 0.00% 0.000us 0.000us 1 0 aten::copy_ 0.00% 69.000us 0.00% 133.000us 66.500us 0.000us 0.00% 0.000us 0.000us 2 0 cudaFree 13.00% 394.907ms 13.00% 394.907ms 394.907ms 0.000us 0.00% 0.000us 0.000us 1 0 cudaDeviceGetAttribute 0.00% 1.000us 0.00% 1.000us 0.091us 0.000us 0.00% 0.000us 0.000us 11 0 cudaMalloc 0.02% 632.000us 0.02% 632.000us 210.667us 0.000us 0.00% 0.000us 0.000us 3 0 cudaMemcpy 0.00% 20.000us 0.00% 20.000us 20.000us 0.000us 0.00% 0.000us 0.000us 1 0 cudaEventCreateWithFlags 0.00% 9.000us 0.00% 9.000us 0.562us 0.000us 0.00% 0.000us 0.000us 16 0 cudaLaunchKernel 36.55% 1.110s 36.55% 1.110s 555.021ms 0.000us 0.00% 0.000us 0.000us 2 0 cudaMemcpyAsync 0.00% 33.000us 0.00% 33.000us 33.000us 0.000us 0.00% 0.000us 0.000us 1 0 cudaStreamSynchronize 0.00% 4.000us 0.00% 4.000us 4.000us 0.000us 0.00% 0.000us 0.000us 1 0 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ``` [ghstack-poisoned] --- torch/autograd/profiler.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/torch/autograd/profiler.py b/torch/autograd/profiler.py index db6abfabccc8..a37a08d5377a 100644 --- a/torch/autograd/profiler.py +++ b/torch/autograd/profiler.py @@ -82,10 +82,10 @@ def _set_kernels(self): evt.id in cuda_corr_map): for k_evt in cuda_corr_map[evt.id]: evt.append_kernel( - k_evt.name(), - k_evt.device_index(), - k_evt.start_us(), - k_evt.start_us() + k_evt.duration_us()) + k_evt.name, + k_evt.device_index, + k_evt.time_range.start, + k_evt.time_range.end) def _populate_cpu_children(self): """Populates child events into each underlying FunctionEvent object. 
@@ -803,7 +803,7 @@ class FunctionEvent(FormattedTimesMixin): def __init__( self, id, name, thread, start_us, end_us, fwd_thread=None, input_shapes=None, stack=None, scope=0, cpu_memory_usage=0, cuda_memory_usage=0, is_async=False, - is_remote=False, sequence_nr=-1, node_id=-1, device_type=0): + is_remote=False, sequence_nr=-1, node_id=-1, device_type=0, device_index=0): self.id: int = id self.node_id: int = node_id self.name: str = name @@ -823,6 +823,7 @@ def __init__( self.is_remote: bool = is_remote self.sequence_nr: int = sequence_nr self.device_type: int = device_type + self.device_index: int = device_index def append_kernel(self, name, device, start, end): assert self.device_type == 0 # CPU @@ -1091,6 +1092,7 @@ def parse_kineto_results(result): is_async=is_async, sequence_nr=kineto_event.sequence_nr(), device_type=kineto_event.device_type(), + device_index=kineto_event.device_index(), ) function_events.append(fe) From 651f5565d40c781b71f87d3dd0f85a2563adb693 Mon Sep 17 00:00:00 2001 From: ilia-cher Date: Thu, 12 Nov 2020 10:23:02 -0800 Subject: [PATCH 52/59] Update on "Use libkineto in profiler" Summary: Adding ability to use Kineto (CUPTI) to profile CUDA kernels Test Plan: USE_KINETO=1 USE_CUDA=1 USE_MKLDNN=1 BLAS=MKL BUILD_BINARY=1 python setup.py develop install python test/test_profiler.py python test/test_autograd.py -k test_profile python test/test_autograd.py -k test_record ``` ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ sgemm_32x32x32_NN 0.00% 0.000us 0.00% 0.000us 0.000us 12.000us 63.16% 12.000us 12.000us 1 void at::native::vectorized_elementwise_kernel<4, at... 
0.00% 0.000us 0.00% 0.000us 0.000us 2.750us 14.47% 2.750us 2.750us 1 Memcpy HtoD (Pagable -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 2.250us 11.84% 2.250us 2.250us 1 Memcpy DtoH (Device -> Pagable) 0.00% 0.000us 0.00% 0.000us 0.000us 2.000us 10.53% 2.000us 2.000us 1 aten::mm 25.87% 364.400ms 25.87% 364.426ms 364.426ms 0.000us 0.00% 0.000us 0.000us 1 aten::empty 0.00% 39.585us 0.00% 39.585us 19.792us 0.000us 0.00% 0.000us 0.000us 2 aten::stride 0.00% 3.363us 0.00% 3.363us 1.121us 0.000us 0.00% 0.000us 0.000us 3 aten::add 74.12% 1.044s 74.12% 1.044s 1.044s 0.000us 0.00% 0.000us 0.000us 1 aten::to 0.00% 13.155us 0.01% 116.398us 116.398us 0.000us 0.00% 0.000us 0.000us 1 aten::empty_strided 0.00% 30.365us 0.00% 30.365us 30.365us 0.000us 0.00% 0.000us 0.000us 1 aten::copy_ 0.01% 72.878us 0.01% 72.878us 72.878us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls Node ID ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ sgemm_32x32x32_NN 0.00% 0.000us 0.00% 0.000us 0.000us 11.000us 64.71% 11.000us 11.000us 1 0 void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 3.000us 17.65% 3.000us 3.000us 1 0 Memcpy DtoH (Device -> Pageable) 0.00% 0.000us 0.00% 0.000us 0.000us 2.000us 11.76% 2.000us 2.000us 1 0 Memcpy HtoD (Pageable -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 1.000us 5.88% 1.000us 1.000us 1 0 aten::mm 13.86% 421.014ms 27.73% 842.019ms 421.010ms 0.000us 0.00% 0.000us 0.000us 2 0 aten::empty 0.00% 25.000us 0.00% 25.000us 12.500us 0.000us 0.00% 0.000us 0.000us 2 0 aten::stride 0.00% 0.000us 0.00% 0.000us 0.000us 0.000us 0.00% 0.000us 0.000us 3 0 aten::add 36.55% 1.110s 73.11% 2.220s 1.110s 0.000us 0.00% 0.000us 0.000us 2 0 aten::to 0.00% 9.000us 0.00% 99.000us 99.000us 0.000us 0.00% 0.000us 0.000us 1 0 aten::empty_strided 0.00% 21.000us 0.00% 21.000us 21.000us 0.000us 0.00% 0.000us 0.000us 1 0 aten::copy_ 0.00% 69.000us 0.00% 133.000us 66.500us 0.000us 0.00% 0.000us 0.000us 2 0 cudaFree 13.00% 394.907ms 13.00% 394.907ms 394.907ms 0.000us 0.00% 0.000us 0.000us 1 0 cudaDeviceGetAttribute 0.00% 1.000us 0.00% 1.000us 0.091us 0.000us 0.00% 0.000us 0.000us 11 0 cudaMalloc 0.02% 632.000us 0.02% 632.000us 210.667us 0.000us 0.00% 0.000us 0.000us 3 0 cudaMemcpy 0.00% 20.000us 0.00% 20.000us 20.000us 0.000us 0.00% 0.000us 0.000us 1 0 cudaEventCreateWithFlags 0.00% 9.000us 0.00% 9.000us 0.562us 0.000us 0.00% 0.000us 0.000us 16 0 cudaLaunchKernel 36.55% 1.110s 36.55% 1.110s 555.021ms 0.000us 0.00% 0.000us 0.000us 2 0 cudaMemcpyAsync 0.00% 33.000us 0.00% 33.000us 33.000us 0.000us 0.00% 0.000us 0.000us 1 0 cudaStreamSynchronize 0.00% 4.000us 0.00% 4.000us 4.000us 0.000us 0.00% 0.000us 0.000us 1 0 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ``` [ghstack-poisoned] --- torch/autograd/profiler.py 
| 4 +++- torch/csrc/autograd/profiler_kineto.cpp | 4 ++++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/torch/autograd/profiler.py b/torch/autograd/profiler.py index a37a08d5377a..fa40f0d0a402 100644 --- a/torch/autograd/profiler.py +++ b/torch/autograd/profiler.py @@ -883,7 +883,9 @@ def cuda_time_total(self): if self.is_async: return 0 if self.device_type == 0: # CPU - return sum(kinfo.interval.elapsed_us() for kinfo in self.kernels) + # account for the kernels in the children ops + return (sum(kinfo.interval.elapsed_us() for kinfo in self.kernels) + + sum(ch.cuda_time_total for ch in self.cpu_children)) else: assert self.device_type == 1 # CUDA return self.time_range.elapsed_us() diff --git a/torch/csrc/autograd/profiler_kineto.cpp b/torch/csrc/autograd/profiler_kineto.cpp index 9c1340a9a7e4..0de6b048fa4f 100644 --- a/torch/csrc/autograd/profiler_kineto.cpp +++ b/torch/csrc/autograd/profiler_kineto.cpp @@ -279,7 +279,11 @@ KinetoEvent& KinetoEvent::activity(const libkineto::TraceActivity& activity) { start_us_ = activity.timestamp(); duration_us_ = activity.duration(); correlation_id_ = activity.correlationId(); + //std::cerr << "DEBUG: " name_ << ": setting corr. id to " << correlation_id_ << std::endl; activity_type_ = (uint8_t)activity.type(); + //if (activity.linkedActivity()) { + // std::cerr << "DEBUG: linkedActivity: " << activity.name() << " " << activity.linkedActivity()->deviceId() << " " << activity.linkedActivity()->resourceId() << std::endl; + //} return *this; } From 9997011df74bb27e6170a4a0be1a329cf773311e Mon Sep 17 00:00:00 2001 From: ilia-cher Date: Thu, 12 Nov 2020 16:43:26 -0800 Subject: [PATCH 53/59] Update on "Use libkineto in profiler" Summary: Adding ability to use Kineto (CUPTI) to profile CUDA kernels Test Plan: USE_KINETO=1 USE_CUDA=1 USE_MKLDNN=1 BLAS=MKL BUILD_BINARY=1 python setup.py develop install python test/test_profiler.py python test/test_autograd.py -k test_profile python test/test_autograd.py -k test_record ``` ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ sgemm_32x32x32_NN 0.00% 0.000us 0.00% 0.000us 0.000us 12.000us 63.16% 12.000us 12.000us 1 void at::native::vectorized_elementwise_kernel<4, at... 
0.00% 0.000us 0.00% 0.000us 0.000us 2.750us 14.47% 2.750us 2.750us 1 Memcpy HtoD (Pagable -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 2.250us 11.84% 2.250us 2.250us 1 Memcpy DtoH (Device -> Pagable) 0.00% 0.000us 0.00% 0.000us 0.000us 2.000us 10.53% 2.000us 2.000us 1 aten::mm 25.87% 364.400ms 25.87% 364.426ms 364.426ms 0.000us 0.00% 0.000us 0.000us 1 aten::empty 0.00% 39.585us 0.00% 39.585us 19.792us 0.000us 0.00% 0.000us 0.000us 2 aten::stride 0.00% 3.363us 0.00% 3.363us 1.121us 0.000us 0.00% 0.000us 0.000us 3 aten::add 74.12% 1.044s 74.12% 1.044s 1.044s 0.000us 0.00% 0.000us 0.000us 1 aten::to 0.00% 13.155us 0.01% 116.398us 116.398us 0.000us 0.00% 0.000us 0.000us 1 aten::empty_strided 0.00% 30.365us 0.00% 30.365us 30.365us 0.000us 0.00% 0.000us 0.000us 1 aten::copy_ 0.01% 72.878us 0.01% 72.878us 72.878us 0.000us 0.00% 0.000us 0.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls Node ID ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ sgemm_32x32x32_NN 0.00% 0.000us 0.00% 0.000us 0.000us 11.000us 64.71% 11.000us 11.000us 1 0 void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 3.000us 17.65% 3.000us 3.000us 1 0 Memcpy DtoH (Device -> Pageable) 0.00% 0.000us 0.00% 0.000us 0.000us 2.000us 11.76% 2.000us 2.000us 1 0 Memcpy HtoD (Pageable -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 1.000us 5.88% 1.000us 1.000us 1 0 aten::mm 13.86% 421.014ms 27.73% 842.019ms 421.010ms 0.000us 0.00% 0.000us 0.000us 2 0 aten::empty 0.00% 25.000us 0.00% 25.000us 12.500us 0.000us 0.00% 0.000us 0.000us 2 0 aten::stride 0.00% 0.000us 0.00% 0.000us 0.000us 0.000us 0.00% 0.000us 0.000us 3 0 aten::add 36.55% 1.110s 73.11% 2.220s 1.110s 0.000us 0.00% 0.000us 0.000us 2 0 aten::to 0.00% 9.000us 0.00% 99.000us 99.000us 0.000us 0.00% 0.000us 0.000us 1 0 aten::empty_strided 0.00% 21.000us 0.00% 21.000us 21.000us 0.000us 0.00% 0.000us 0.000us 1 0 aten::copy_ 0.00% 69.000us 0.00% 133.000us 66.500us 0.000us 0.00% 0.000us 0.000us 2 0 cudaFree 13.00% 394.907ms 13.00% 394.907ms 394.907ms 0.000us 0.00% 0.000us 0.000us 1 0 cudaDeviceGetAttribute 0.00% 1.000us 0.00% 1.000us 0.091us 0.000us 0.00% 0.000us 0.000us 11 0 cudaMalloc 0.02% 632.000us 0.02% 632.000us 210.667us 0.000us 0.00% 0.000us 0.000us 3 0 cudaMemcpy 0.00% 20.000us 0.00% 20.000us 20.000us 0.000us 0.00% 0.000us 0.000us 1 0 cudaEventCreateWithFlags 0.00% 9.000us 0.00% 9.000us 0.562us 0.000us 0.00% 0.000us 0.000us 16 0 cudaLaunchKernel 36.55% 1.110s 36.55% 1.110s 555.021ms 0.000us 0.00% 0.000us 0.000us 2 0 cudaMemcpyAsync 0.00% 33.000us 0.00% 33.000us 33.000us 0.000us 0.00% 0.000us 0.000us 1 0 cudaStreamSynchronize 0.00% 4.000us 0.00% 4.000us 4.000us 0.000us 0.00% 0.000us 0.000us 1 0 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ``` [ghstack-poisoned] --- test/test_profiler.py | 18 
++++++--- torch/autograd/profiler.py | 49 +++++++++++++------------ torch/csrc/autograd/init.cpp | 4 +- torch/csrc/autograd/profiler_kineto.cpp | 24 ++++++++++-- torch/csrc/autograd/profiler_kineto.h | 5 +++ torch/csrc/autograd/profiler_legacy.h | 4 -- 6 files changed, 67 insertions(+), 37 deletions(-) diff --git a/test/test_profiler.py b/test/test_profiler.py index 6d7618ca8a0b..797ad0995913 100644 --- a/test/test_profiler.py +++ b/test/test_profiler.py @@ -100,15 +100,22 @@ def forward(self, x): torch._C._set_graph_executor_optimize(prev_opt) + def payload(self): + x = torch.randn(10, 10).cuda() + y = torch.randn(10, 10).cuda() + z = torch.mm(x, y) + z = z + y + z = z.cpu() + @unittest.skipIf(not kineto_available(), "Kineto is required") @unittest.skipIf(not torch.cuda.is_available(), "CUDA is required") def test_kineto(self): - x = torch.randn(10, 10).cuda() - y = torch.randn(10, 10).cuda() + with profile(use_cuda=True, use_kineto=True): + self.payload() + + # rerun to avoid initial start overhead with profile(use_cuda=True, use_kineto=True) as p: - z = torch.mm(x, y) - z = z + y - z = z.cpu() + self.payload() print(p.key_averages().table( sort_by="self_cuda_time_total", row_limit=-1)) found_gemm = False @@ -120,6 +127,7 @@ def test_kineto(self): found_memcpy = True self.assertTrue(found_gemm) self.assertTrue(found_memcpy) + # p.export_chrome_trace("/tmp/test_trace.json") if __name__ == '__main__': run_tests() diff --git a/torch/autograd/profiler.py b/torch/autograd/profiler.py index fa40f0d0a402..92d0e2848a47 100644 --- a/torch/autograd/profiler.py +++ b/torch/autograd/profiler.py @@ -44,7 +44,6 @@ def __init__(self, *args, **kwargs): def build_tree(self): self._populate_cpu_children() self._remove_dup_nodes() - self._set_kernels() self._set_backward_stacktraces() self._tree_built = True @@ -68,25 +67,6 @@ def _remove_dup_nodes(self): self.clear() self.extend(new_evts) - def _set_kernels(self): - # associate CUDA kernels with CPU events - cuda_corr_map = {} - for evt in self: - if evt.device_type == 1: # CUDA - if evt.id not in cuda_corr_map: - cuda_corr_map[evt.id] = [] - cuda_corr_map[evt.id].append(evt) - - for evt in self: - if (evt.device_type == 0 and not evt.is_async and - evt.id in cuda_corr_map): - for k_evt in cuda_corr_map[evt.id]: - evt.append_kernel( - k_evt.name, - k_evt.device_index, - k_evt.time_range.start, - k_evt.time_range.end) - def _populate_cpu_children(self): """Populates child events into each underlying FunctionEvent object. One event is a child of another if [s1, e1) is inside [s2, e2). 
diff --git a/torch/csrc/autograd/init.cpp b/torch/csrc/autograd/init.cpp
index b187a4f8524c..d67d823f24c0 100644
--- a/torch/csrc/autograd/init.cpp
+++ b/torch/csrc/autograd/init.cpp
@@ -129,7 +129,9 @@ PyObject* THPAutograd_initExtension(PyObject* _unused, PyObject *unused) {
     // device type, currently: CPU or CUDA
     .def("device_type", [](const KinetoEvent& e) {
       return (uint8_t)e.deviceType();
-    });
+    })
+    // correlation id of a linked event
+    .def("linked_correlation_id", &KinetoEvent::linkedCorrelationId);
 
   py::class_<ProfilerResultWrapper>(m, "ProfilerResult")
     .def("events", [](const ProfilerResultWrapper& r) {
diff --git a/torch/csrc/autograd/profiler_kineto.cpp b/torch/csrc/autograd/profiler_kineto.cpp
index 0de6b048fa4f..62c95f8cf83a 100644
--- a/torch/csrc/autograd/profiler_kineto.cpp
+++ b/torch/csrc/autograd/profiler_kineto.cpp
@@ -20,6 +20,11 @@ uint64_t next_correlation_id() {
   return corr_id_++;
 }
 
+inline int64_t getTimeUs() {
+  using namespace std::chrono;
+  return duration_cast<microseconds>(high_resolution_clock::now().time_since_epoch()).count();
+}
+
 std::string shapesToStr(const std::vector<std::vector<int64_t>>& shapes);
 
 struct TORCH_API KinetoThreadLocalState : public
ProfilerThreadLocalState { // if (ctx->shapes && !ctx->shapes->empty()) { // op.inputDims = shapesToStr(*ctx->shapes); // } + + // Not setting atm + op.inputTypes = "[]"; + op.arguments = "[]"; + op.outputDims = "[]"; + op.outputTypes = "[]"; + op.inputNames = "[]"; + op.outputNames = "[]"; + + // op.threadId = pthread_self(); { std::lock_guard guard(state_mutex_); @@ -82,6 +97,8 @@ struct TORCH_API KinetoThreadLocalState : public ProfilerThreadLocalState { for (auto idx = 0; idx < cpu_trace->activities.size(); ++idx) { if (kineto_events_[idx].hasShapes()) { cpu_trace->activities[idx].inputDims = shapesToStr(kineto_events_[idx].shapes()); + } else { + cpu_trace->activities[idx].inputDims = "[]"; } } } @@ -237,9 +254,7 @@ void enableProfiler( pushProfilingCallbacks(); } - if (!libkineto::api().activityProfiler().isActive()) { - libkineto::api().activityProfiler().startTrace(); - } + libkineto::api().activityProfiler().startTrace(); state->mark("__start_profile", false); } @@ -284,6 +299,9 @@ KinetoEvent& KinetoEvent::activity(const libkineto::TraceActivity& activity) { //if (activity.linkedActivity()) { // std::cerr << "DEBUG: linkedActivity: " << activity.name() << " " << activity.linkedActivity()->deviceId() << " " << activity.linkedActivity()->resourceId() << std::endl; //} + if (activity.linkedActivity()) { + linked_correlation_id_ = activity.linkedActivity()->correlationId(); + } return *this; } diff --git a/torch/csrc/autograd/profiler_kineto.h b/torch/csrc/autograd/profiler_kineto.h index 2c1c1974e9a8..732dd6ed5f2d 100644 --- a/torch/csrc/autograd/profiler_kineto.h +++ b/torch/csrc/autograd/profiler_kineto.h @@ -140,6 +140,10 @@ struct TORCH_API KinetoEvent { return *this; } + uint64_t linkedCorrelationId() const { + return linked_correlation_id_; + } + int64_t deviceResourceId() const { return device_resource_id_; } @@ -161,6 +165,7 @@ struct TORCH_API KinetoEvent { uint64_t start_us_ = 0; uint64_t duration_us_ = 0; uint64_t correlation_id_ = 0; + uint64_t linked_correlation_id_ = 0; int64_t device_resource_id_ = 0; }; diff --git a/torch/csrc/autograd/profiler_legacy.h b/torch/csrc/autograd/profiler_legacy.h index bc1381e40469..86c9f81f7fee 100644 --- a/torch/csrc/autograd/profiler_legacy.h +++ b/torch/csrc/autograd/profiler_legacy.h @@ -90,10 +90,6 @@ inline int64_t getTime() { #endif } -inline int64_t getTimeUs() { - return getTime() / 1000; -} - enum class C10_API_ENUM EventKind : uint16_t { Mark, PushRange, From bde96f63b9cb6ca614be902cdfefe4f97aa08e56 Mon Sep 17 00:00:00 2001 From: ilia-cher Date: Thu, 12 Nov 2020 22:37:15 -0800 Subject: [PATCH 54/59] Update on "Use libkineto in profiler" Summary: Adding ability to use Kineto (CUPTI) to profile CUDA kernels Test Plan: USE_KINETO=1 USE_CUDA=1 USE_MKLDNN=1 BLAS=MKL BUILD_BINARY=1 python setup.py develop install python test/test_profiler.py python test/test_autograd.py -k test_profile python test/test_autograd.py -k test_record ``` ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Memcpy HtoD (Pageable -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 2.000us 33.33% 2.000us 1.000us 2 sgemm_32x32x32_NN 
0.00% 0.000us 0.00% 0.000us 0.000us 2.000us 33.33% 2.000us 2.000us 1 void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 1.000us 16.67% 1.000us 1.000us 1 Memcpy DtoH (Device -> Pageable) 0.00% 0.000us 0.00% 0.000us 0.000us 1.000us 16.67% 1.000us 1.000us 1 aten::randn 5.17% 74.000us 6.71% 96.000us 48.000us 0.000us 0.00% 0.000us 0.000us 2 aten::empty 1.33% 19.000us 1.33% 19.000us 4.750us 0.000us 0.00% 0.000us 0.000us 4 aten::normal_ 1.05% 15.000us 1.05% 15.000us 7.500us 0.000us 0.00% 0.000us 0.000us 2 aten::to 77.90% 1.114ms 91.61% 1.310ms 436.667us 0.000us 0.00% 3.000us 1.000us 3 aten::empty_strided 2.52% 36.000us 2.52% 36.000us 12.000us 0.000us 0.00% 0.000us 0.000us 3 aten::copy_ 2.73% 39.000us 11.19% 160.000us 53.333us 0.000us 0.00% 3.000us 1.000us 3 cudaMemcpyAsync 4.34% 62.000us 4.34% 62.000us 20.667us 0.000us 0.00% 0.000us 0.000us 3 cudaStreamSynchronize 1.61% 23.000us 1.61% 23.000us 7.667us 0.000us 0.00% 0.000us 0.000us 3 aten::mm 0.21% 3.000us 7.20% 103.000us 103.000us 0.000us 0.00% 2.000us 2.000us 1 aten::stride 0.21% 3.000us 0.21% 3.000us 1.000us 0.000us 0.00% 0.000us 0.000us 3 cudaLaunchKernel 2.45% 35.000us 2.45% 35.000us 17.500us 0.000us 0.00% 0.000us 0.000us 2 aten::add 0.49% 7.000us 4.27% 61.000us 61.000us 0.000us 0.00% 1.000us 1.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ``` [ghstack-poisoned] --- benchmarks/profiler_benchmark/profiler_bench.py | 3 ++- torch/csrc/autograd/profiler_kineto.cpp | 3 +++ torch/csrc/autograd/profiler_kineto.h | 1 + 3 files changed, 6 insertions(+), 1 deletion(-) diff --git a/benchmarks/profiler_benchmark/profiler_bench.py b/benchmarks/profiler_benchmark/profiler_bench.py index 9f7e7fc2ac06..8b6a596c0e3b 100644 --- a/benchmarks/profiler_benchmark/profiler_bench.py +++ b/benchmarks/profiler_benchmark/profiler_bench.py @@ -51,11 +51,12 @@ def parallel_task(x): INTERNAL_ITER = args.internal_iter for profiling_enabled in [False, True]: - print("Profiling {}, tensor size {}x{}, use cuda: {}, with stacks: {}, use script: {}".format( + print("Profiling {}, tensor size {}x{}, use cuda: {}, use kineto: {}, with stacks: {}, use script: {}".format( "enabled" if profiling_enabled else "disabled", args.profiling_tensor_size, args.profiling_tensor_size, args.with_cuda, + args.use_kineto, args.with_stack, args.use_script)) diff --git a/torch/csrc/autograd/profiler_kineto.cpp b/torch/csrc/autograd/profiler_kineto.cpp index 704bb0df790f..bf4a39b5b13d 100644 --- a/torch/csrc/autograd/profiler_kineto.cpp +++ b/torch/csrc/autograd/profiler_kineto.cpp @@ -335,7 +335,10 @@ ProfilerResult::ProfilerResult( ProfilerResult::~ProfilerResult() {} void ProfilerResult::save(const std::string& path) { + // Kineto's save is destructive + TORCH_CHECK(!saved_, "Trace is already saved"); trace_->save(path); + saved_ = true; } #endif diff --git a/torch/csrc/autograd/profiler_kineto.h b/torch/csrc/autograd/profiler_kineto.h index 667b5da31551..a1c2b2122e41 100644 --- a/torch/csrc/autograd/profiler_kineto.h +++ b/torch/csrc/autograd/profiler_kineto.h @@ -190,6 +190,7 @@ struct TORCH_API ProfilerResult { void save(const std::string& path); private: + bool saved_ = false; std::vector events_; thread_event_lists legacy_events_; std::unique_ptr trace_; From b1a0292af7df5a4fd6fa3a45bdeb7e09bf8d4d41 Mon Sep 17 00:00:00 2001 From: ilia-cher Date: Fri, 13 Nov 2020 11:48:33 -0800 Subject: [PATCH 55/59] 
Update on "Use libkineto in profiler" Summary: Adding ability to use Kineto (CUPTI) to profile CUDA kernels Test Plan: USE_KINETO=1 USE_CUDA=1 USE_MKLDNN=1 BLAS=MKL BUILD_BINARY=1 python setup.py develop install python test/test_profiler.py python test/test_autograd.py -k test_profile python test/test_autograd.py -k test_record ``` ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Memcpy HtoD (Pageable -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 2.000us 33.33% 2.000us 1.000us 2 sgemm_32x32x32_NN 0.00% 0.000us 0.00% 0.000us 0.000us 2.000us 33.33% 2.000us 2.000us 1 void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 1.000us 16.67% 1.000us 1.000us 1 Memcpy DtoH (Device -> Pageable) 0.00% 0.000us 0.00% 0.000us 0.000us 1.000us 16.67% 1.000us 1.000us 1 aten::randn 5.17% 74.000us 6.71% 96.000us 48.000us 0.000us 0.00% 0.000us 0.000us 2 aten::empty 1.33% 19.000us 1.33% 19.000us 4.750us 0.000us 0.00% 0.000us 0.000us 4 aten::normal_ 1.05% 15.000us 1.05% 15.000us 7.500us 0.000us 0.00% 0.000us 0.000us 2 aten::to 77.90% 1.114ms 91.61% 1.310ms 436.667us 0.000us 0.00% 3.000us 1.000us 3 aten::empty_strided 2.52% 36.000us 2.52% 36.000us 12.000us 0.000us 0.00% 0.000us 0.000us 3 aten::copy_ 2.73% 39.000us 11.19% 160.000us 53.333us 0.000us 0.00% 3.000us 1.000us 3 cudaMemcpyAsync 4.34% 62.000us 4.34% 62.000us 20.667us 0.000us 0.00% 0.000us 0.000us 3 cudaStreamSynchronize 1.61% 23.000us 1.61% 23.000us 7.667us 0.000us 0.00% 0.000us 0.000us 3 aten::mm 0.21% 3.000us 7.20% 103.000us 103.000us 0.000us 0.00% 2.000us 2.000us 1 aten::stride 0.21% 3.000us 0.21% 3.000us 1.000us 0.000us 0.00% 0.000us 0.000us 3 cudaLaunchKernel 2.45% 35.000us 2.45% 35.000us 17.500us 0.000us 0.00% 0.000us 0.000us 2 aten::add 0.49% 7.000us 4.27% 61.000us 61.000us 0.000us 0.00% 1.000us 1.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ``` [ghstack-poisoned] --- test/test_jit.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/test_jit.py b/test/test_jit.py index c7df2c33350f..5014af17e490 100644 --- a/test/test_jit.py +++ b/test/test_jit.py @@ -2552,10 +2552,10 @@ def fn(x): for e in prof.function_events: if e.name == "aten::mul": self.assertTrue(e.thread not in mul_events) - mul_events[e.thread] = e.cpu_interval.elapsed_us() + mul_events[e.thread] = e.time_range.elapsed_us() elif e.name == "other_fn": self.assertTrue(e.thread not in other_fn_events) - other_fn_events[e.thread] = e.cpu_interval.elapsed_us() + other_fn_events[e.thread] = e.time_range.elapsed_us() self.assertTrue(len(mul_events) == 2) self.assertTrue(len(other_fn_events) == 2) From 09a4762f0322423bbeb84dd367c7fdda3969112b Mon Sep 17 00:00:00 2001 From: ilia-cher Date: Tue, 17 Nov 2020 00:57:59 -0800 Subject: [PATCH 56/59] Update on "Use libkineto in profiler" Summary: Adding ability to use Kineto (CUPTI) to profile CUDA kernels Test Plan: USE_KINETO=1 USE_CUDA=1 USE_MKLDNN=1 BLAS=MKL BUILD_BINARY=1 python setup.py develop install python 
From 09a4762f0322423bbeb84dd367c7fdda3969112b Mon Sep 17 00:00:00 2001
From: ilia-cher
Date: Tue, 17 Nov 2020 00:57:59 -0800
Subject: [PATCH 56/59] Update on "Use libkineto in profiler"

Summary:
Adding ability to use Kineto (CUPTI) to profile CUDA kernels

Test Plan:
USE_KINETO=1 USE_CUDA=1 USE_MKLDNN=1 BLAS=MKL BUILD_BINARY=1 python setup.py develop install
python test/test_profiler.py
python test/test_autograd.py -k test_profile
python test/test_autograd.py -k test_record

[ghstack-poisoned]
---
 torch/_C/_autograd.pyi                     | 26 ++++++++++++++++---
 torch/autograd/profiler.py                 |  8 +++---
 torch/csrc/autograd/init.cpp               |  8 +++---
 .../rpc/server_process_global_profiler.py  |  2 +-
 4 files changed, 32 insertions(+), 12 deletions(-)

diff --git a/torch/_C/_autograd.pyi b/torch/_C/_autograd.pyi
index 926457fe80ee..2f4368d1a003 100644
--- a/torch/_C/_autograd.pyi
+++ b/torch/_C/_autograd.pyi
@@ -1,4 +1,4 @@
-from typing import List
+from typing import List, Set
 from enum import Enum

 # Defined in tools/autograd/init.cpp
@@ -8,7 +8,11 @@ class ProfilerState(Enum):
     CPU = ...
     CUDA = ...
     NVTX = ...
+    KINETO = ...

+class ProfilerActivity(Enum):
+    CPU = ...
+    CUDA = ...

 class ProfilerConfig:
     def __init__(
@@ -37,9 +41,25 @@ class ProfilerEvent:
     def thread_id(self) -> int: ...
     ...

+class KinetoEvent:
+    def name(self) -> str: ...
+    def device_index(self) -> int: ...
+    def start_us(self) -> int: ...
+    def duration_us(self) -> int: ...
+    ...
+
+class ProfilerResult:
+    def events(self) -> List[KinetoEvent]: ...
+    def legacy_events(self) -> List[List[ProfilerEvent]]: ...
+    def save(self, path: str) -> None: ...

-def _enable_profiler(config: ProfilerConfig) -> None: ...
-def _disable_profiler() -> List[List[ProfilerEvent]]: ...
+def _enable_profiler(config: ProfilerConfig, activities: Set[ProfilerActivity]) -> None: ...
+def _prepare_profiler(config: ProfilerConfig, activities: Set[ProfilerActivity]) -> None: ...
+def _disable_profiler() -> ProfilerResult: ...
 def _profiler_enabled() -> bool: ...
+def kineto_available() -> bool: ...
 def _enable_record_function(enable: bool) -> None: ...
 def _set_empty_test_observer(is_global: bool, sampling_prob: float) -> None: ...
+
+def _enable_profiler_legacy(config: ProfilerConfig) -> None: ...
+def _disable_profiler_legacy() -> List[List[ProfilerEvent]]: ...
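For orientation, this is roughly how the re-typed entry points in the stub above fit together when driven by hand; the `profile` context manager normally does all of this internally. A sketch only, assuming a CUDA build with Kineto compiled in; the signatures are taken from the stub and nothing beyond them is guaranteed:

```python
import torch
from torch._C._autograd import (ProfilerActivity, ProfilerConfig, ProfilerState,
                                _prepare_profiler, _enable_profiler,
                                _disable_profiler, kineto_available)

if kineto_available() and torch.cuda.is_available():
    # ProfilerConfig(state, report_input_shapes, profile_memory, with_stack)
    config = ProfilerConfig(ProfilerState.KINETO, False, False, False)
    activities = {ProfilerActivity.CPU, ProfilerActivity.CUDA}

    _prepare_profiler(config, activities)  # set up CUPTI before tracing starts
    _enable_profiler(config, activities)
    torch.mm(torch.randn(8, 8).cuda(), torch.randn(8, 8).cuda())
    result = _disable_profiler()           # returns a ProfilerResult
    print(len(result.events()), "Kineto events collected")
```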
diff --git a/torch/autograd/profiler.py b/torch/autograd/profiler.py
index 614ad123eee7..090cc209d77a 100644
--- a/torch/autograd/profiler.py
+++ b/torch/autograd/profiler.py
@@ -6,7 +6,7 @@
 from collections import defaultdict, namedtuple
 from operator import attrgetter

-from typing import List, Tuple, Optional
+from typing import Dict, List, Tuple, Optional

 try:
     # Available in Python >= 3.2
@@ -275,9 +275,9 @@ def key_averages(self, group_by_input_shapes=False, group_by_stack_n=0):
             An EventList containing FunctionEventAvg objects.
         """
         assert self._tree_built
-        stats = defaultdict(FunctionEventAvg)
+        stats: Dict[Tuple[str, ...], FunctionEventAvg] = defaultdict(FunctionEventAvg)

-        def get_key(event, group_by_input_shapes, group_by_stack_n):
+        def get_key(event, group_by_input_shapes, group_by_stack_n) -> Tuple[str, ...]:
             key = [str(event.key), str(event.node_id)]
             if group_by_input_shapes:
                 key.append(str(event.input_shapes))
@@ -1046,7 +1046,7 @@ def parse_kineto_results(result):
     # Create and return FunctionEvent list
     string_table = StringTable()
     function_events = []
-    cuda_corr_map = {}
+    cuda_corr_map: Dict[int, List[torch.autograd.KinetoEvent]] = {}
     for kineto_event in result.events():
         if filter_name(kineto_event.name()):
             continue

diff --git a/torch/csrc/autograd/init.cpp b/torch/csrc/autograd/init.cpp
index 244531c6ed8b..ea06f475a629 100644
--- a/torch/csrc/autograd/init.cpp
+++ b/torch/csrc/autograd/init.cpp
@@ -133,10 +133,10 @@ PyObject* THPAutograd_initExtension(PyObject* _unused, PyObject *unused) {
     // correlation id of a linked event
     .def("linked_correlation_id", &KinetoEvent::linkedCorrelationId);

-  py::class_<ProfilerResult>(m, "ProfilerResult")
-    .def("events", &ProfilerResult::events)
-    .def("legacy_events", &ProfilerResult::legacy_events)
-    .def("save", &ProfilerResult::save);
+  py::class_<ProfilerResult>(m, "ProfilerResult")
+      .def("events", &ProfilerResult::events)
+      .def("legacy_events", &ProfilerResult::legacy_events)
+      .def("save", &ProfilerResult::save);

   m.def("_enable_profiler", enableProfiler);
   m.def("_disable_profiler", disableProfiler);
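The typed `stats` and `get_key` above sit on the public `key_averages()` path: the grouping key is exactly that tuple of event name and node id, plus input shapes and a stack prefix when requested. A quick usage sketch with public API only:

```python
import torch
from torch.autograd import profiler

with profiler.profile(record_shapes=True) as prof:
    torch.mm(torch.randn(16, 16), torch.randn(16, 16))
    torch.mm(torch.randn(4, 4), torch.randn(4, 4))

# With group_by_input_shapes=True the two aten::mm calls stay separate rows,
# because the shape component of their keys differs.
print(prof.key_averages(group_by_input_shapes=True)
          .table(sort_by="cpu_time_total"))
```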
diff --git a/torch/distributed/rpc/server_process_global_profiler.py b/torch/distributed/rpc/server_process_global_profiler.py
index 0f4ba8d53817..bd06003aa506 100644
--- a/torch/distributed/rpc/server_process_global_profiler.py
+++ b/torch/distributed/rpc/server_process_global_profiler.py
@@ -145,7 +145,7 @@ def __exit__(self, exc_type, exc_val, exc_tb):
         process_global_function_events = []
         for thread_local_events in process_global_events:
             # Parse from ``Event``s to ``FunctionEvent``s.
-            thread_local_function_events = torch.autograd.profiler.parse_event_records(
+            thread_local_function_events = torch.autograd.profiler.parse_legacy_records(
                 thread_local_events
             )
             thread_local_function_events.sort(

From cafee0f71a0c51f0eb44e8ec85338098937e3047 Mon Sep 17 00:00:00 2001
From: ilia-cher
Date: Tue, 17 Nov 2020 01:44:54 -0800
Subject: [PATCH 57/59] Update on "Use libkineto in profiler"

Summary:
Adding ability to use Kineto (CUPTI) to profile CUDA kernels

Test Plan:
USE_KINETO=1 USE_CUDA=1 USE_MKLDNN=1 BLAS=MKL BUILD_BINARY=1 python setup.py develop install
python test/test_profiler.py
python test/test_autograd.py -k test_profile
python test/test_autograd.py -k test_record

[ghstack-poisoned]
---
 CMakeLists.txt                                  | 2 +-
 benchmarks/profiler_benchmark/profiler_bench.py | 1 -
 torch/autograd/profiler.py                      | 6 +++---
 torch/csrc/autograd/profiler_kineto.cpp         | 2 +-
 torch/csrc/autograd/profiler_legacy.cpp         | 4 +---
 torch/csrc/autograd/profiler_legacy.h           | 2 +-
 6 files changed, 7 insertions(+), 10 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 4547807f9e36..fe8d21fc0766 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -524,7 +524,7 @@ if(USE_KINETO AND (NOT USE_CUDA))
   set(USE_KINETO OFF)
 endif()

-if (USE_KINETO AND MSVC)
+if(USE_KINETO AND MSVC)
   message(STATUS "Not using libkineto in a Windows build.")
   set(USE_KINETO OFF)
 endif()
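The CMake gating above decides at build time whether libkineto is linked in at all (and keeps it off for Windows builds). The runtime counterpart is `kineto_available()`, so scripts meant to run on both kinds of builds can fall back to the legacy profiler; a sketch:

```python
import torch
from torch.autograd import profiler

use_kineto = torch.autograd.kineto_available()  # False when built without USE_KINETO
with profiler.profile(use_cuda=torch.cuda.is_available(),
                      use_kineto=use_kineto) as prof:
    torch.add(torch.ones(8), torch.ones(8))

print("kineto enabled:", use_kineto)
print(prof.key_averages().table(row_limit=5))
```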
diff --git a/benchmarks/profiler_benchmark/profiler_bench.py b/benchmarks/profiler_benchmark/profiler_bench.py
index 8b6a596c0e3b..0cc8c33a1334 100644
--- a/benchmarks/profiler_benchmark/profiler_bench.py
+++ b/benchmarks/profiler_benchmark/profiler_bench.py
@@ -1,5 +1,4 @@
 import argparse
-import statistics
 import sys
 import timeit
 import torch

diff --git a/torch/autograd/profiler.py b/torch/autograd/profiler.py
index 090cc209d77a..28b40dc32620 100644
--- a/torch/autograd/profiler.py
+++ b/torch/autograd/profiler.py
@@ -869,7 +869,7 @@ def cuda_time_total(self):
         if self.device_type == 0:  # CPU
             # account for the kernels in the children ops
             return (sum(kinfo.interval.elapsed_us() for kinfo in self.kernels) +
-                sum(ch.cuda_time_total for ch in self.cpu_children))
+                    sum(ch.cuda_time_total for ch in self.cpu_children))
         else:
             assert self.device_type == 1  # CUDA
             return self.time_range.elapsed_us()
@@ -1056,11 +1056,11 @@ def parse_kineto_results(result):
         cpu_memory_usage = 0
         cuda_memory_usage = 0
-        if kineto_event.device_type() == 0: # CPU
+        if kineto_event.device_type() == 0:  # CPU
             # find the corresponding memory allocation events
             for mem_record in mem_records:
                 if (mem_record.start_us() >= kineto_event.start_us() and
-                    mem_record.start_us() <= abs_end_us):
+                        mem_record.start_us() <= abs_end_us):
                     cpu_memory_usage += mem_record.cpu_memory_usage()
                     cuda_memory_usage += mem_record.cuda_memory_usage()
         is_async = kineto_event.start_thread_id() != kineto_event.end_thread_id()

diff --git a/torch/csrc/autograd/profiler_kineto.cpp b/torch/csrc/autograd/profiler_kineto.cpp
index bf4a39b5b13d..ff7be1b5cff1 100644
--- a/torch/csrc/autograd/profiler_kineto.cpp
+++ b/torch/csrc/autograd/profiler_kineto.cpp
@@ -7,7 +7,7 @@
 #ifdef USE_KINETO
 #include
-#include "libkineto.h"
+#include <libkineto.h>
 #endif

 namespace torch { namespace autograd { namespace profiler {

diff --git a/torch/csrc/autograd/profiler_legacy.cpp b/torch/csrc/autograd/profiler_legacy.cpp
index a8e37d45ee7e..3e3d458debc4 100644
--- a/torch/csrc/autograd/profiler_legacy.cpp
+++ b/torch/csrc/autograd/profiler_legacy.cpp
@@ -455,8 +455,6 @@ void registerCUDAMethods(CUDAStubs* stubs) {
   cuda_stubs = stubs;
 }

-ProfilerConfig::~ProfilerConfig() = default;
-
 at::IValue ProfilerConfig::toIValue() const {
   c10::impl::GenericList eventIValueList(at::AnyType::get());
   eventIValueList.reserve(NUM_PROFILER_CFG_IVALUE_IDX);
@@ -675,7 +673,7 @@ double LegacyEvent::cudaElapsedUs(const LegacyEvent& e) const {

 CUDAStubs::~CUDAStubs() = default;

-static jit::CodeTemplate event_template(R"(
+static const jit::CodeTemplate event_template(R"(
 {
   "name": "${name}",
   "ph": "X",

diff --git a/torch/csrc/autograd/profiler_legacy.h b/torch/csrc/autograd/profiler_legacy.h
index 86c9f81f7fee..9fb03c0b6ccd 100644
--- a/torch/csrc/autograd/profiler_legacy.h
+++ b/torch/csrc/autograd/profiler_legacy.h
@@ -368,7 +368,7 @@ struct TORCH_API ProfilerConfig {
         report_input_shapes(report_input_shapes),
         profile_memory(profile_memory),
         with_stack(with_stack) {}
-  ~ProfilerConfig();
+  ~ProfilerConfig() = default;
   ProfilerState state;
   bool report_input_shapes;
   bool profile_memory;

From 55028371704b7f9290d96dfafb90bdbbd234bb1b Mon Sep 17 00:00:00 2001
From: ilia-cher
Date: Tue, 17 Nov 2020 10:55:10 -0800
Subject: [PATCH 58/59] Update on "Use libkineto in profiler"

Summary:
Adding ability to use Kineto (CUPTI) to profile CUDA kernels

Test Plan:
USE_KINETO=1 USE_CUDA=1 USE_MKLDNN=1 BLAS=MKL BUILD_BINARY=1 python setup.py develop install
python test/test_profiler.py
python test/test_autograd.py -k test_profile
python test/test_autograd.py -k test_record

[ghstack-poisoned]
---
 torch/autograd/__init__.py              |  4 +--
 torch/autograd/profiler.py              | 44 +++++++++++++------------
 torch/csrc/autograd/init.cpp            | 19 +++++++++--
 torch/csrc/autograd/profiler_kineto.cpp |  4 ---
 4 files changed, 42 insertions(+), 29 deletions(-)

diff --git a/torch/autograd/__init__.py b/torch/autograd/__init__.py
index d97210806a90..a89dc018a885 100644
--- a/torch/autograd/__init__.py
+++ b/torch/autograd/__init__.py
@@ -18,7 +18,7 @@
 from .grad_mode import no_grad, enable_grad, set_grad_enabled
 from .anomaly_mode import detect_anomaly, set_detect_anomaly
 from ..overrides import has_torch_function, handle_torch_function
-from . import profiler
+# from . import profiler
 from . import functional

 __all__ = ['Variable', 'Function', 'backward', 'grad_mode']

@@ -251,7 +251,7 @@ def variable(*args, **kwargs):
     raise RuntimeError("autograd initialization failed")

 # Import all native method/classes
-from torch._C._autograd import (ProfilerActivity, ProfilerState, ProfilerConfig, ProfilerEvent,
+from torch._C._autograd import (DeviceType, ProfilerActivity, ProfilerState, ProfilerConfig, ProfilerEvent,
                                 _enable_profiler_legacy, _disable_profiler_legacy, _profiler_enabled,
                                 _enable_record_function, _set_empty_test_observer, kineto_available)

diff --git a/torch/autograd/profiler.py b/torch/autograd/profiler.py
index 28b40dc32620..64d2151ec83f 100644
--- a/torch/autograd/profiler.py
+++ b/torch/autograd/profiler.py
@@ -1,6 +1,7 @@
 import itertools
 from typing import Any
 import torch
+from torch.autograd import DeviceType
 from torch.futures import Future

 from collections import defaultdict, namedtuple
@@ -83,7 +84,7 @@ def _populate_cpu_children(self):
         # Some events can be async (i.e. start and end on different threads),
         # since it's generally undefined how to attribute children ranges to
         # async ranges, we do not use them when calculating nested ranges and stats
-        sync_events = [evt for evt in self if not evt.is_async and evt.device_type == 0]
+        sync_events = [evt for evt in self if not evt.is_async and evt.device_type == DeviceType.CPU]
         events = sorted(
             sync_events,
             key=attrgetter("thread"),
@@ -340,7 +341,8 @@ class profile(object):
         use_kineto (bool, default False): experimental support for Kineto profiler

-        use_cpu (default True) - whether to profile CPU events
+        use_cpu (default True) - whether to profile CPU events; setting to False requires
+            use_kineto=True and can be used to lower the overhead for GPU-only profiling

     .. warning:
         Enabling memory profiling or source attribution incurs additional profiler
@@ -787,7 +789,7 @@ class FunctionEvent(FormattedTimesMixin):
     def __init__(
             self, id, name, thread, start_us, end_us, fwd_thread=None, input_shapes=None,
             stack=None, scope=0, cpu_memory_usage=0, cuda_memory_usage=0, is_async=False,
-            is_remote=False, sequence_nr=-1, node_id=-1, device_type=0, device_index=0):
+            is_remote=False, sequence_nr=-1, node_id=-1, device_type=DeviceType.CPU, device_index=0):
         self.id: int = id
         self.node_id: int = node_id
         self.name: str = name
@@ -806,11 +808,11 @@ def __init__(
         self.is_async: bool = is_async
         self.is_remote: bool = is_remote
         self.sequence_nr: int = sequence_nr
-        self.device_type: int = device_type
+        self.device_type: DeviceType = device_type
         self.device_index: int = device_index

     def append_kernel(self, name, device, start, end):
-        assert self.device_type == 0  # CPU
+        assert self.device_type == DeviceType.CPU
         self.kernels.append(Kernel(name, device, Interval(start, end)))

     def append_cpu_child(self, child):
@@ -819,9 +821,9 @@ def append_cpu_child(self, child):
         One is supposed to append only direct children to the event to have
         correct self cpu time being reported.
         """
-        assert(self.device_type == 0)  # CPU
+        assert(self.device_type == DeviceType.CPU)
         assert(isinstance(child, FunctionEvent))
-        assert(child.device_type == 0)
+        assert(child.device_type == DeviceType.CPU)
         self.cpu_children.append(child)

     def set_cpu_parent(self, parent):
@@ -831,16 +833,16 @@ def set_cpu_parent(self, parent):
         the child's range interval is completely inside the parent's. We use
         this connection to determine the event is from top-level op or not.
""" - assert(self.device_type == 0) # CPU + assert(self.device_type == DeviceType.CPU) assert(isinstance(parent, FunctionEvent)) - assert(parent.device_type == 0) + assert(parent.device_type == DeviceType.CPU) self.cpu_parent = parent # Note: async events don't have children, are not used when computing 'self' # metrics of other events, have only total cpu time @property def self_cpu_memory_usage(self): - if self.is_async or self.device_type != 0: # CPU + if self.is_async or self.device_type != DeviceType.CPU: return 0 return self.cpu_memory_usage - sum( [child.cpu_memory_usage for child in self.cpu_children] @@ -848,7 +850,7 @@ def self_cpu_memory_usage(self): @property def self_cuda_memory_usage(self): - if self.is_async or self.device_type != 0: # CPU + if self.is_async or self.device_type != DeviceType.CPU: return 0 return self.cuda_memory_usage - sum( [child.cuda_memory_usage for child in self.cpu_children] @@ -856,7 +858,7 @@ def self_cuda_memory_usage(self): @property def self_cpu_time_total(self): - if self.is_async or self.device_type != 0: + if self.is_async or self.device_type != DeviceType.CPU: return 0 return self.cpu_time_total - sum( [child.cpu_time_total for child in self.cpu_children] @@ -866,28 +868,28 @@ def self_cpu_time_total(self): def cuda_time_total(self): if self.is_async: return 0 - if self.device_type == 0: # CPU + if self.device_type == DeviceType.CPU: # account for the kernels in the children ops return (sum(kinfo.interval.elapsed_us() for kinfo in self.kernels) + sum(ch.cuda_time_total for ch in self.cpu_children)) else: - assert self.device_type == 1 # CUDA + assert self.device_type == DeviceType.CUDA return self.time_range.elapsed_us() @property def self_cuda_time_total(self): if self.is_async: return 0 - if self.device_type == 0: # CPU + if self.device_type == DeviceType.CPU: return self.cuda_time_total - \ sum([child.cuda_time_total for child in self.cpu_children]) else: - assert(self.device_type == 1) # CUDA + assert(self.device_type == DeviceType.CUDA) return self.cuda_time_total @property def cpu_time_total(self): - if self.device_type == 0: # CPU + if self.device_type == DeviceType.CPU: return self.time_range.elapsed_us() else: return 0 @@ -1056,7 +1058,7 @@ def parse_kineto_results(result): cpu_memory_usage = 0 cuda_memory_usage = 0 - if kineto_event.device_type() == 0: # CPU + if kineto_event.device_type() == DeviceType.CPU: # find the corresponding memory allocation events for mem_record in mem_records: if (mem_record.start_us() >= kineto_event.start_us() and @@ -1082,7 +1084,7 @@ def parse_kineto_results(result): device_index=kineto_event.device_index(), ) function_events.append(fe) - if kineto_event.device_type() == 1: # CUDA + if kineto_event.device_type() == DeviceType.CUDA: corr_id = kineto_event.linked_correlation_id() if corr_id > 0 and corr_id not in cuda_corr_map: cuda_corr_map[corr_id] = [] @@ -1090,7 +1092,7 @@ def parse_kineto_results(result): # associate CUDA kernels with CPU events for fe in function_events: - if (fe.device_type == 0 and not fe.is_async and + if (fe.device_type == DeviceType.CPU and not fe.is_async and fe.id in cuda_corr_map): for k_evt in cuda_corr_map[fe.id]: fe.append_kernel( @@ -1206,7 +1208,7 @@ def adjusted_time(cuda_record, cuda_records_map): is_async=is_async, is_remote=is_remote_event, sequence_nr=start.sequence_nr(), - device_type=0, # CPU + device_type=DeviceType.CPU, ) # note: async events have only cpu total time if not is_async and start.has_cuda(): diff --git a/torch/csrc/autograd/init.cpp 
diff --git a/torch/csrc/autograd/init.cpp b/torch/csrc/autograd/init.cpp
index ea06f475a629..78336ded0d88 100644
--- a/torch/csrc/autograd/init.cpp
+++ b/torch/csrc/autograd/init.cpp
@@ -1,5 +1,6 @@
 #include
+#include <c10/core/DeviceType.h>
 #include
 #include
 #include
@@ -70,6 +71,20 @@ PyObject* THPAutograd_initExtension(PyObject* _unused, PyObject *unused) {
     .def("correlation_id", &LegacyEvent::correlationId)
     .def("start_us", &LegacyEvent::cpuUs);

+  py::enum_<c10::DeviceType>(m, "DeviceType")
+      .value("CPU", c10::DeviceType::CPU)
+      .value("CUDA", c10::DeviceType::CUDA)
+      .value("MKLDNN", c10::DeviceType::MKLDNN)
+      .value("OPENGL", c10::DeviceType::OPENGL)
+      .value("OPENCL", c10::DeviceType::OPENCL)
+      .value("IDEEP", c10::DeviceType::IDEEP)
+      .value("HIP", c10::DeviceType::HIP)
+      .value("FPGA", c10::DeviceType::FPGA)
+      .value("MSNPU", c10::DeviceType::MSNPU)
+      .value("XLA", c10::DeviceType::XLA)
+      .value("Vulkan", c10::DeviceType::Vulkan)
+      .value("Metal", c10::DeviceType::Metal);
+
 #ifdef USE_KINETO
   py::class_<KinetoEvent>(m, "KinetoEvent")
     // name of the event
@@ -126,9 +141,9 @@
     .def("device_index", &KinetoEvent::deviceIndex)
     // for CUDA - stream id, for CPU - start thread id
     .def("device_resource_id", &KinetoEvent::deviceResourceId)
-    // device type, currently: CPU or CUDA
+    // device type
     .def("device_type", [](const KinetoEvent& e) {
-      return (uint8_t)e.deviceType();
+      return e.deviceType();
     })
     // correlation id of a linked event
     .def("linked_correlation_id", &KinetoEvent::linkedCorrelationId);
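With the `py::enum_` registration above, Python-side code can compare against real `c10::DeviceType` values instead of magic integers, which is what the profiler.py changes in this patch rely on. A short sketch:

```python
import torch
from torch.autograd import DeviceType, profiler

with profiler.profile(use_kineto=torch.autograd.kineto_available()) as prof:
    torch.mm(torch.randn(8, 8), torch.randn(8, 8))

# device_type is now a DeviceType enum member, not a bare int
cpu_events = [e for e in prof.function_events if e.device_type == DeviceType.CPU]
print(len(cpu_events), "CPU-side events")
```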
id to " << correlation_id_ << std::endl; activity_type_ = (uint8_t)activity.type(); - //if (activity.linkedActivity()) { - // std::cerr << "DEBUG: linkedActivity: " << activity.name() << " " << activity.linkedActivity()->deviceId() << " " << activity.linkedActivity()->resourceId() << std::endl; - //} if (activity.linkedActivity()) { linked_correlation_id_ = activity.linkedActivity()->correlationId(); } From f70a95c581d1bbe0e7d4522f2d6329297bbce9d1 Mon Sep 17 00:00:00 2001 From: ilia-cher Date: Fri, 20 Nov 2020 06:07:43 -0800 Subject: [PATCH 59/59] Update on "Use libkineto in profiler" Summary: Adding ability to use Kineto (CUPTI) to profile CUDA kernels Test Plan: USE_KINETO=1 USE_CUDA=1 USE_MKLDNN=1 BLAS=MKL BUILD_BINARY=1 python setup.py develop install python test/test_profiler.py python test/test_autograd.py -k test_profile python test/test_autograd.py -k test_record ``` ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ Memcpy HtoD (Pageable -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 2.000us 33.33% 2.000us 1.000us 2 sgemm_32x32x32_NN 0.00% 0.000us 0.00% 0.000us 0.000us 2.000us 33.33% 2.000us 2.000us 1 void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 1.000us 16.67% 1.000us 1.000us 1 Memcpy DtoH (Device -> Pageable) 0.00% 0.000us 0.00% 0.000us 0.000us 1.000us 16.67% 1.000us 1.000us 1 aten::randn 5.17% 74.000us 6.71% 96.000us 48.000us 0.000us 0.00% 0.000us 0.000us 2 aten::empty 1.33% 19.000us 1.33% 19.000us 4.750us 0.000us 0.00% 0.000us 0.000us 4 aten::normal_ 1.05% 15.000us 1.05% 15.000us 7.500us 0.000us 0.00% 0.000us 0.000us 2 aten::to 77.90% 1.114ms 91.61% 1.310ms 436.667us 0.000us 0.00% 3.000us 1.000us 3 aten::empty_strided 2.52% 36.000us 2.52% 36.000us 12.000us 0.000us 0.00% 0.000us 0.000us 3 aten::copy_ 2.73% 39.000us 11.19% 160.000us 53.333us 0.000us 0.00% 3.000us 1.000us 3 cudaMemcpyAsync 4.34% 62.000us 4.34% 62.000us 20.667us 0.000us 0.00% 0.000us 0.000us 3 cudaStreamSynchronize 1.61% 23.000us 1.61% 23.000us 7.667us 0.000us 0.00% 0.000us 0.000us 3 aten::mm 0.21% 3.000us 7.20% 103.000us 103.000us 0.000us 0.00% 2.000us 2.000us 1 aten::stride 0.21% 3.000us 0.21% 3.000us 1.000us 0.000us 0.00% 0.000us 0.000us 3 cudaLaunchKernel 2.45% 35.000us 2.45% 35.000us 17.500us 0.000us 0.00% 0.000us 0.000us 2 aten::add 0.49% 7.000us 4.27% 61.000us 61.000us 0.000us 0.00% 1.000us 1.000us 1 ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ``` [ghstack-poisoned] --- benchmarks/profiler_benchmark/profiler_bench.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/benchmarks/profiler_benchmark/profiler_bench.py b/benchmarks/profiler_benchmark/profiler_bench.py index 0cc8c33a1334..75cd490fed2e 100644 --- a/benchmarks/profiler_benchmark/profiler_bench.py +++ b/benchmarks/profiler_benchmark/profiler_bench.py @@ -37,7 +37,8 @@ def parallel_task(x): parser.add_argument('--profiling_tensor_size', default=1, type=int) parser.add_argument('--workload', default='loop', 
---
 benchmarks/profiler_benchmark/profiler_bench.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/benchmarks/profiler_benchmark/profiler_bench.py b/benchmarks/profiler_benchmark/profiler_bench.py
index 0cc8c33a1334..75cd490fed2e 100644
--- a/benchmarks/profiler_benchmark/profiler_bench.py
+++ b/benchmarks/profiler_benchmark/profiler_bench.py
@@ -37,7 +37,8 @@
     parser.add_argument('--profiling_tensor_size', default=1, type=int)
     parser.add_argument('--workload', default='loop', type=str)
     parser.add_argument('--internal_iter', default=256, type=int)
-    parser.add_argument('--timer_min_run_time', default=100, type=int)
+    parser.add_argument('--timer_min_run_time', default=10, type=int)
+    parser.add_argument('--cuda_only', action='store_true')

     args = parser.parse_args()
@@ -83,7 +84,8 @@ def payload():
         with torch.autograd.profiler.profile(
             use_cuda=args.with_cuda,
             with_stack=args.with_stack,
-            use_kineto=args.use_kineto) as prof:
+            use_kineto=args.use_kineto,
+            use_cpu=not args.cuda_only) as prof:
             x = workload(input_x)
             return x
     else: