diff --git a/libkineto/CMakeLists.txt b/libkineto/CMakeLists.txt index 003de73b0..3f61dd92f 100644 --- a/libkineto/CMakeLists.txt +++ b/libkineto/CMakeLists.txt @@ -42,19 +42,19 @@ endif() # Set LIBKINETO_NOCUPTI to explicitly disable CUPTI # Otherwise, CUPTI is disabled if not found -IF (NOT CUDA_SOURCE_DIR AND NOT CUPTI_INCLUDE_DIR) +IF (NOT CUDA_SOURCE_DIR OR NOT CUPTI_INCLUDE_DIR OR NOT CUDA_cupti_LIBRARY) set(LIBKINETO_NOCUPTI ON CACHE BOOL "" FORCE) endif() # Define file lists if (LIBKINETO_NOCUPTI) - get_filelist("get_libkineto_cpu_only_srcs()" LIBKINETO_SRCS) + get_filelist("get_libkineto_cpu_only_srcs(with_api=False)" LIBKINETO_SRCS) message(INFO " CUPTI unavailable or disabled - not building GPU profilers") else() - get_filelist("get_libkineto_srcs()" LIBKINETO_SRCS) + get_filelist("get_libkineto_srcs(with_api=False)" LIBKINETO_SRCS) endif() get_filelist("get_libkineto_public_headers()" LIBKINETO_PUBLIC_HEADERS) -set(LIBKINETO_API_SRCS "${LIBKINETO_SOURCE_DIR}/libkineto_api.cpp") +get_filelist("get_libkineto_api_srcs()" LIBKINETO_API_SRCS) add_library(kineto_base OBJECT ${LIBKINETO_SRCS}) add_library(kineto_api OBJECT ${LIBKINETO_API_SRCS}) @@ -69,13 +69,22 @@ set_target_properties(kineto_base kineto_api PROPERTIES CXX_EXTENSIONS NO CXX_VISIBILITY_PRESET hidden) -target_compile_options(kineto_base PRIVATE "-DKINETO_NAMESPACE=libkineto" -"-std=gnu++14") -target_compile_options(kineto_api PRIVATE "-std=gnu++14") +set(KINETO_COMPILE_OPTIONS "-DKINETO_NAMESPACE=libkineto") +list(APPEND KINETO_COMPILE_OPTIONS "-DFMT_HEADER_ONLY") +if(NOT MSVC) + list(APPEND KINETO_COMPILE_OPTIONS "-std=c++14") +else() + list(APPEND KINETO_COMPILE_OPTIONS "/std:c++14") + list(APPEND KINETO_COMPILE_OPTIONS "-DWIN32_LEAN_AND_MEAN") + list(APPEND KINETO_COMPILE_OPTIONS "-DNOGDI") +endif() if (NOT LIBKINETO_NOCUPTI) - target_compile_options(kineto_base PRIVATE "-DHAS_CUPTI") + list(APPEND KINETO_COMPILE_OPTIONS "-DHAS_CUPTI") endif() +target_compile_options(kineto_base PRIVATE "${KINETO_COMPILE_OPTIONS}") +target_compile_options(kineto_api PRIVATE "${KINETO_COMPILE_OPTIONS}") + if(NOT TARGET fmt) if(NOT FMT_SOURCE_DIR) set(FMT_SOURCE_DIR "${LIBKINETO_THIRDPARTY_DIR}/fmt" @@ -95,6 +104,8 @@ if(NOT TARGET fmt) endif() set(FMT_INCLUDE_DIR "${FMT_SOURCE_DIR}/include") +message(STATUS "Kineto: FMT_SOURCE_DIR = ${FMT_SOURCE_DIR}") +message(STATUS "Kineto: FMT_INCLUDE_DIR = ${FMT_INCLUDE_DIR}") if (NOT CUPTI_INCLUDE_DIR) set(CUPTI_INCLUDE_DIR "${CUDA_SOURCE_DIR}/extras/CUPTI/include") endif() @@ -112,6 +123,7 @@ target_include_directories(kineto_base PUBLIC $) target_include_directories(kineto_api PUBLIC + $ $) if(KINETO_LIBRARY_TYPE STREQUAL "default") @@ -132,10 +144,11 @@ else() message(FATAL_ERROR "Unsupported library type ${KINETO_LIBRARY_TYPE}") endif() -target_link_libraries(kineto "${CUDA_cupti_LIBRARY}") - -target_link_libraries(kineto $) -add_dependencies(kineto fmt) +if(NOT LIBKINETO_NOCUPTI) + target_link_libraries(kineto "${CUDA_cupti_LIBRARY}") +endif() +target_link_libraries(kineto $) +add_dependencies(kineto fmt::fmt-header-only) install(TARGETS kineto EXPORT kinetoLibraryConfig ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} diff --git a/libkineto/include/ActivityProfilerInterface.h b/libkineto/include/ActivityProfilerInterface.h index 92cfc14d7..bc6a050e8 100644 --- a/libkineto/include/ActivityProfilerInterface.h +++ b/libkineto/include/ActivityProfilerInterface.h @@ -9,6 +9,7 @@ #include #include +#include #include #include "ActivityType.h" @@ -17,7 +18,7 @@ namespace libkineto { class ActivityProfilerController; -class CpuTraceBuffer; +struct CpuTraceBuffer; class Config; class ActivityProfilerInterface { @@ -76,6 +77,14 @@ class ActivityProfilerInterface { virtual bool enableForRegion(const std::string& match) { return true; } + + // Saves information for the current thread to be used in profiler output + // Client must record any new kernel thread where the activity has occured. + virtual void recordThreadInfo() {} + + // Record trace metadata, currently supporting only string key and values, + // values with the same key are overwritten + virtual void addMetadata(const std::string& key, const std::string& value) = 0; }; } // namespace libkineto diff --git a/libkineto/include/ActivityTraceInterface.h b/libkineto/include/ActivityTraceInterface.h index ebddcc7c1..28def766d 100644 --- a/libkineto/include/ActivityTraceInterface.h +++ b/libkineto/include/ActivityTraceInterface.h @@ -12,7 +12,7 @@ namespace libkineto { -class TraceActivity; +struct TraceActivity; class ActivityTraceInterface { public: diff --git a/libkineto/include/ActivityType.h b/libkineto/include/ActivityType.h index 6a377d29e..2dcf1d7f4 100644 --- a/libkineto/include/ActivityType.h +++ b/libkineto/include/ActivityType.h @@ -7,15 +7,30 @@ #pragma once +#include +#include + namespace libkineto { enum class ActivityType { - CPU_OP, + CPU_OP = 0, // cpu side ops + USER_ANNOTATION, + GPU_USER_ANNOTATION, GPU_MEMCPY, GPU_MEMSET, - CONCURRENT_KERNEL, + CONCURRENT_KERNEL, // on-device kernels EXTERNAL_CORRELATION, - CUDA_RUNTIME + CUDA_RUNTIME, // host side cuda runtime events + GLOW_RUNTIME, // host side glow runtime events + CPU_INSTANT_EVENT, // host side point-like events + ENUM_COUNT }; +const char* toString(ActivityType t); +ActivityType toActivityType(const std::string& str); + +// Return an array of all activity types except COUNT +constexpr int activityTypeCount = (int)ActivityType::ENUM_COUNT; +const std::array activityTypes(); + } // namespace libkineto diff --git a/libkineto/include/ClientTraceActivity.h b/libkineto/include/ClientTraceActivity.h deleted file mode 100644 index aefd2f6dd..000000000 --- a/libkineto/include/ClientTraceActivity.h +++ /dev/null @@ -1,77 +0,0 @@ -/* - * Copyright (c) Facebook, Inc. and its affiliates. - * All rights reserved. - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#pragma once - -#include -#include - -#include "TraceActivity.h" - -namespace libkineto { - -struct ClientTraceActivity : TraceActivity { - ClientTraceActivity() = default; - ClientTraceActivity(ClientTraceActivity&&) = default; - ClientTraceActivity& operator=(ClientTraceActivity&&) = default; - ~ClientTraceActivity() override {} - - int64_t deviceId() const override { - return cachedPid(); - } - - int64_t resourceId() const override { - return sysThreadId; - } - - int64_t timestamp() const override { - return startTime; - } - - int64_t duration() const override { - return endTime - startTime; - } - - int64_t correlationId() const override { - return correlation; - } - - ActivityType type() const override { - return ActivityType::CPU_OP; - } - - const std::string name() const override { - return opType; - } - - const TraceActivity* linkedActivity() const override { - return nullptr; - } - - void log(ActivityLogger& logger) const override { - // Unimplemented by default - } - - int64_t startTime{0}; - int64_t endTime{0}; - int64_t correlation{0}; - int device{-1}; - // TODO: Add OS abstraction - pthread_t pthreadId{}; - int32_t sysThreadId{0}; - std::string opType; - std::string inputDims; - std::string inputTypes; - std::string arguments; - std::string outputDims; - std::string outputTypes; - std::string inputNames; - std::string outputNames; - std::string callStack; -}; - -} // namespace libkineto diff --git a/libkineto/include/GenericTraceActivity.h b/libkineto/include/GenericTraceActivity.h new file mode 100644 index 000000000..9eb7a6cb9 --- /dev/null +++ b/libkineto/include/GenericTraceActivity.h @@ -0,0 +1,95 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * All rights reserved. + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include +#include +#include +#include + +#include "ThreadUtil.h" +#include "TraceActivity.h" +#include "TraceSpan.h" + +namespace libkineto { + +// @lint-ignore-every CLANGTIDY cppcoreguidelines-non-private-member-variables-in-classes +// @lint-ignore-every CLANGTIDY cppcoreguidelines-pro-type-member-init +class GenericTraceActivity : public TraceActivity { + + public: + GenericTraceActivity() = delete; + + GenericTraceActivity( + const TraceSpan& trace, ActivityType type, const std::string& name) + : activityType(type), activityName(name), traceSpan_(&trace) { + } + + int64_t deviceId() const override { + return device; + } + + int64_t resourceId() const override { + return resource; + } + + int64_t timestamp() const override { + return startTime; + } + + int64_t duration() const override { + return endTime - startTime; + } + + int64_t correlationId() const override { + return id; + } + + ActivityType type() const override { + return activityType; + } + + const std::string name() const override { + return activityName; + } + + const TraceActivity* linkedActivity() const override { + return nullptr; + } + + const TraceSpan* traceSpan() const override { + return traceSpan_; + } + + void log(ActivityLogger& logger) const override; + + //Encode client side metadata as a key/value string. + void addMetadata(const std::string& key, const std::string& value) { + metadata_.push_back(fmt::format("\"{}\": {}", key, value)); + } + + const std::string getMetadata() const { + return fmt::format("{}", fmt::join(metadata_, ", ")); + } + + virtual ~GenericTraceActivity() {}; + + int64_t startTime{0}; + int64_t endTime{0}; + int32_t id{0}; + int32_t device{0}; + int32_t resource{0}; + ActivityType activityType; + std::string activityName; + + private: + const TraceSpan* traceSpan_; + std::vector metadata_; +}; + +} // namespace libkineto diff --git a/libkineto/include/IActivityProfiler.h b/libkineto/include/IActivityProfiler.h new file mode 100644 index 000000000..9af4535a9 --- /dev/null +++ b/libkineto/include/IActivityProfiler.h @@ -0,0 +1,102 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * All rights reserved. + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include +#include +#include + +#include "GenericTraceActivity.h" +#include "ActivityTraceInterface.h" + +/* This file includes an abstract base class for an activity profiler + * that can be implemented by multiple tracing agents in the application. + * The high level Kineto profiler can co-ordinate start and end of tracing + * and combine together events from multiple such activity profilers. + */ + +namespace libkineto { + +enum class TraceStatus { + READY, // Accepting trace requests + WARMUP, // Performing trace warmup + RECORDING, // Actively collecting activities + PROCESSING, // Recording is complete, preparing results + ERROR, // One or more errors (and possibly also warnings) occurred. + WARNING, // One or more warnings occurred. +}; + +/* IActivityProfilerSession: + * an opaque object that can be used by a high level profiler to + * start/stop and return trace events. + */ +class IActivityProfilerSession { + + public: + virtual ~IActivityProfilerSession() {} + + // start the trace collection synchronously + virtual void start() = 0; + + // stop the trace collection synchronously + virtual void stop() = 0; + + TraceStatus status() { + return status_; + } + + // returns list of Trace Activities + virtual std::vector& activities() = 0; + + // returns errors with this trace + virtual std::vector errors() = 0; + + // processes trace activities using logger + virtual void processTrace(ActivityLogger& logger) = 0; + + // XXX define trace formats + // virtual save(string name, TraceFormat format) + + protected: + TraceStatus status_ = TraceStatus::READY; +}; + + +/* Activity Profiler Plugins: + * These allow other frameworks to integrate into Kineto's primariy + * activity profiler. While the primary activity profiler handles + * timing the trace collections and correlating events the plugins + * can become source of new trace activity types. + */ +class IActivityProfiler { + + public: + + virtual ~IActivityProfiler() {} + + // name of profiler + virtual const std::string& name() const = 0; + + // returns activity types this profiler supports + virtual const std::set& availableActivities() const = 0; + + // Calls prepare() on registered tracer providers passing in the relevant + // activity types. Returns a profiler session handle (including uuid?). + virtual std::unique_ptr configure( + const std::set& activity_types, + const std::string& config="") = 0; + + // asynchronous version of the above with future timestamp and duration. + virtual std::unique_ptr configure( + int64_t ts_ms, + int64_t duration_ms, + const std::set& activity_types, + const std::string& config = "") = 0; +}; + +} // namespace libkineto diff --git a/libkineto/include/ThreadUtil.h b/libkineto/include/ThreadUtil.h new file mode 100644 index 000000000..343680e33 --- /dev/null +++ b/libkineto/include/ThreadUtil.h @@ -0,0 +1,35 @@ +#pragma once + +#include +#include +#include +#include + +namespace libkineto { + +int32_t systemThreadId(); +int32_t threadId(); +bool setThreadName(const std::string& name); +std::string getThreadName(); + +int32_t processId(); +std::string processName(int32_t pid); + +struct ProcessInfo { + int32_t pid; + const std::string name; + const std::string label; +}; + +struct ThreadInfo { + ThreadInfo(int32_t tid, const std::string& name) : + tid(tid), name(name) {} + int32_t tid; + const std::string name; +}; + +// Return a list of pids and process names for the current process +// and its parents. +std::vector> pidCommandPairsOfAncestors(); + +} // namespace libkineto diff --git a/libkineto/include/TraceActivity.h b/libkineto/include/TraceActivity.h index d93787071..fbf0dc673 100644 --- a/libkineto/include/TraceActivity.h +++ b/libkineto/include/TraceActivity.h @@ -8,14 +8,13 @@ #pragma once #include -#include -#include #include "ActivityType.h" namespace libkineto { class ActivityLogger; +struct TraceSpan; // Generic activity interface is borrowed from tensorboard protobuf format. struct TraceActivity { @@ -35,23 +34,16 @@ struct TraceActivity { virtual const std::string name() const = 0; // Optional linked activity virtual const TraceActivity* linkedActivity() const = 0; + // Optional containing trace object + virtual const TraceSpan* traceSpan() const = 0; // Log activity virtual void log(ActivityLogger& logger) const = 0; -}; - -namespace { - // Caching pid is not safe across forks and clones but we currently - // don't support an active profiler in a forked process. - static inline pid_t cachedPid() { - static pid_t pid = getpid(); - return pid; - } - static inline int64_t nsToUs(int64_t ns) { + static int64_t nsToUs(int64_t ns) { // It's important that this conversion is the same everywhere. // No rounding! return ns / 1000; } -} +}; } // namespace libkineto diff --git a/libkineto/include/TraceSpan.h b/libkineto/include/TraceSpan.h index eac3e793a..f33e18fb9 100644 --- a/libkineto/include/TraceSpan.h +++ b/libkineto/include/TraceSpan.h @@ -7,12 +7,26 @@ #pragma once +#include #include #include namespace libkineto { struct TraceSpan { + TraceSpan() = delete; + TraceSpan( + int64_t startTime, int64_t endTime, std::string name) + : startTime(startTime), endTime(endTime), name(std::move(name)) { + } + TraceSpan( + int opCount, int it, std::string name, std::string prefix) + : opCount(opCount), + iteration(it), + name(std::move(name)), + prefix(std::move(prefix)) { + } + // FIXME: change to duration? int64_t startTime{0}; int64_t endTime{0}; @@ -20,8 +34,10 @@ struct TraceSpan { int iteration{-1}; // Name is used to identify timeline std::string name; - // Prefix used to distinguish sub-nets on the same timeline + // Prefix used to distinguish trace spans on the same timeline std::string prefix; + // Tracked by profiler for iteration trigger + bool tracked{false}; }; } // namespace libkineto diff --git a/libkineto/include/libkineto.h b/libkineto/include/libkineto.h index c7d723c72..f0ab1d619 100644 --- a/libkineto/include/libkineto.h +++ b/libkineto/include/libkineto.h @@ -9,7 +9,6 @@ #pragma once -#include #include #include #include @@ -24,9 +23,11 @@ #include "ActivityTraceInterface.h" #include "ActivityType.h" #include "ClientInterface.h" -#include "ClientTraceActivity.h" +#include "GenericTraceActivity.h" #include "TraceSpan.h" +#include "ThreadUtil.h" + extern "C" { void suppressLibkinetoLogMessages(); bool libkineto_init(bool cpuOnly, bool logOnError); @@ -37,9 +38,9 @@ namespace libkineto { class Config; struct CpuTraceBuffer { - TraceSpan span; + TraceSpan span{0, 0, "none"}; int gpuOpCount; - std::vector activities; + std::vector activities; }; class LibkinetoApi { @@ -98,7 +99,7 @@ class LibkinetoApi { std::unique_ptr activityProfiler_{}; ClientInterface* client_{}; - pthread_t clientRegisterThread_{0}; + int32_t clientRegisterThread_{0}; bool isLoaded_{false}; std::atomic_int netSizeThreshold_{}; @@ -108,4 +109,3 @@ class LibkinetoApi { LibkinetoApi& api(); } // namespace libkineto - diff --git a/libkineto/include/time_since_epoch.h b/libkineto/include/time_since_epoch.h index 5a813ada4..e90388d95 100644 --- a/libkineto/include/time_since_epoch.h +++ b/libkineto/include/time_since_epoch.h @@ -12,7 +12,7 @@ namespace libkineto { inline int64_t timeSinceEpoch( - const std::chrono::time_point& t) { + const std::chrono::time_point& t) { return std::chrono::duration_cast( t.time_since_epoch()) .count(); diff --git a/libkineto/libkineto_defs.bzl b/libkineto/libkineto_defs.bzl index 333f9f8c9..fcaa2d5fe 100644 --- a/libkineto/libkineto_defs.bzl +++ b/libkineto/libkineto_defs.bzl @@ -3,7 +3,13 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. -def get_libkineto_srcs(): +def get_libkineto_api_srcs(): + return [ + "src/ThreadUtil.cpp", + "src/libkineto_api.cpp", + ] + +def get_libkineto_srcs(with_api = True): return [ "src/AbstractConfig.cpp", "src/ActivityProfiler.cpp", @@ -11,6 +17,7 @@ def get_libkineto_srcs(): "src/ActivityProfilerProxy.cpp", "src/Config.cpp", "src/ConfigLoader.cpp", + "src/CudaDeviceProperties.cpp", "src/CuptiActivityInterface.cpp", "src/CuptiEventInterface.cpp", "src/CuptiMetricInterface.cpp", @@ -19,17 +26,14 @@ def get_libkineto_srcs(): "src/EventProfilerController.cpp", "src/GenericTraceActivity.cpp", "src/Logger.cpp", - "src/ProcessInfo.cpp", - "src/ThreadName.cpp", "src/WeakSymbols.cpp", "src/cupti_strings.cpp", "src/init.cpp", - "src/libkineto_api.cpp", "src/output_csv.cpp", "src/output_json.cpp", - ] + ] + (get_libkineto_api_srcs() if with_api else []) -def get_libkineto_cpu_only_srcs(): +def get_libkineto_cpu_only_srcs(with_api = True): return [ "src/AbstractConfig.cpp", "src/ActivityProfiler.cpp", @@ -41,21 +45,22 @@ def get_libkineto_cpu_only_srcs(): "src/Demangle.cpp", "src/GenericTraceActivity.cpp", "src/Logger.cpp", - "src/ProcessInfo.cpp", - "src/ThreadName.cpp", "src/init.cpp", - "src/libkineto_api.cpp", "src/output_csv.cpp", "src/output_json.cpp", - ] + ] + (get_libkineto_api_srcs() if with_api else []) def get_libkineto_public_headers(): return [ "include/ActivityProfilerInterface.h", "include/ActivityType.h", "include/ClientInterface.h", + "include/GenericTraceActivity.h", "include/TraceActivity.h", + "include/GenericTraceActivity.h", + "include/IActivityProfiler.h", "include/TraceSpan.h", + "include/ThreadUtil.h", "include/libkineto.h", "include/time_since_epoch.h", ] diff --git a/libkineto/src/AbstractConfig.cpp b/libkineto/src/AbstractConfig.cpp index ebb18ba9c..5d36a909f 100644 --- a/libkineto/src/AbstractConfig.cpp +++ b/libkineto/src/AbstractConfig.cpp @@ -7,6 +7,7 @@ #include "AbstractConfig.h" +#include #include #include diff --git a/libkineto/src/AbstractConfig.h b/libkineto/src/AbstractConfig.h index a4477d9ed..7b631ec39 100644 --- a/libkineto/src/AbstractConfig.h +++ b/libkineto/src/AbstractConfig.h @@ -52,6 +52,7 @@ class AbstractConfig { return timestamp_; } + // Source config string that this was parsed from const std::string& source() const { return source_; } diff --git a/libkineto/src/ActivityBuffers.h b/libkineto/src/ActivityBuffers.h index f1004bb15..e482be217 100644 --- a/libkineto/src/ActivityBuffers.h +++ b/libkineto/src/ActivityBuffers.h @@ -10,7 +10,6 @@ #include #include -#include #include "libkineto.h" #include "CuptiActivityBuffer.h" @@ -19,7 +18,7 @@ namespace KINETO_NAMESPACE { struct ActivityBuffers { std::list> cpu; - std::unique_ptr> gpu; + std::unique_ptr gpu; }; } // namespace KINETO_NAMESPACE diff --git a/libkineto/src/ActivityLoggerFactory.h b/libkineto/src/ActivityLoggerFactory.h new file mode 100644 index 000000000..c22d07c2c --- /dev/null +++ b/libkineto/src/ActivityLoggerFactory.h @@ -0,0 +1,64 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * All rights reserved. + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include +#include +#include +#include +#include + +namespace KINETO_NAMESPACE { + +class ActivityLogger; + +class ActivityLoggerFactory { + + public: + using FactoryFunc = + std::function(const std::string& url)>; + + // Add logger factory for a protocol prefix + void addProtocol(const std::string& protocol, FactoryFunc f) { + factories_[tolower(protocol)] = f; + } + + // Create a logger, invoking the factory for the protocol specified in url + std::unique_ptr makeLogger(const std::string& url) const { + std::string protocol = extractProtocol(url); + auto it = factories_.find(tolower(protocol)); + if (it != factories_.end()) { + return it->second(stripProtocol(url)); + } + throw std::invalid_argument(fmt::format( + "No logger registered for the {} protocol prefix", + protocol)); + return nullptr; + } + + private: + static std::string tolower(std::string s) { + std::transform(s.begin(), s.end(), s.begin(), + [](unsigned char c) { return std::tolower(c); } + ); + return s; + } + + static std::string extractProtocol(std::string url) { + return url.substr(0, url.find("://")); + } + + static std::string stripProtocol(std::string url) { + size_t pos = url.find("://"); + return pos == url.npos ? url : url.substr(pos + 3); + } + + std::map factories_; +}; + +} // namespace KINETO_NAMESPACE diff --git a/libkineto/src/ActivityProfiler.cpp b/libkineto/src/ActivityProfiler.cpp index d4e343946..cb4237370 100644 --- a/libkineto/src/ActivityProfiler.cpp +++ b/libkineto/src/ActivityProfiler.cpp @@ -8,10 +8,7 @@ #include "ActivityProfiler.h" #include -#include -#include #include -#include #include #include #include @@ -32,6 +29,7 @@ #include "output_base.h" #include "Logger.h" +#include "ThreadUtil.h" using namespace std::chrono; using namespace libkineto; @@ -40,7 +38,7 @@ using std::string; namespace KINETO_NAMESPACE { bool ActivityProfiler::iterationTargetMatch( - const libkineto::CpuTraceBuffer& trace) { + libkineto::CpuTraceBuffer& trace) { const string& name = trace.span.name; bool match = (name == netIterationsTarget_); if (!match && applyNetFilterInternal(name) && @@ -55,6 +53,7 @@ bool ActivityProfiler::iterationTargetMatch( } if (match) { netIterationsTarget_ = name; + trace.span.tracked = true; LOG(INFO) << "Tracking net " << name << " for " << netIterationsTargetCount_ << " iterations"; } @@ -127,6 +126,7 @@ void ActivityProfiler::processTraceInternal(ActivityLogger& logger) { << " CPU buffers"; VLOG(0) << "Profile time range: " << captureWindowStartTime_ << " - " << captureWindowEndTime_; + logger.handleTraceStart(metadata_); for (auto& cpu_trace : traceBuffers_->cpu) { string trace_name = cpu_trace->span.name; VLOG(0) << "Processing CPU buffer for " << trace_name << " (" @@ -159,13 +159,17 @@ void ActivityProfiler::processTraceInternal(ActivityLogger& logger) { } #endif // HAS_CUPTI + for (const auto& session : sessions_){ + LOG(INFO) << "Processing child profiler trace"; + session->processTrace(logger); + } + finalizeTrace(*config_, logger); } ActivityProfiler::CpuGpuSpanPair& ActivityProfiler::recordTraceSpan( TraceSpan& span, int gpuOpCount) { - TraceSpan gpu_span{ - 0, 0, gpuOpCount, span.iteration, span.name, "GPU: "}; + TraceSpan gpu_span(gpuOpCount, span.iteration, span.name, "GPU: "); auto& iterations = traceSpans_[span.name]; iterations.push_back({span, gpu_span}); return iterations.back(); @@ -183,11 +187,9 @@ void ActivityProfiler::processCpuTrace( CpuGpuSpanPair& span_pair = recordTraceSpan(cpuTrace.span, cpuTrace.gpuOpCount); TraceSpan& cpu_span = span_pair.first; for (auto const& act : cpuTrace.activities) { - VLOG(2) << act.correlationId() << ": OP " << act.opType - << " tid: " << act.pthreadId; - if (logTrace) { - logger.handleCpuActivity(act, cpu_span); - recordThreadInfo(act.sysThreadId, act.pthreadId); + VLOG(2) << act.correlationId() << ": OP " << act.activityName; + if (logTrace && config_->selectedActivityTypes().count(act.type())) { + act.log(logger); } // Stash event so we can look it up later when processing GPU trace externalEvents_.insertEvent(&act); @@ -195,9 +197,6 @@ void ActivityProfiler::processCpuTrace( } if (logTrace) { logger.handleTraceSpan(cpu_span); - if (cpu_span.name == netIterationsTarget_) { - logger.handleIterationStart(cpu_span); - } } else { disabledTraceSpans_.insert(cpu_span.name); } @@ -206,48 +205,28 @@ void ActivityProfiler::processCpuTrace( #ifdef HAS_CUPTI inline void ActivityProfiler::handleCorrelationActivity( const CUpti_ActivityExternalCorrelation* correlation) { - switch(correlation->externalKind) { - case CUPTI_EXTERNAL_CORRELATION_KIND_CUSTOM0: - externalEvents_.addCorrelation( - correlation->externalId, - correlation->correlationId, - ExternalEventMap::CorrelationFlowType::Default); - break; - case CUPTI_EXTERNAL_CORRELATION_KIND_CUSTOM1: - externalEvents_.addCorrelation( - correlation->externalId, - correlation->correlationId, - ExternalEventMap::CorrelationFlowType::User); - break; - default: - LOG(ERROR) << "Received correlation activity with undefined kind: " - << correlation->externalKind; - break; - } - VLOG(2) << correlation->correlationId - << ": CUPTI_ACTIVITY_KIND_EXTERNAL_CORRELATION"; + externalEvents_.addCorrelation( + correlation->externalId, correlation->correlationId); } #endif // HAS_CUPTI -const libkineto::ClientTraceActivity& -ActivityProfiler::ExternalEventMap::getClientTraceActivity( - uint32_t id, CorrelationFlowType flowType) { - static const libkineto::ClientTraceActivity nullOp_{}; +const libkineto::GenericTraceActivity& +ActivityProfiler::ExternalEventMap::correlatedActivity(uint32_t id) { + static const libkineto::GenericTraceActivity nullOp_( + defaultTraceSpan().first, ActivityType::CPU_OP, "NULL"); - auto& correlationMap = getCorrelationMap(flowType); - - auto* res = events_[correlationMap[id]]; + auto* res = events_[correlationMap_[id]]; if (res == nullptr) { // Entry may be missing because cpu trace hasn't been processed yet // Insert a dummy element so that we can check for this in insertEvent - events_[correlationMap[id]] = &nullOp_; + events_[correlationMap_[id]] = &nullOp_; res = &nullOp_; } return *res; } void ActivityProfiler::ExternalEventMap::insertEvent( - const libkineto::ClientTraceActivity* op) { + const libkineto::GenericTraceActivity* op) { if (events_[op->correlationId()] != nullptr) { LOG_EVERY_N(WARNING, 100) << "Events processed out of order - link will be missing"; @@ -256,52 +235,59 @@ void ActivityProfiler::ExternalEventMap::insertEvent( } void ActivityProfiler::ExternalEventMap::addCorrelation( - uint64_t external_id, uint32_t cuda_id, CorrelationFlowType flowType) { - switch(flowType){ - case Default: - defaultCorrelationMap_[cuda_id] = external_id; - break; - case User: - userCorrelationMap_[cuda_id] = external_id; - break; - } + uint64_t external_id, uint32_t cuda_id) { + correlationMap_[cuda_id] = external_id; } -static void initUserGpuSpan(GenericTraceActivity& userTraceActivity, - const libkineto::TraceActivity& cpuTraceActivity, - const libkineto::TraceActivity& gpuTraceActivity) { - userTraceActivity.device = gpuTraceActivity.deviceId(); - userTraceActivity.resource = gpuTraceActivity.resourceId(); - userTraceActivity.startTime = gpuTraceActivity.timestamp(); - userTraceActivity.endTime = gpuTraceActivity.timestamp() + gpuTraceActivity.duration(); - userTraceActivity.correlation = cpuTraceActivity.correlationId(); - userTraceActivity.activityType = cpuTraceActivity.type(); - userTraceActivity.activityName = cpuTraceActivity.name(); +static GenericTraceActivity createUserGpuSpan( + const libkineto::TraceActivity& cpuTraceActivity, + const libkineto::TraceActivity& gpuTraceActivity) { + GenericTraceActivity res( + *cpuTraceActivity.traceSpan(), + ActivityType::GPU_USER_ANNOTATION, + cpuTraceActivity.name()); + res.startTime = gpuTraceActivity.timestamp(); + res.device = gpuTraceActivity.deviceId(); + res.resource = gpuTraceActivity.resourceId(); + res.endTime = + gpuTraceActivity.timestamp() + gpuTraceActivity.duration(); + res.id = cpuTraceActivity.correlationId(); + return res; } void ActivityProfiler::GpuUserEventMap::insertOrExtendEvent( - const TraceActivity& cpuTraceActivity, - const TraceActivity& gpuTraceActivity) { - StreamKey key(gpuTraceActivity.deviceId(), gpuTraceActivity.resourceId()); - CorrelationSpanMap& correlationSpanMap = streamSpanMap[key]; - if (correlationSpanMap.count(cpuTraceActivity.correlationId()) == 0) { - GenericTraceActivity& userTraceActivity = correlationSpanMap[cpuTraceActivity.correlationId()]; - initUserGpuSpan(userTraceActivity, cpuTraceActivity, gpuTraceActivity); - } - GenericTraceActivity& userTraceActivity = correlationSpanMap[cpuTraceActivity.correlationId()]; - if (gpuTraceActivity.timestamp() < userTraceActivity.startTime || userTraceActivity.startTime == 0) { - userTraceActivity.startTime = gpuTraceActivity.timestamp(); - } - if ((gpuTraceActivity.timestamp() + gpuTraceActivity.duration()) > userTraceActivity.endTime) { - userTraceActivity.endTime = gpuTraceActivity.timestamp() + gpuTraceActivity.duration(); + const TraceActivity&, + const TraceActivity& gpuActivity) { + const TraceActivity& cpuActivity = *gpuActivity.linkedActivity(); + StreamKey key(gpuActivity.deviceId(), gpuActivity.resourceId()); + CorrelationSpanMap& correlationSpanMap = streamSpanMap_[key]; + auto it = correlationSpanMap.find(cpuActivity.correlationId()); + if (it == correlationSpanMap.end()) { + auto it_success = correlationSpanMap.insert({ + cpuActivity.correlationId(), createUserGpuSpan(cpuActivity, gpuActivity) + }); + it = it_success.first; + } + GenericTraceActivity& span = it->second; + if (gpuActivity.timestamp() < span.startTime || span.startTime == 0) { + span.startTime = gpuActivity.timestamp(); + } + int64_t gpu_activity_end = gpuActivity.timestamp() + gpuActivity.duration(); + if (gpu_activity_end > span.endTime) { + span.endTime = gpu_activity_end; } } +const ActivityProfiler::CpuGpuSpanPair& ActivityProfiler::defaultTraceSpan() { + static TraceSpan span(0, 0, "Unknown", ""); + static CpuGpuSpanPair span_pair(span, span); + return span_pair; +} + void ActivityProfiler::GpuUserEventMap::logEvents(ActivityLogger *logger) { - for (auto const& streamMapPair : streamSpanMap) { + for (auto const& streamMapPair : streamSpanMap_) { for (auto const& correlationSpanPair : streamMapPair.second) { - logger->handleGenericActivity( - correlationSpanPair.second); + correlationSpanPair.second.log(*logger); } } } @@ -332,9 +318,8 @@ inline void ActivityProfiler::handleRuntimeActivity( VLOG(2) << activity->correlationId << ": CUPTI_ACTIVITY_KIND_RUNTIME, cbid=" << activity->cbid << " tid=" << activity->threadId; - const ClientTraceActivity& ext = - externalEvents_.getClientTraceActivity(activity->correlationId, - ExternalEventMap::CorrelationFlowType::Default); + const GenericTraceActivity& ext = + externalEvents_.correlatedActivity(activity->correlationId); int32_t tid = activity->threadId; const auto& it = threadInfo_.find(tid); if (it != threadInfo_.end()) { @@ -398,26 +383,31 @@ inline void ActivityProfiler::handleGpuActivity( if (!loggingDisabled(ext)) { act.log(*logger); updateGpuNetSpan(act); - const ClientTraceActivity& extUser = - externalEvents_.getClientTraceActivity(act.correlationId(), - ExternalEventMap::CorrelationFlowType::User); + /* + const GenericTraceActivity& extUser = + externalEvents_.correlatedActivity(act.correlationId()); // Correlated CPU activity cannot have timestamp greater than the GPU activity's if (!timestampsInCorrectOrder(extUser, act)) { return; } - if (extUser.correlationId() != 0) { VLOG(2) << extUser.correlationId() << "," << act.correlationId() << " (user): "<< act.name(); - gpuUserEventMap_.insertOrExtendEvent(extUser, act); +*/ + if (config_->selectedActivityTypes().count(ActivityType::GPU_USER_ANNOTATION) && + act.linkedActivity() && + act.linkedActivity()->type() == ActivityType::USER_ANNOTATION) { + //gpuUserEventMap_.insertOrExtendEvent(act, act); } +// } } } template -inline void ActivityProfiler::handleGpuActivity(const T* act, ActivityLogger* logger) { - const ClientTraceActivity& extDefault = externalEvents_.getClientTraceActivity(act->correlationId, - ExternalEventMap::CorrelationFlowType::Default); +inline void ActivityProfiler::handleGpuActivity( + const T* act, ActivityLogger* logger) { + const GenericTraceActivity& extDefault = + externalEvents_.correlatedActivity(act->correlationId); handleGpuActivity(GpuActivity(act, extDefault), logger); } @@ -455,6 +445,24 @@ void ActivityProfiler::handleCuptiActivity(const CUpti_Activity* record, Activit } #endif // HAS_CUPTI +void ActivityProfiler::configureChildProfilers() { + // If child profilers are enabled create profiler sessions + for (auto& profiler: profilers_) { + int64_t start_time_ms = duration_cast( + profileStartTime_.time_since_epoch()).count(); + LOG(INFO) << "Running child profiler " << profiler->name() << " for " + << config_->activitiesOnDemandDuration().count() << " ms"; + auto session = profiler->configure( + start_time_ms, + config_->activitiesOnDemandDuration().count(), + std::set{ActivityType::CPU_OP} // TODO make configurable + ); + if (session) { + sessions_.push_back(std::move(session)); + } + } +} + void ActivityProfiler::configure( const Config& config, const time_point& now) { @@ -500,13 +508,13 @@ void ActivityProfiler::configure( LOG(INFO) << "Enabling GPU tracing"; cupti_.setMaxBufferSize(config_->activitiesMaxGpuBufferSize()); - time_point timestamp; + time_point timestamp; if (VLOG_IS_ON(1)) { - timestamp = high_resolution_clock::now(); + timestamp = system_clock::now(); } cupti_.enableCuptiActivities(config_->selectedActivityTypes()); if (VLOG_IS_ON(1)) { - auto t2 = high_resolution_clock::now(); + auto t2 = system_clock::now(); addOverheadSample( setupOverhead_, duration_cast(t2 - timestamp).count()); } @@ -518,6 +526,11 @@ void ActivityProfiler::configure( if (profileStartTime_ < now) { profileStartTime_ = now + config_->activitiesWarmupDuration(); } + + if (profilers_.size() > 0) { + configureChildProfilers(); + } + LOG(INFO) << "Tracing starting in " << duration_cast(profileStartTime_ - now).count() << "s"; @@ -532,6 +545,10 @@ void ActivityProfiler::startTraceInternal(const time_point& now) { libkineto::api().client()->start(); } VLOG(0) << "Warmup -> CollectTrace"; + for (auto& session: sessions_){ + LOG(INFO) << "Starting child profiler session"; + session->start(); + } currentRunloopState_ = RunloopState::CollectTrace; } @@ -541,13 +558,13 @@ void ActivityProfiler::stopTraceInternal(const time_point& now) { } #ifdef HAS_CUPTI if (!cpuOnly_) { - time_point timestamp; + time_point timestamp; if (VLOG_IS_ON(1)) { - timestamp = high_resolution_clock::now(); + timestamp = system_clock::now(); } cupti_.disableCuptiActivities(config_->selectedActivityTypes()); if (VLOG_IS_ON(1)) { - auto t2 = high_resolution_clock::now(); + auto t2 = system_clock::now(); addOverheadSample( setupOverhead_, duration_cast(t2 - timestamp).count()); } @@ -560,6 +577,10 @@ void ActivityProfiler::stopTraceInternal(const time_point& now) { static_cast::type>( currentRunloopState_.load()); } + for (auto& session: sessions_){ + LOG(INFO) << "Stopping child profiler session"; + session->stop(); + } currentRunloopState_ = RunloopState::ProcessTrace; } @@ -593,6 +614,7 @@ const time_point ActivityProfiler::performRunLoopStep( stopTraceInternal(now); resetInternal(); VLOG(0) << "Warmup -> WaitForRequest"; + break; } #endif // HAS_CUPTI @@ -618,7 +640,7 @@ const time_point ActivityProfiler::performRunLoopStep( // FIXME: Is this a good idea for synced start? { std::lock_guard guard(mutex_); - profileEndTime_ = time_point( + profileEndTime_ = time_point( microseconds(captureWindowStartTime_)) + config_->activitiesOnDemandDuration(); } @@ -656,23 +678,6 @@ const time_point ActivityProfiler::performRunLoopStep( return new_wakeup_time; } -// Extract process name from /proc/pid/cmdline. This does not have -// the 16 character limit that /proc/pid/status and /prod/pid/comm has. -const string processName(pid_t pid) { - FILE* cmdfile = fopen(fmt::format("/proc/{}/cmdline", pid).c_str(), "r"); - if (cmdfile != nullptr) { - char* command = nullptr; - int scanned = fscanf(cmdfile, "%ms", &command); - if (scanned > 0 && command) { - string ret(basename(command)); - free(command); - return ret; - } - } - VLOG(1) << "Failed to read process name for pid " << pid; - return ""; -} - void ActivityProfiler::finalizeTrace(const Config& config, ActivityLogger& logger) { LOG(INFO) << "Recorded nets:"; { @@ -683,9 +688,9 @@ void ActivityProfiler::finalizeTrace(const Config& config, ActivityLogger& logge } // Process names - string process_name = processName(getpid()); + string process_name = processName(processId()); if (!process_name.empty()) { - pid_t pid = getpid(); + int32_t pid = processId(); logger.handleProcessInfo( {pid, process_name, "CPU"}, captureWindowStartTime_); if (!cpuOnly_) { @@ -698,6 +703,7 @@ void ActivityProfiler::finalizeTrace(const Config& config, ActivityLogger& logge } } } + // Thread info for (auto pair : threadInfo_) { const auto& thread_info = pair.second; @@ -730,6 +736,8 @@ void ActivityProfiler::resetTraceData() { clientActivityTraceMap_.clear(); disabledTraceSpans_.clear(); traceBuffers_ = nullptr; + metadata_.clear(); + sessions_.clear(); } diff --git a/libkineto/src/ActivityProfiler.h b/libkineto/src/ActivityProfiler.h index 9d9958ee6..8c06e29e2 100644 --- a/libkineto/src/ActivityProfiler.h +++ b/libkineto/src/ActivityProfiler.h @@ -21,12 +21,12 @@ #include #include -#include "ProcessInfo.h" -#include "ThreadName.h" +#include "ThreadUtil.h" #include "TraceSpan.h" #include "libkineto.h" #include "output_base.h" #include "GenericTraceActivity.h" +#include "IActivityProfiler.h" namespace KINETO_NAMESPACE { @@ -102,27 +102,48 @@ class ActivityProfiler { return *config_; } + inline void recordThreadInfo() { + int32_t sysTid = systemThreadId(); + int32_t tid = threadId(); + std::lock_guard guard(mutex_); + if (threadInfo_.find(tid) == threadInfo_.end()) { + threadInfo_.emplace( + tid, + ThreadInfo(sysTid, getThreadName())); + } + } + + void addMetadata(const std::string& key, const std::string& value) { + std::lock_guard guard(mutex_); + metadata_[key] = value; + } + + void addActivityProfiler( + std::shared_ptr profiler) { + std::lock_guard guard(mutex_); + profilers_.push_back(profiler); + } + + protected: + + using CpuGpuSpanPair = std::pair; + static const CpuGpuSpanPair& defaultTraceSpan(); + private: + class ExternalEventMap { public: - enum CorrelationFlowType { - // Default flow type - Default, - // User annotated flow type - User - }; // The correlation id of the GPU activity - const libkineto::ClientTraceActivity& getClientTraceActivity( - uint32_t correlation_id, CorrelationFlowType flowType); - void insertEvent(const libkineto::ClientTraceActivity* op); + const libkineto::GenericTraceActivity& correlatedActivity( + uint32_t correlation_id); + void insertEvent(const libkineto::GenericTraceActivity* op); - void addCorrelation(uint64_t external_id, uint32_t cuda_id, CorrelationFlowType flowType); + void addCorrelation(uint64_t external_id, uint32_t cuda_id); void clear() { events_.clear(); - defaultCorrelationMap_.clear(); - userCorrelationMap_.clear(); + correlationMap_.clear(); } private: @@ -131,7 +152,7 @@ class ActivityProfiler { // but this class also fully owns the objects it is pointing to so // it's not so bad. This is done for performance reasons and is an // implementation detail of this class that might change. - std::unordered_map + std::unordered_map events_; // Cuda correlation id -> external correlation id for default events @@ -142,23 +163,7 @@ class ActivityProfiler { std::unordered_map< uint32_t, // Cuda correlation ID uint64_t> // External correlation ID - defaultCorrelationMap_; - - // Cuda correlation id -> external correlation id for user annotated - // events - // CUPTI provides a mechanism for correlating Cuda events to arbitrary - // external events, e.g.operator events from Caffe2. - // It also marks GPU activities with the Cuda event correlation ID. - // So by connecting the two, we get the complete picture. - std::unordered_map< - uint32_t, // Cuda correlation ID - uint64_t> // External correlation ID - userCorrelationMap_; - - std::unordered_map& - getCorrelationMap(CorrelationFlowType flowType) { - return flowType == User ? userCorrelationMap_ : defaultCorrelationMap_; - } + correlationMap_; }; // Map of gpu activities to user defined events @@ -173,16 +178,17 @@ class ActivityProfiler { void logEvents(ActivityLogger *logger); void clear() { - streamSpanMap.clear(); + streamSpanMap_.clear(); } private: // device id and stream name - typedef std::pair StreamKey; + using StreamKey = std::pair; // map of correlation id to TraceSpan - typedef std::unordered_map CorrelationSpanMap; - std::map streamSpanMap; + using CorrelationSpanMap = + std::unordered_map; + std::map streamSpanMap_; }; GpuUserEventMap gpuUserEventMap_; @@ -205,6 +211,8 @@ class ActivityProfiler { void finalizeTrace(const Config& config, ActivityLogger& logger); + void configureChildProfilers(); + // Process a single CPU trace void processCpuTrace( libkineto::CpuTraceBuffer& cpuTrace, @@ -219,13 +227,11 @@ class ActivityProfiler { // Record client trace span for subsequent lookups from activities // Also creates a corresponding GPU-side span. - using CpuGpuSpanPair = std::pair; CpuGpuSpanPair& recordTraceSpan(TraceSpan& span, int gpuOpCount); // Returns true if net name is to be tracked for a specified number of // iterations. - bool iterationTargetMatch( - const libkineto::CpuTraceBuffer& trace); + bool iterationTargetMatch(libkineto::CpuTraceBuffer& trace); // net name to id int netId(const std::string& netName); @@ -249,21 +255,13 @@ class ActivityProfiler { // Is logging disabled for this event? // Logging can be disabled due to operator count, net name filter etc. - inline bool loggingDisabled(const libkineto::TraceActivity& act) { + inline bool loggingDisabled(const libkineto::TraceActivity& act) const { const auto& it = clientActivityTraceMap_.find(act.correlationId()); return it != clientActivityTraceMap_.end() && disabledTraceSpans_.find(it->second->first.name) != disabledTraceSpans_.end(); } - inline void recordThreadInfo(pid_t tid, pthread_t pthreadId) { - if (threadInfo_.find((int32_t)pthreadId) == threadInfo_.end()) { - threadInfo_.emplace( - (int32_t)pthreadId, - ThreadInfo((int32_t) tid, getThreadName(tid))); - } - } - void resetTraceData(); void addOverheadSample(profilerOverhead& counter, int64_t overhead) { @@ -307,7 +305,8 @@ class ActivityProfiler { // Maintain a map of client trace activity to trace span. // Maps correlation id -> TraceSpan* held by traceSpans_. - std::unordered_map clientActivityTraceMap_; + using ActivityTraceMap = std::unordered_map; + ActivityTraceMap clientActivityTraceMap_; // Cache thread names and system thread ids for pthread ids // Note we're using the lower 32 bits of the (opaque) pthread id @@ -365,6 +364,14 @@ class ActivityProfiler { // Buffers where trace data is stored std::unique_ptr traceBuffers_; + // Trace metadata + std::unordered_map metadata_; + + // child activity profilers + std::vector> profilers_; + + // a vector of active profiler plugin sessions + std::vector> sessions_; }; } // namespace KINETO_NAMESPACE diff --git a/libkineto/src/ActivityProfilerController.cpp b/libkineto/src/ActivityProfilerController.cpp index e85421442..64b049b74 100644 --- a/libkineto/src/ActivityProfilerController.cpp +++ b/libkineto/src/ActivityProfilerController.cpp @@ -10,9 +10,10 @@ #include #include +#include "ActivityLoggerFactory.h" #include "ActivityTrace.h" #include "CuptiActivityInterface.h" -#include "ThreadName.h" +#include "ThreadUtil.h" #include "output_json.h" #include "output_membuf.h" @@ -22,8 +23,7 @@ using namespace std::chrono; namespace KINETO_NAMESPACE { -constexpr milliseconds kDefaultInactiveProfilerIntervalMsecs(1000); -constexpr milliseconds kDefaultActiveProfilerIntervalMsecs(200); +constexpr milliseconds kProfilerIntervalMsecs(1000); ActivityProfilerController::ActivityProfilerController(bool cpuOnly) { profiler_ = std::make_unique(CuptiActivityInterface::singleton(), cpuOnly); @@ -40,31 +40,29 @@ ActivityProfilerController::~ActivityProfilerController() { VLOG(0) << "Stopped activity profiler"; } +static ActivityLoggerFactory initLoggerFactory() { + ActivityLoggerFactory factory; + factory.addProtocol("file", [](const std::string& url) { + return std::unique_ptr(new ChromeTraceLogger(url)); + }); + return factory; +} + static ActivityLoggerFactory& loggerFactory() { - static ActivityLoggerFactory factory{nullptr}; + static ActivityLoggerFactory factory = initLoggerFactory(); return factory; } -void ActivityProfilerController::setLoggerFactory( - const ActivityLoggerFactory& factory) { - loggerFactory() = factory; +void ActivityProfilerController::addLoggerFactory( + const std::string& protocol, ActivityLoggerFactory::FactoryFunc factory) { + loggerFactory().addProtocol(protocol, factory); } static std::unique_ptr makeLogger(const Config& config) { if (config.activitiesLogToMemory()) { return std::make_unique(config); } - if (loggerFactory()) { - return loggerFactory()(config); - } - return std::make_unique( - config.activitiesLogFile(), - CuptiActivityInterface::singleton().smCount()); -} - -static milliseconds profilerInterval(bool profilerActive) { - return profilerActive ? kDefaultActiveProfilerIntervalMsecs - : kDefaultInactiveProfilerIntervalMsecs; + return loggerFactory().makeLogger(config.activitiesLogUrl()); } void ActivityProfilerController::profilerLoop() { @@ -72,7 +70,7 @@ void ActivityProfilerController::profilerLoop() { VLOG(0) << "Entering activity profiler loop"; auto now = system_clock::now(); - auto next_wakeup_time = now + profilerInterval(false); + auto next_wakeup_time = now + kProfilerIntervalMsecs; while (!stopRunloop_) { now = system_clock::now(); @@ -94,7 +92,7 @@ void ActivityProfilerController::profilerLoop() { } while (next_wakeup_time < now) { - next_wakeup_time += kDefaultActiveProfilerIntervalMsecs; + next_wakeup_time += kProfilerIntervalMsecs; } if (profiler_->isActive()) { @@ -144,7 +142,12 @@ std::unique_ptr ActivityProfilerController::stopTrace() auto logger = std::make_unique(profiler_->config()); profiler_->processTrace(*logger); profiler_->reset(); - return std::make_unique(std::move(logger), CuptiActivityInterface::singleton()); + return std::make_unique(std::move(logger), loggerFactory()); +} + +void ActivityProfilerController::addMetadata( + const std::string& key, const std::string& value) { + profiler_->addMetadata(key, value); } } // namespace KINETO_NAMESPACE diff --git a/libkineto/src/ActivityProfilerController.h b/libkineto/src/ActivityProfilerController.h index 33540109f..56eab1109 100644 --- a/libkineto/src/ActivityProfilerController.h +++ b/libkineto/src/ActivityProfilerController.h @@ -11,6 +11,7 @@ #include #include +#include "ActivityLoggerFactory.h" #include "ActivityProfiler.h" #include "ActivityProfilerInterface.h" #include "ActivityTraceInterface.h" @@ -20,9 +21,6 @@ namespace KINETO_NAMESPACE { class Config; -using ActivityLoggerFactory = - std::function(const Config&)>; - class ActivityProfilerController { public: explicit ActivityProfilerController(bool cpuOnly); @@ -32,7 +30,9 @@ class ActivityProfilerController { ~ActivityProfilerController(); - static void setLoggerFactory(const ActivityLoggerFactory& factory); + static void addLoggerFactory( + const std::string& protocol, + ActivityLoggerFactory::FactoryFunc factory); void scheduleTrace(const Config& config); @@ -57,6 +57,12 @@ class ActivityProfilerController { return profiler_->transferCpuTrace(std::move(cpuTrace)); } + void recordThreadInfo() { + profiler_->recordThreadInfo(); + } + + void addMetadata(const std::string& key, const std::string& value); + private: void profilerLoop(); diff --git a/libkineto/src/ActivityProfilerProxy.cpp b/libkineto/src/ActivityProfilerProxy.cpp index 3eddbe41c..bb8cbb06e 100644 --- a/libkineto/src/ActivityProfilerProxy.cpp +++ b/libkineto/src/ActivityProfilerProxy.cpp @@ -84,4 +84,13 @@ bool ActivityProfilerProxy::enableForRegion(const std::string& match) { return controller_->traceInclusionFilter(match); } +void ActivityProfilerProxy::addMetadata( + const std::string& key, const std::string& value) { + controller_->addMetadata(key, value); +} + +void ActivityProfilerProxy::recordThreadInfo() { + controller_->recordThreadInfo(); +} + } // namespace libkineto diff --git a/libkineto/src/ActivityProfilerProxy.h b/libkineto/src/ActivityProfilerProxy.h index 1eebfd63c..639c6d8d4 100644 --- a/libkineto/src/ActivityProfilerProxy.h +++ b/libkineto/src/ActivityProfilerProxy.h @@ -40,6 +40,8 @@ class ActivityProfilerProxy : public ActivityProfilerInterface { bool isActive() override; + void recordThreadInfo() override; + void scheduleTrace(const std::string& configStr) override; void scheduleTrace(const Config& config); @@ -58,6 +60,8 @@ class ActivityProfilerProxy : public ActivityProfilerInterface { bool enableForRegion(const std::string& match) override; + void addMetadata(const std::string& key, const std::string& value) override; + private: bool cpuOnly_{true}; ActivityProfilerController* controller_{nullptr}; diff --git a/libkineto/src/ActivityTrace.h b/libkineto/src/ActivityTrace.h index a10a25cfc..cd730c2e8 100644 --- a/libkineto/src/ActivityTrace.h +++ b/libkineto/src/ActivityTrace.h @@ -10,8 +10,8 @@ #include #include +#include "ActivityLoggerFactory.h" #include "ActivityTraceInterface.h" -#include "CuptiActivityInterface.h" #include "output_json.h" #include "output_membuf.h" @@ -20,22 +20,31 @@ namespace libkineto { class ActivityTrace : public ActivityTraceInterface { public: ActivityTrace( - std::unique_ptr logger, - CuptiActivityInterface& cuptiActivities) - : logger_(std::move(logger)), cuptiActivities_(cuptiActivities) {} + std::unique_ptr tmpLogger, + const ActivityLoggerFactory& factory) + : memLogger_(std::move(tmpLogger)), + loggerFactory_(factory) { + } const std::vector>* activities() override { - return logger_->traceActivities(); + return memLogger_->traceActivities(); }; - void save(const std::string& path) override { - ChromeTraceLogger chrome_logger(path, cuptiActivities_.smCount()); - return logger_->log(chrome_logger); + void save(const std::string& url) override { + std::string prefix; + // if no protocol is specified, default to file + if (url.find("://") == url.npos) { + prefix = "file://"; + } + memLogger_->log(*loggerFactory_.makeLogger(prefix + url)); }; private: - std::unique_ptr logger_; - CuptiActivityInterface& cuptiActivities_; + // Activities are logged into a buffer + std::unique_ptr memLogger_; + + // Alternative logger used by save() if protocol prefix is specified + const ActivityLoggerFactory& loggerFactory_; }; } // namespace libkineto diff --git a/libkineto/src/ActivityType.cpp b/libkineto/src/ActivityType.cpp new file mode 100644 index 000000000..403aa2d1b --- /dev/null +++ b/libkineto/src/ActivityType.cpp @@ -0,0 +1,60 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * All rights reserved. + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include "ActivityType.h" + +#include + +namespace libkineto { + +struct ActivityTypeName { + const char* name; + ActivityType type; +}; + +static constexpr std::array map{{ + {"cpu_op", ActivityType::CPU_OP}, + {"user_annotation", ActivityType::USER_ANNOTATION}, + {"gpu_user_Annotation", ActivityType::GPU_USER_ANNOTATION}, + {"gpu_memcpy", ActivityType::GPU_MEMCPY}, + {"gpu_memset", ActivityType::GPU_MEMSET}, + {"kernel", ActivityType::CONCURRENT_KERNEL}, + {"external_correlation", ActivityType::EXTERNAL_CORRELATION}, + {"cuda_runtime", ActivityType::CUDA_RUNTIME}, + {"glow_runtime", ActivityType::GLOW_RUNTIME}, + {"cpu_instant_event", ActivityType::CPU_INSTANT_EVENT}, + {"ENUM_COUNT", ActivityType::ENUM_COUNT} +}}; + +static constexpr bool matchingOrder(int idx = 0) { + return map[idx].type == ActivityType::ENUM_COUNT || + ((idx == (int) map[idx].type) && matchingOrder(idx + 1)); +} +static_assert(matchingOrder(), "ActivityTypeName map is out of order"); + +const char* toString(ActivityType t) { + return map[(int)t].name; +} + +ActivityType toActivityType(const std::string& str) { + for (int i = 0; i < activityTypeCount; i++) { + if (str == map[i].name) { + return map[i].type; + } + } + throw std::invalid_argument(fmt::format("Invalid activity type: {}", str)); +} + +const std::array activityTypes() { + std::array res; + for (int i = 0; i < activityTypeCount; i++) { + res[i] = map[i].type; + } + return res; +} + +} // namespace libkineto diff --git a/libkineto/src/Config.cpp b/libkineto/src/Config.cpp index 4fb60e71b..154ed5ac4 100644 --- a/libkineto/src/Config.cpp +++ b/libkineto/src/Config.cpp @@ -8,8 +8,8 @@ #include "Config.h" #include -#include +#include #include #include #include @@ -17,8 +17,10 @@ #include #include #include +#include #include "Logger.h" +#include "ThreadUtil.h" using namespace std::chrono; @@ -34,7 +36,7 @@ constexpr int kDefaultActivitiesExternalAPIIterations(3); constexpr int kDefaultActivitiesExternalAPINetSizeThreshold(0); constexpr int kDefaultActivitiesExternalAPIGpuOpCountThreshold(0); constexpr int kDefaultActivitiesMaxGpuBufferSize(128 * 1024 * 1024); -constexpr seconds kDefaultActivitiesWarmupDurationSecs(15); +constexpr seconds kDefaultActivitiesWarmupDurationSecs(5); constexpr seconds kDefaultReportPeriodSecs(1); constexpr int kDefaultSamplesPerReport(1); constexpr int kDefaultMaxEventProfilersPerGpu(1); @@ -59,6 +61,7 @@ const string kHeartbeatMonitorPeriodKey = const string kActivitiesEnabledKey = "ACTIVITIES_ENABLED"; const string kActivityTypesKey = "ACTIVITY_TYPES"; const string kActivitiesLogFileKey = "ACTIVITIES_LOG_FILE"; +const string kActivitiesLogUrlKey = "ACTIVITIES_LOG_URL"; const string kActivitiesDurationKey = "ACTIVITIES_DURATION_SECS"; const string kActivitiesDurationMsecsKey = "ACTIVITIES_DURATION_MSECS"; const string kActivitiesIterationsKey = "ACTIVITIES_ITERATIONS"; @@ -70,13 +73,6 @@ const string kActivitiesWarmupDurationSecsKey = "ACTIVITIES_WARMUP_PERIOD_SECS"; const string kActivitiesMaxGpuBufferSizeKey = "ACTIVITIES_MAX_GPU_BUFFER_SIZE_MB"; -// Valid configuration file entries for activity types -const string kActivityMemcpy = "gpu_memcpy"; -const string kActivityMemset = "gpu_memset"; -const string kActivityConcurrentKernel = "concurrent_kernel"; -const string kActivityExternalCorrelation = "external_correlation"; -const string kActivityRuntime = "cuda_runtime"; - const string kDefaultLogFileFmt = "/tmp/libkineto_activities_{}.json"; // Common @@ -115,21 +111,21 @@ const string kConfigFile = "/etc/libkineto.conf"; // Max devices supported on any system constexpr uint8_t kMaxDevices = 8; -static std::map>& +static std::map>& configFactories() { - static std::map> + static std::map> factories; return factories; } void Config::addConfigFactory( std::string name, - std::function factory) { + std::function factory) { configFactories()[name] = factory; } static string defaultTraceFileName() { - return fmt::format(kDefaultLogFileFmt, getpid()); + return fmt::format(kDefaultLogFileFmt, processId()); } Config::Config() @@ -172,55 +168,38 @@ const seconds Config::maxRequestAge() const { return kMaxRequestAge; } -static char* printTime(time_point t, char* buf, int size) { +std::string getTimeStr(time_point t) { std::time_t t_c = system_clock::to_time_t(t); - std::tm lt; - localtime_r(&t_c, <); - std::strftime(buf, size, "%H:%M:%S", <); - return buf; + return fmt::format("{:%H:%M:%S}", fmt::localtime(t_c)); } static time_point handleRequestTimestamp(int64_t ms) { auto t = time_point(milliseconds(ms)); auto now = system_clock::now(); - char buf[32]; if (t > now) { throw std::invalid_argument(fmt::format( "Invalid {}: {} - time is in future", kRequestTimestampKey, - printTime(t, buf, sizeof(buf)))); + getTimeStr(t))); } else if ((now - t) > kMaxRequestAge) { throw std::invalid_argument(fmt::format( "Invalid {}: {} - time is more than {}s in the past", kRequestTimestampKey, - printTime(t, buf, sizeof(buf)), + getTimeStr(t), kMaxRequestAge.count())); } return t; } -void Config::addActivityTypes( +void Config::setActivityTypes( const std::vector& selected_activities) { + selectedActivityTypes_.clear(); if (selected_activities.size() > 0) { for (const auto& activity : selected_activities) { if (activity == "") { continue; - } else if (activity == kActivityMemcpy) { - selectedActivityTypes_.insert(ActivityType::GPU_MEMCPY); - } else if (activity == kActivityMemset) { - selectedActivityTypes_.insert(ActivityType::GPU_MEMSET); - } else if (activity == kActivityConcurrentKernel) { - selectedActivityTypes_.insert(ActivityType::CONCURRENT_KERNEL); - } else if (activity == kActivityExternalCorrelation) { - selectedActivityTypes_.insert(ActivityType::EXTERNAL_CORRELATION); - } else if (activity == kActivityRuntime) { - selectedActivityTypes_.insert(ActivityType::CUDA_RUNTIME); - } else { - throw std::invalid_argument(fmt::format( - "Invalid activity type selected: {}", - activity - )); } + selectedActivityTypes_.insert(toActivityType(activity)); } } } @@ -233,9 +212,6 @@ bool Config::handleOption(const std::string& name, std::string& val) { } else if (name == kMetricsKey) { vector metric_names = splitAndTrim(val, ','); metricNames_.insert(metric_names.begin(), metric_names.end()); - } else if (name == kActivityTypesKey) { - vector activity_types = splitAndTrim(toLower(val), ','); - addActivityTypes(activity_types); } else if (name == kSamplePeriodKey) { samplePeriod_ = milliseconds(toInt32(val)); } else if (name == kMultiplexPeriodKey) { @@ -262,6 +238,9 @@ bool Config::handleOption(const std::string& name, std::string& val) { activitiesOnDemandDuration_ = duration_cast(seconds(toInt32(val))); activitiesOnDemandTimestamp_ = timestamp(); + } else if (name == kActivityTypesKey) { + vector activity_types = splitAndTrim(toLower(val), ','); + setActivityTypes(activity_types); } else if (name == kActivitiesDurationMsecsKey) { activitiesOnDemandDuration_ = milliseconds(toInt32(val)); activitiesOnDemandTimestamp_ = timestamp(); @@ -284,6 +263,7 @@ bool Config::handleOption(const std::string& name, std::string& val) { activityProfilerEnabled_ = toBool(val); } else if (name == kActivitiesLogFileKey) { activitiesLogFile_ = val; + activitiesLogUrl_ = fmt::format("file://{}", val); activitiesOnDemandTimestamp_ = timestamp(); } else if (name == kActivitiesMaxGpuBufferSizeKey) { activitiesMaxGpuBufferSize_ = toInt32(val) * 1024 * 1024; @@ -309,7 +289,7 @@ std::chrono::milliseconds Config::activitiesOnDemandDurationDefault() const { }; void Config::updateActivityProfilerRequestReceivedTime() { - activitiesOnDemandTimestamp_ = high_resolution_clock::now(); + activitiesOnDemandTimestamp_ = system_clock::now(); } void Config::setClientDefaults() { @@ -386,9 +366,8 @@ void Config::printActivityProfilerConfig(std::ostream& s) const { << std::endl; if (hasRequestTimestamp()) { std::time_t t_c = system_clock::to_time_t(requestTimestamp()); - std::tm tm; s << "Trace request client timestamp: " - << std::put_time(localtime_r(&t_c, &tm), "%F %T") << std::endl; + << fmt::format("{:%Y-%m-%d %H:%M:%S}", fmt::localtime(t_c)) << std::endl; } s << "Trace duration: " << activitiesOnDemandDuration().count() << "ms" << std::endl; @@ -401,30 +380,12 @@ void Config::printActivityProfilerConfig(std::ostream& s) const { s << "Max GPU buffer size: " << activitiesMaxGpuBufferSize() / 1024 / 1024 << "MB" << std::endl; - s << "Enabled activities: "; + std::vector activities; for (const auto& activity : selectedActivityTypes_) { - switch(activity){ - case ActivityType::GPU_MEMCPY: - s << kActivityMemcpy << " "; - break; - case ActivityType::GPU_MEMSET: - s << kActivityMemset << " "; - break; - case ActivityType::CONCURRENT_KERNEL: - s << kActivityConcurrentKernel << " "; - break; - case ActivityType::EXTERNAL_CORRELATION: - s << kActivityExternalCorrelation << " "; - break; - case ActivityType::CUDA_RUNTIME: - s << kActivityRuntime << " "; - break; - default: - s << "UNKNOWN_ACTIVITY_NAME" << " "; - break; - } + activities.push_back(toString(activity)); } - s << std::endl; + s << "Enabled activities: " + << fmt::format("{}", fmt::join(activities, ",")) << std::endl; AbstractConfig::printActivityProfilerConfig(s); } diff --git a/libkineto/src/Config.h b/libkineto/src/Config.h index d2e1152d4..e8c94631f 100644 --- a/libkineto/src/Config.h +++ b/libkineto/src/Config.h @@ -53,6 +53,15 @@ class Config : public AbstractConfig { return activitiesLogFile_; } + // Log activitiy trace to this url + const std::string& activitiesLogUrl() const { + return activitiesLogUrl_; + } + + void setActivitiesLogUrl(const std::string& url) { + activitiesLogUrl_ = url; + } + bool activitiesLogToMemory() const { return activitiesLogToMemory_; } @@ -249,17 +258,17 @@ class Config : public AbstractConfig { return duration - (duration % alignment); } - std::chrono::time_point + std::chrono::time_point eventProfilerOnDemandStartTime() const { return eventProfilerOnDemandTimestamp_; } - std::chrono::time_point + std::chrono::time_point eventProfilerOnDemandEndTime() const { return eventProfilerOnDemandTimestamp_ + eventProfilerOnDemandDuration_; } - std::chrono::time_point + std::chrono::time_point activityProfilerRequestReceivedTime() const { return activitiesOnDemandTimestamp_; } @@ -272,7 +281,7 @@ class Config : public AbstractConfig { static void addConfigFactory( std::string name, - std::function factory); + std::function factory); void print(std::ostream& s) const; @@ -289,16 +298,14 @@ class Config : public AbstractConfig { // Adds valid activity types from the user defined string list in the // configuration file - void addActivityTypes(const std::vector& selected_activities); + void setActivityTypes(const std::vector& selected_activities); // Sets the default activity types to be traced void selectDefaultActivityTypes() { // If the user has not specified an activity list, add all types - selectedActivityTypes_.insert(ActivityType::GPU_MEMCPY); - selectedActivityTypes_.insert(ActivityType::GPU_MEMSET); - selectedActivityTypes_.insert(ActivityType::CONCURRENT_KERNEL); - selectedActivityTypes_.insert(ActivityType::EXTERNAL_CORRELATION); - selectedActivityTypes_.insert(ActivityType::CUDA_RUNTIME); + for (ActivityType t : activityTypes()) { + selectedActivityTypes_.insert(t); + } } int verboseLogLevel_; @@ -315,7 +322,7 @@ class Config : public AbstractConfig { // On-demand duration std::chrono::seconds eventProfilerOnDemandDuration_; // Last on-demand request - std::chrono::time_point + std::chrono::time_point eventProfilerOnDemandTimestamp_; int eventProfilerMaxInstancesPerGpu_; @@ -337,6 +344,8 @@ class Config : public AbstractConfig { // The activity profiler settings are all on-demand std::string activitiesLogFile_; + std::string activitiesLogUrl_; + // Log activities to memory buffer bool activitiesLogToMemory_{false}; @@ -355,7 +364,7 @@ class Config : public AbstractConfig { // Only profile nets with at least this many GPU operators int activitiesExternalAPIGpuOpCountThreshold_; // Last activity profiler request - std::chrono::time_point + std::chrono::time_point activitiesOnDemandTimestamp_; // Synchronized start timestamp diff --git a/libkineto/src/ConfigLoader.cpp b/libkineto/src/ConfigLoader.cpp index 79be0725b..e2726eeb2 100644 --- a/libkineto/src/ConfigLoader.cpp +++ b/libkineto/src/ConfigLoader.cpp @@ -7,7 +7,10 @@ #include "ConfigLoader.h" +#ifdef __linux__ #include +#endif + #include #include #include @@ -26,25 +29,37 @@ namespace KINETO_NAMESPACE { using namespace libkineto; const string kConfigFileEnvVar = "KINETO_CONFIG"; +#ifdef __linux__ const string kConfigFile = "/etc/libkineto.conf"; const string kOnDemandConfigFile = "/tmp/libkineto.conf"; +#else +const string kConfigFile = "libkineto.conf"; +const string kOnDemandConfigFile = "libkineto.conf"; +#endif constexpr std::chrono::seconds kConfigUpdateIntervalSecs(300); constexpr std::chrono::seconds kOnDemandConfigUpdateIntervalSecs(5); constexpr std::chrono::seconds kOnDemandConfigVerboseLogDurationSecs(120); +#ifdef __linux__ static struct sigaction originalUsr2Handler = {}; +#endif // Use SIGUSR2 to initiate profiling. // Look for an on-demand config file. // If none is found, default to base config. // Try to not affect existing handlers static bool hasOriginalSignalHandler() { +#ifdef __linux__ return originalUsr2Handler.sa_handler != nullptr || originalUsr2Handler.sa_sigaction != nullptr; +#else + return false; +#endif } static void handle_signal(int signal) { +#ifdef __linux__ if (signal == SIGUSR2) { ConfigLoader::instance().handleOnDemandSignal(); if (hasOriginalSignalHandler()) { @@ -55,9 +70,11 @@ static void handle_signal(int signal) { sigaction(SIGUSR2, &act, &originalUsr2Handler); } } +#endif } static void setupSignalHandler(bool enableSigUsr2) { +#ifdef __linux__ if (enableSigUsr2) { struct sigaction act = {}; act.sa_handler = &handle_signal; @@ -72,6 +89,7 @@ static void setupSignalHandler(bool enableSigUsr2) { sigaction(SIGUSR2, &originalUsr2Handler, nullptr); originalUsr2Handler = {}; } +#endif } // return an empty string if reading gets any errors. Otherwise a config string. @@ -83,8 +101,8 @@ static std::string readConfigFromConfigFile(const char* filename) { conf.assign( std::istreambuf_iterator(file), std::istreambuf_iterator()); } catch (std::exception& e) { - LOG(ERROR) << "Error in reading libkineto config from config file: " - << e.what(); + VLOG(0) << "Error reading " << filename << ": " + << e.what(); conf = ""; } return conf; @@ -108,7 +126,7 @@ ConfigLoader& ConfigLoader::instance() { // return an empty string if polling gets any errors. Otherwise a config string. std::string ConfigLoader::readOnDemandConfigFromDaemon( - time_point now) { + time_point now) { if (!daemonConfigLoader_) { return ""; } @@ -137,11 +155,12 @@ ConfigLoader::ConfigLoader(LibkinetoApi& api) if (configFileName_ == nullptr) { configFileName_ = kConfigFile.data(); } - config_.parse(readConfigFromConfigFile(configFileName_)); - SET_LOG_VERBOSITY_LEVEL(config_.verboseLogLevel(), config_.verboseLogModules()); - setupSignalHandler(config_.sigUsr2Enabled()); - if (daemonConfigLoaderFactory && daemonConfigLoaderFactory()) { + if (daemonConfigLoaderFactory()) { daemonConfigLoader_ = daemonConfigLoaderFactory()(); + } + updateBaseConfig(); + SET_LOG_VERBOSITY_LEVEL(config_.verboseLogLevel(), config_.verboseLogModules()); + if (daemonConfigLoader_) { daemonConfigLoader_->setCommunicationFabric(config_.ipcFabricEnabled()); } updateThread_ = @@ -168,7 +187,15 @@ void ConfigLoader::handleOnDemandSignal() { } void ConfigLoader::updateBaseConfig() { - const std::string config_str = readConfigFromConfigFile(configFileName_); + // First try reading local config file + // If that fails, read from daemon + // TODO: Invert these once daemon path fully rolled out + std::string config_str = readConfigFromConfigFile(configFileName_); + if (config_str.empty() && daemonConfigLoader_) { + // If local config file was not successfully loaded (e.g. not found) + // then try the daemon + config_str = daemonConfigLoader_->readBaseConfig(); + } if (config_str != config_.source()) { std::lock_guard lock(configLock_); config_.~Config(); @@ -177,12 +204,12 @@ void ConfigLoader::updateBaseConfig() { if (daemonConfigLoader_) { daemonConfigLoader_->setCommunicationFabric(config_.ipcFabricEnabled()); } + setupSignalHandler(config_.sigUsr2Enabled()); } - setupSignalHandler(config_.sigUsr2Enabled()); } void ConfigLoader::configureFromSignal( - time_point now, + time_point now, Config& config) { LOG(INFO) << "Received on-demand profiling signal, " << "reading config from " << kOnDemandConfigFile.data(); @@ -215,7 +242,7 @@ void ConfigLoader::configureFromSignal( } void ConfigLoader::configureFromDaemon( - time_point now, + time_point now, Config& config) { const std::string config_str = readOnDemandConfigFromDaemon(now); LOG_IF(INFO, !config_str.empty()) << "Received config from dyno:\n" @@ -241,14 +268,19 @@ void ConfigLoader::configureFromDaemon( } void ConfigLoader::updateConfigThread() { - auto now = high_resolution_clock::now(); + auto now = system_clock::now(); auto next_config_load_time = now + configUpdateIntervalSecs_; auto next_on_demand_load_time = now + onDemandConfigUpdateIntervalSecs_; auto next_log_level_reset_time = now; - seconds interval = - std::min(configUpdateIntervalSecs_, onDemandConfigUpdateIntervalSecs_); + seconds interval = configUpdateIntervalSecs_; + if (interval > onDemandConfigUpdateIntervalSecs_) { + interval = onDemandConfigUpdateIntervalSecs_; + } auto onDemandConfig = std::make_unique(); + // Refresh config before starting loop + updateBaseConfig(); + // This can potentially sleep for long periods of time, so allow // the desctructor to wake it to avoid a 5-minute long destruct period. for (;;) { @@ -259,7 +291,7 @@ void ConfigLoader::updateConfigThread() { if (stopFlag_) { break; } - now = high_resolution_clock::now(); + now = system_clock::now(); if (now > next_config_load_time) { updateBaseConfig(); next_config_load_time = now + configUpdateIntervalSecs_; diff --git a/libkineto/src/ConfigLoader.h b/libkineto/src/ConfigLoader.h index 6b1fdf072..eb953a6be 100644 --- a/libkineto/src/ConfigLoader.h +++ b/libkineto/src/ConfigLoader.h @@ -57,12 +57,12 @@ class ConfigLoader { // Create configuration when receiving SIGUSR2 void configureFromSignal( - std::chrono::time_point now, + std::chrono::time_point now, Config& config); // Create configuration when receiving request from a daemon void configureFromDaemon( - std::chrono::time_point now, + std::chrono::time_point now, Config& config); inline bool eventProfilerRequest(const Config& config) { @@ -72,7 +72,7 @@ class ConfigLoader { } std::string readOnDemandConfigFromDaemon( - std::chrono::time_point now); + std::chrono::time_point now); LibkinetoApi& libkinetoApi_; std::mutex configLock_; diff --git a/libkineto/src/CudaDeviceProperties.cpp b/libkineto/src/CudaDeviceProperties.cpp new file mode 100644 index 000000000..19591da93 --- /dev/null +++ b/libkineto/src/CudaDeviceProperties.cpp @@ -0,0 +1,129 @@ +/* + * Copyright (c) Kineto Contributors + * All rights reserved. + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include "CudaDeviceProperties.h" + +#include +#include + +#include +#include + +#include "Logger.h" + +namespace KINETO_NAMESPACE { + +static const std::vector createDeviceProps() { + std::vector props; + int device_count; + cudaError_t error_id = cudaGetDeviceCount(&device_count); + // Return empty vector if error. + if (error_id != cudaSuccess) { + LOG(ERROR) << "cudaGetDeviceCount failed with code " << error_id; + return {}; + } + VLOG(0) << "Device count is " << device_count; + for (size_t i = 0; i < device_count; ++i) { + cudaDeviceProp prop; + error_id = cudaGetDeviceProperties(&prop, i); + // Return empty vector if any device property fail to get. + if (error_id != cudaSuccess) { + LOG(ERROR) << "cudaGetDeviceProperties failed with " << error_id; + return {}; + } + props.push_back(prop); + } + return props; +} + +static const std::vector& deviceProps() { + static const std::vector props = createDeviceProps(); + return props; +} + +static const std::string createDevicePropertiesJson( + size_t id, const cudaDeviceProp& props) { + return fmt::format(R"JSON( + {{ + "id": {}, "name": "{}", "totalGlobalMem": {}, + "computeMajor": {}, "computeMinor": {}, + "maxThreadsPerBlock": {}, "maxThreadsPerMultiprocessor": {}, + "regsPerBlock": {}, "regsPerMultiprocessor": {}, "warpSize": {}, + "sharedMemPerBlock": {}, "sharedMemPerMultiprocessor": {}, + "numSms": {}, "sharedMemPerBlockOptin": {} + }})JSON", + id, props.name, props.totalGlobalMem, + props.major, props.minor, + props.maxThreadsPerBlock, props.maxThreadsPerMultiProcessor, + props.regsPerBlock, props.regsPerMultiprocessor, props.warpSize, + props.sharedMemPerBlock, props.sharedMemPerMultiprocessor, + props.multiProcessorCount, props.sharedMemPerBlockOptin); +} + +static const std::string createDevicePropertiesJson() { + std::vector jsonProps; + const auto& props = deviceProps(); + for (size_t i = 0; i < props.size(); i++) { + jsonProps.push_back(createDevicePropertiesJson(i, props[i])); + } + return fmt::format("{}", fmt::join(jsonProps, ",")); +} + +const std::string& devicePropertiesJson() { + static std::string devicePropsJson = createDevicePropertiesJson(); + return devicePropsJson; +} + +int smCount(uint32_t deviceId) { + const std::vector &props = deviceProps(); + return deviceId >= props.size() ? 0 : + props[deviceId].multiProcessorCount; +} + +float kernelOccupancy( + uint32_t deviceId, + uint16_t registersPerThread, + int32_t staticSharedMemory, + int32_t dynamicSharedMemory, + int32_t blockX, + int32_t blockY, + int32_t blockZ, + float blocksPerSm) { + // Calculate occupancy + float occupancy = -1.0; + const std::vector &props = deviceProps(); + if (deviceId < props.size()) { + cudaOccFuncAttributes occFuncAttr; + occFuncAttr.maxThreadsPerBlock = INT_MAX; + occFuncAttr.numRegs = registersPerThread; + occFuncAttr.sharedSizeBytes = staticSharedMemory; + occFuncAttr.partitionedGCConfig = PARTITIONED_GC_OFF; + occFuncAttr.shmemLimitConfig = FUNC_SHMEM_LIMIT_DEFAULT; + occFuncAttr.maxDynamicSharedSizeBytes = 0; + const cudaOccDeviceState occDeviceState = {}; + int blockSize = blockX * blockY * blockZ; + size_t dynamicSmemSize = dynamicSharedMemory; + cudaOccResult occ_result; + cudaOccDeviceProp prop(props[deviceId]); + cudaOccError status = cudaOccMaxActiveBlocksPerMultiprocessor( + &occ_result, &prop, &occFuncAttr, &occDeviceState, + blockSize, dynamicSmemSize); + if (status == CUDA_OCC_SUCCESS) { + if (occ_result.activeBlocksPerMultiprocessor < blocksPerSm) { + blocksPerSm = occ_result.activeBlocksPerMultiprocessor; + } + occupancy = blocksPerSm * blockSize / + (float) props[deviceId].maxThreadsPerMultiProcessor; + } else { + LOG_EVERY_N(ERROR, 1000) << "Failed to calculate occupancy, status = " + << status; + } + } + return occupancy; +} + +} // namespace KINETO_NAMESPACE diff --git a/libkineto/src/CudaDeviceProperties.h b/libkineto/src/CudaDeviceProperties.h new file mode 100644 index 000000000..b731fde0c --- /dev/null +++ b/libkineto/src/CudaDeviceProperties.h @@ -0,0 +1,31 @@ +/* + * Copyright (c) Kineto Contributors + * All rights reserved. + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include +#include + +namespace KINETO_NAMESPACE { + +int smCount(uint32_t deviceId); + +// Return estimated achieved occupancy for a kernel +float kernelOccupancy( + uint32_t deviceId, + uint16_t registersPerThread, + int32_t staticSharedMemory, + int32_t dynamicSharedMemory, + int32_t blockX, + int32_t blockY, + int32_t blockZ, + float blocks_per_sm); + +// Return compute properties for each device as a json string +const std::string& devicePropertiesJson(); + +} // namespace KINETO_NAMESPACE diff --git a/libkineto/src/CuptiActivity.h b/libkineto/src/CuptiActivity.h index b05e969a2..4ac38fe62 100644 --- a/libkineto/src/CuptiActivity.h +++ b/libkineto/src/CuptiActivity.h @@ -8,10 +8,9 @@ #pragma once #include -#include -#include #include "TraceActivity.h" +#include "ThreadUtil.h" #include "cupti_strings.h" namespace libkineto { @@ -21,6 +20,7 @@ namespace libkineto { namespace KINETO_NAMESPACE { using namespace libkineto; +struct TraceSpan; // These classes wrap the various CUPTI activity types // into subclasses of TraceActivity so that they can all be accessed @@ -31,13 +31,16 @@ template struct CuptiActivity : public TraceActivity { explicit CuptiActivity(const T* activity, const TraceActivity& linked) : activity_(*activity), linked_(linked) {} - int64_t timestamp() const override {return nsToUs(activity_.start);} + int64_t timestamp() const override { + return nsToUs(activity_.start); + } int64_t duration() const override { return nsToUs(activity_.end - activity_.start); } int64_t correlationId() const override {return activity_.correlationId;} const T& raw() const {return activity_;} const TraceActivity* linkedActivity() const override {return &linked_;} + const TraceSpan* traceSpan() const override {return nullptr;} protected: const T& activity_; @@ -51,7 +54,7 @@ struct RuntimeActivity : public CuptiActivity { const TraceActivity& linked, int32_t threadId) : CuptiActivity(activity, linked), threadId_(threadId) {} - int64_t deviceId() const override {return cachedPid();} + int64_t deviceId() const override {return processId();} int64_t resourceId() const override {return threadId_;} ActivityType type() const override {return ActivityType::CUDA_RUNTIME;} const std::string name() const override {return runtimeCbidName(activity_.cbid);} @@ -76,4 +79,3 @@ struct GpuActivity : public CuptiActivity { }; } // namespace KINETO_NAMESPACE - diff --git a/libkineto/src/CuptiActivityBuffer.h b/libkineto/src/CuptiActivityBuffer.h index 8dad1aa0f..d444b9399 100644 --- a/libkineto/src/CuptiActivityBuffer.h +++ b/libkineto/src/CuptiActivityBuffer.h @@ -8,28 +8,45 @@ #pragma once #include -#include -#include +#include #include +#include +#include +#include +#include namespace KINETO_NAMESPACE { class CuptiActivityBuffer { public: - // data must be allocated using malloc. - // Ownership is transferred to this object. - CuptiActivityBuffer(uint8_t* data, size_t validSize) - : data(data), validSize(validSize) {} + explicit CuptiActivityBuffer(size_t size) : size_(size) { + buf_.reserve(size); + } + CuptiActivityBuffer() = delete; + CuptiActivityBuffer& operator=(const CuptiActivityBuffer&) = delete; + CuptiActivityBuffer(CuptiActivityBuffer&&) = default; + CuptiActivityBuffer& operator=(CuptiActivityBuffer&&) = default; + + size_t size() const { + return size_; + } + + void setSize(size_t size) { + assert(size <= buf_.capacity()); + size_ = size; + } - ~CuptiActivityBuffer() { - free(data); + uint8_t* data() { + return buf_.data(); } - // Allocated by malloc - uint8_t* data{nullptr}; + private: - // Number of bytes used - size_t validSize; + std::vector buf_; + size_t size_; }; +using CuptiActivityBufferMap = + std::map>; + } // namespace KINETO_NAMESPACE diff --git a/libkineto/src/CuptiActivityInterface.cpp b/libkineto/src/CuptiActivityInterface.cpp index d34983710..987653964 100644 --- a/libkineto/src/CuptiActivityInterface.cpp +++ b/libkineto/src/CuptiActivityInterface.cpp @@ -7,10 +7,10 @@ #include "CuptiActivityInterface.h" +#include #include #include "cupti_call.h" - #include "Logger.h" using namespace std::chrono; @@ -30,6 +30,9 @@ CuptiActivityInterface& CuptiActivityInterface::singleton() { void CuptiActivityInterface::pushCorrelationID(int id, CorrelationFlowType type) { #ifdef HAS_CUPTI + if (!singleton().externalCorrelationEnabled_) { + return; + } VLOG(2) << "pushCorrelationID(" << id << ")"; switch(type) { case Default: @@ -45,6 +48,9 @@ void CuptiActivityInterface::pushCorrelationID(int id, CorrelationFlowType type) void CuptiActivityInterface::popCorrelationID(CorrelationFlowType type) { #ifdef HAS_CUPTI + if (!singleton().externalCorrelationEnabled_) { + return; + } switch(type) { case Default: CUPTI_CALL(cuptiActivityPopExternalCorrelationId( @@ -121,40 +127,53 @@ void CUPTIAPI CuptiActivityInterface::bufferRequestedTrampoline( singleton().bufferRequested(buffer, size, maxNumRecords); } -void CuptiActivityInterface::bufferRequested(uint8_t** buffer, size_t* size, size_t* maxNumRecords) { - if (allocatedGpuBufferCount >= maxGpuBufferCount_) { +void CuptiActivityInterface::bufferRequested( + uint8_t** buffer, size_t* size, size_t* maxNumRecords) { + std::lock_guard guard(mutex_); + if (allocatedGpuTraceBuffers_.size() >= maxGpuBufferCount_) { stopCollection = true; LOG(WARNING) << "Exceeded max GPU buffer count (" - << allocatedGpuBufferCount + << allocatedGpuTraceBuffers_.size() + << " > " << maxGpuBufferCount_ << ") - terminating tracing"; } + auto buf = std::make_unique(kBufSize); + *buffer = buf->data(); *size = kBufSize; - *maxNumRecords = 0; - // TODO(xdwang): create a list of buffers in advance so that we can reuse. - // This saves time to dynamically allocate new buffers (which could be costly - // if we allocated new space from the heap) - *buffer = (uint8_t*) malloc(kBufSize); + allocatedGpuTraceBuffers_[*buffer] = std::move(buf); - allocatedGpuBufferCount++; + *maxNumRecords = 0; } #endif -std::unique_ptr> CuptiActivityInterface::activityBuffers() { +std::unique_ptr +CuptiActivityInterface::activityBuffers() { + { + std::lock_guard guard(mutex_); + if (allocatedGpuTraceBuffers_.empty()) { + return nullptr; + } + } + #ifdef HAS_CUPTI VLOG(1) << "Flushing GPU activity buffers"; - time_point t1; + time_point t1; if (VLOG_IS_ON(1)) { - t1 = high_resolution_clock::now(); + t1 = system_clock::now(); } + // Can't hold mutex_ during this call, since bufferCompleted + // will be called by libcupti and mutex_ is acquired there. CUPTI_CALL(cuptiActivityFlushAll(CUPTI_ACTIVITY_FLAG_FLUSH_FORCED)); if (VLOG_IS_ON(1)) { flushOverhead = - duration_cast(high_resolution_clock::now() - t1).count(); + duration_cast(system_clock::now() - t1).count(); } #endif - return std::move(gpuTraceBuffers_); + std::lock_guard guard(mutex_); + // Transfer ownership of buffers to caller. A new map is created on-demand. + return std::move(readyGpuTraceBuffers_); } #ifdef HAS_CUPTI @@ -175,35 +194,37 @@ int CuptiActivityInterface::processActivitiesForBuffer( #endif const std::pair CuptiActivityInterface::processActivities( - std::list& buffers, + CuptiActivityBufferMap& buffers, std::function handler) { std::pair res{0, 0}; #ifdef HAS_CUPTI - for (auto& buf : buffers) { + for (auto& pair : buffers) { // No lock needed - only accessed from this thread - res.first += processActivitiesForBuffer(buf.data, buf.validSize, handler); - res.second += buf.validSize; + auto& buf = pair.second; + res.first += processActivitiesForBuffer(buf->data(), buf->size(), handler); + res.second += buf->size(); } #endif return res; } void CuptiActivityInterface::clearActivities() { + { + std::lock_guard guard(mutex_); + if (allocatedGpuTraceBuffers_.empty()) { + return; + } + } + // Can't hold mutex_ during this call, since bufferCompleted + // will be called by libcupti and mutex_ is acquired there. CUPTI_CALL(cuptiActivityFlushAll(0)); // FIXME: We might want to make sure we reuse // the same memory during warmup and tracing. // Also, try to use the amount of memory required // for active tracing during warmup. - if (gpuTraceBuffers_) { - gpuTraceBuffers_->clear(); - } -} - -void CuptiActivityInterface::addActivityBuffer(uint8_t* buffer, size_t validSize) { - if (!gpuTraceBuffers_) { - gpuTraceBuffers_ = std::make_unique>(); - } - gpuTraceBuffers_->emplace_back(buffer, validSize); + std::lock_guard guard(mutex_); + // Throw away ready buffers as a result of above flush + readyGpuTraceBuffers_ = nullptr; } #ifdef HAS_CUPTI @@ -222,12 +243,22 @@ void CuptiActivityInterface::bufferCompleted( uint8_t* buffer, size_t /* unused */, size_t validSize) { - allocatedGpuBufferCount--; - // lock should be uncessary here, because gpuTraceBuffers is read/written by - // profilerLoop only. CUPTI should handle the cuptiActivityFlushAll and - // bufferCompleted, so that there is no concurrency issues - addActivityBuffer(buffer, validSize); + std::lock_guard guard(mutex_); + auto it = allocatedGpuTraceBuffers_.find(buffer); + if (it == allocatedGpuTraceBuffers_.end()) { + LOG(ERROR) << "bufferCompleted called with unknown buffer: " + << (void*) buffer; + return; + } + + if (!readyGpuTraceBuffers_) { + readyGpuTraceBuffers_ = std::make_unique(); + } + // Set valid size of buffer before moving to ready map + it->second->setSize(validSize); + (*readyGpuTraceBuffers_)[it->first] = std::move(it->second); + allocatedGpuTraceBuffers_.erase(it); // report any records dropped from the queue; to avoid unnecessary cupti // API calls, we make it report only in verbose mode (it doesn't happen @@ -251,6 +282,7 @@ void CuptiActivityInterface::enableCuptiActivities( cuptiActivityRegisterCallbacks(bufferRequestedTrampoline, bufferCompletedTrampoline)); } + externalCorrelationEnabled_ = false; for (const auto& activity : selected_activities) { if (activity == ActivityType::GPU_MEMCPY) { CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_MEMCPY)); @@ -263,6 +295,7 @@ void CuptiActivityInterface::enableCuptiActivities( } if (activity == ActivityType::EXTERNAL_CORRELATION) { CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_EXTERNAL_CORRELATION)); + externalCorrelationEnabled_ = true; } if (activity == ActivityType::CUDA_RUNTIME) { CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_RUNTIME)); @@ -294,6 +327,7 @@ void CuptiActivityInterface::disableCuptiActivities( CUPTI_CALL(cuptiActivityDisable(CUPTI_ACTIVITY_KIND_RUNTIME)); } } + externalCorrelationEnabled_ = false; #endif } diff --git a/libkineto/src/CuptiActivityInterface.h b/libkineto/src/CuptiActivityInterface.h index d1684bdb2..af02570df 100644 --- a/libkineto/src/CuptiActivityInterface.h +++ b/libkineto/src/CuptiActivityInterface.h @@ -17,6 +17,7 @@ #include #include #include +#include #include namespace KINETO_NAMESPACE { @@ -52,11 +53,10 @@ class CuptiActivityInterface { const std::set& selected_activities); void clearActivities(); - void addActivityBuffer(uint8_t* buffer, size_t validSize); - virtual std::unique_ptr> activityBuffers(); + virtual std::unique_ptr activityBuffers(); virtual const std::pair processActivities( - std::list& buffers, + CuptiActivityBufferMap&, std::function handler); void setMaxBufferSize(int size); @@ -81,8 +81,10 @@ class CuptiActivityInterface { #endif // HAS_CUPTI int maxGpuBufferCount_{0}; - int allocatedGpuBufferCount{0}; - std::unique_ptr> gpuTraceBuffers_; + CuptiActivityBufferMap allocatedGpuTraceBuffers_; + std::unique_ptr readyGpuTraceBuffers_; + std::mutex mutex_; + bool externalCorrelationEnabled_{false}; protected: #ifdef HAS_CUPTI diff --git a/libkineto/src/DaemonConfigLoader.h b/libkineto/src/DaemonConfigLoader.h index 33947f690..072338ee2 100644 --- a/libkineto/src/DaemonConfigLoader.h +++ b/libkineto/src/DaemonConfigLoader.h @@ -16,6 +16,9 @@ class DaemonConfigLoader { public: virtual ~DaemonConfigLoader() {} + // Return the base config from the daemon + virtual std::string readBaseConfig() = 0; + // Return a configuration string from the daemon, if one has been posted. virtual std::string readOnDemandConfig(bool events, bool activities) = 0; diff --git a/libkineto/src/Demangle.cpp b/libkineto/src/Demangle.cpp index f508d5d81..1e52d5167 100644 --- a/libkineto/src/Demangle.cpp +++ b/libkineto/src/Demangle.cpp @@ -7,7 +7,9 @@ #include "Demangle.h" +#ifndef _MSC_VER #include +#endif #include #include @@ -16,6 +18,7 @@ namespace KINETO_NAMESPACE { static constexpr int kMaxSymbolSize = 1024; std::string demangle(const char* name) { +#ifndef _MSC_VER if (!name) { return ""; } @@ -34,6 +37,14 @@ std::string demangle(const char* name) { // The returned buffer must be freed! free(demangled); return res; +#else + // TODO: demangling on Windows + if (!name) { + return ""; + } else { + return name; + } +#endif } } // namespace KINETO_NAMESPACE diff --git a/libkineto/src/EventProfiler.cpp b/libkineto/src/EventProfiler.cpp index 4216097f8..2fe006c60 100644 --- a/libkineto/src/EventProfiler.cpp +++ b/libkineto/src/EventProfiler.cpp @@ -192,7 +192,7 @@ void EventGroupSet::setEnabled(bool enabled) { // Collect counter values for each counter in group set void EventGroupSet::collectSample() { - auto timestamp = high_resolution_clock::now(); + auto timestamp = system_clock::now(); for (int g = 0; g < set_.numEventGroups; g++) { CUpti_EventGroup grp = set_.eventGroups[g]; for (const auto& id : cuptiEvents_.eventsInGroup(grp)) { @@ -215,7 +215,7 @@ void EventGroupSet::collectSample() { } if (VLOG_IS_ON(1)) { - auto t2 = high_resolution_clock::now(); + auto t2 = system_clock::now(); VLOG(1) << "Device " << cuptiEvents_.device() << " Sample (us): " << duration_cast(t2 - timestamp).count(); } @@ -320,7 +320,7 @@ static unique_ptr alignAndValidateConfigs( Config& base, Config& onDemand) { if (onDemand.eventProfilerOnDemandDuration().count() == 0 || - high_resolution_clock::now() > + system_clock::now() > (onDemand.eventProfilerOnDemandStartTime() + onDemand.eventProfilerOnDemandDuration())) { base.validate(); @@ -530,7 +530,7 @@ void EventProfiler::printAllSamples(ostream& s, CUdevice device) const { void EventProfiler::enableNextCounterSet() { if (sets_.size() > 1) { - auto t1 = high_resolution_clock::now(); + auto t1 = system_clock::now(); VLOG(1) << "Disabling set " << curEnabledSet_; sets_[curEnabledSet_].setEnabled(false); @@ -539,7 +539,7 @@ void EventProfiler::enableNextCounterSet() { sets_[curEnabledSet_].setEnabled(true); if (VLOG_IS_ON(1)) { - auto t2 = high_resolution_clock::now(); + auto t2 = system_clock::now(); VLOG(1) << "Switch (us): " << duration_cast(t2 - t1).count(); } diff --git a/libkineto/src/EventProfilerController.cpp b/libkineto/src/EventProfilerController.cpp index 419c9bbce..596c78b60 100644 --- a/libkineto/src/EventProfilerController.cpp +++ b/libkineto/src/EventProfilerController.cpp @@ -10,17 +10,15 @@ #include #include #include -#include -#include #include "ConfigLoader.h" #include "CuptiEventInterface.h" #include "CuptiMetricInterface.h" #include "EventProfiler.h" -#include "ThreadName.h" #include "output_csv.h" #include "Logger.h" +#include "ThreadUtil.h" using namespace std::chrono; using std::unique_ptr; @@ -75,8 +73,12 @@ vector>& onDemandLoggers( return res; } +} // anon namespace + // Keep an eye on profiling threads. // We've observed deadlocks in Cuda11 in libcuda / libcupti.. +namespace detail { + class HeartbeatMonitor { public: @@ -90,7 +92,7 @@ class HeartbeatMonitor { } void profilerHeartbeat() { - pid_t tid = syscall(SYS_gettid); + int32_t tid = systemThreadId(); std::lock_guard lock(mutex_); profilerAliveMap_[tid]++; } @@ -119,7 +121,9 @@ class HeartbeatMonitor { auto cv_status = condVar_.wait_for(lock, seconds(period_)); // Don't perform check on spurious wakeup or on notify if (cv_status == std::cv_status::timeout) { - for (auto& [tid, i] : profilerAliveMap_) { + for (auto& pair : profilerAliveMap_) { + int32_t tid = pair.first; + int& i = pair.second; if (i == 0) { LOG(ERROR) << "Thread " << tid << " appears stuck!"; } @@ -149,7 +153,7 @@ class HeartbeatMonitor { } } - std::map profilerAliveMap_; + std::map profilerAliveMap_; std::unique_ptr monitorThread_; std::mutex mutex_; std::condition_variable condVar_; @@ -157,6 +161,9 @@ class HeartbeatMonitor { seconds period_{0}; }; +} // namespace detail + +namespace { // Profiler map singleton std::map>& profilerMap() { static std::map> instance; @@ -174,7 +181,7 @@ void reportLateSample( } void configureHeartbeatMonitor( - HeartbeatMonitor& monitor, const Config& base, const Config& onDemand) { + detail::HeartbeatMonitor& monitor, const Config& base, const Config& onDemand) { seconds base_period = base.eventProfilerHeartbeatMonitorPeriod(); seconds on_demand_period = @@ -198,7 +205,7 @@ void EventProfilerController::addOnDemandLoggerFactory( EventProfilerController::EventProfilerController( CUcontext context, ConfigLoader& configLoader, - HeartbeatMonitor& heartbeatMonitor) + detail::HeartbeatMonitor& heartbeatMonitor) : configLoader_(configLoader), heartbeatMonitor_(heartbeatMonitor) { auto cupti_events = std::make_unique(context); auto cupti_metrics = @@ -226,7 +233,7 @@ EventProfilerController::~EventProfilerController() { void EventProfilerController::start(CUcontext ctx) { profilerMap()[ctx] = unique_ptr( new EventProfilerController( - ctx, ConfigLoader::instance(), HeartbeatMonitor::instance())); + ctx, ConfigLoader::instance(), detail::HeartbeatMonitor::instance())); } // Must be called under lock @@ -266,10 +273,10 @@ void EventProfilerController::profilerLoop() { auto on_demand_config = std::make_unique(); - time_point next_sample_time; - time_point next_report_time; - time_point next_on_demand_report_time; - time_point next_multiplex_time; + time_point next_sample_time; + time_point next_report_time; + time_point next_on_demand_report_time; + time_point next_multiplex_time; bool reconfigure = true; bool restart = true; int report_count = 0; @@ -289,7 +296,7 @@ void EventProfilerController::profilerLoop() { reconfigure = true; } - auto now = high_resolution_clock::now(); + auto now = system_clock::now(); if (on_demand_config->eventProfilerOnDemandDuration().count() > 0 && now > (on_demand_config->eventProfilerOnDemandStartTime() + on_demand_config->eventProfilerOnDemandDuration())) { @@ -314,7 +321,7 @@ void EventProfilerController::profilerLoop() { } if (restart) { - now = high_resolution_clock::now(); + now = system_clock::now(); next_sample_time = now + profiler_->samplePeriod(); next_report_time = now + profiler_->reportPeriod(); next_on_demand_report_time = now + profiler_->onDemandReportPeriod(); @@ -330,13 +337,13 @@ void EventProfilerController::profilerLoop() { while (now < next_sample_time) { /* sleep override */ std::this_thread::sleep_for(next_sample_time - now); - now = high_resolution_clock::now(); + now = system_clock::now(); } int sleep_time = duration_cast(now - start_sleep).count(); auto start_sample = now; profiler_->collectSample(); - now = high_resolution_clock::now(); + now = system_clock::now(); int sample_time = duration_cast(now - start_sample).count(); next_sample_time += profiler_->samplePeriod(); @@ -359,7 +366,7 @@ void EventProfilerController::profilerLoop() { next_on_demand_report_time += profiler_->onDemandReportPeriod(); } profiler_->eraseReportedSamples(); - now = high_resolution_clock::now(); + now = system_clock::now(); int report_time = duration_cast(now - start_report).count(); if (now > next_sample_time) { @@ -373,7 +380,7 @@ void EventProfilerController::profilerLoop() { profiler_->enableNextCounterSet(); next_multiplex_time += profiler_->multiplexPeriod(); } - now = high_resolution_clock::now(); + now = system_clock::now(); int multiplex_time = duration_cast(now - start_multiplex).count(); diff --git a/libkineto/src/EventProfilerController.h b/libkineto/src/EventProfilerController.h index e8dd95120..6e9703fb6 100644 --- a/libkineto/src/EventProfilerController.h +++ b/libkineto/src/EventProfilerController.h @@ -21,7 +21,7 @@ class ConfigLoader; class EventProfiler; class SampleListener; -namespace { +namespace detail { class HeartbeatMonitor; } @@ -45,12 +45,12 @@ class EventProfilerController { explicit EventProfilerController( CUcontext context, ConfigLoader& configLoader, - HeartbeatMonitor& heartbeatMonitor); + detail::HeartbeatMonitor& heartbeatMonitor); bool enableForDevice(Config& cfg); void profilerLoop(); ConfigLoader& configLoader_; - HeartbeatMonitor& heartbeatMonitor_; + detail::HeartbeatMonitor& heartbeatMonitor_; std::unique_ptr profiler_; std::unique_ptr profilerThread_; std::atomic_bool stopRunloop_{false}; diff --git a/libkineto/src/GenericTraceActivity.cpp b/libkineto/src/GenericTraceActivity.cpp index b893fff4c..8df6581a6 100644 --- a/libkineto/src/GenericTraceActivity.cpp +++ b/libkineto/src/GenericTraceActivity.cpp @@ -8,10 +8,8 @@ #include "GenericTraceActivity.h" #include "output_base.h" -using namespace libkineto; - -namespace KINETO_NAMESPACE { +namespace libkineto { void GenericTraceActivity::log(ActivityLogger& logger) const { logger.handleGenericActivity(*this); } -} // namespace KINETO_NAMESPACE +} // namespace libkineto diff --git a/libkineto/src/GenericTraceActivity.h b/libkineto/src/GenericTraceActivity.h deleted file mode 100644 index f9a0e27ec..000000000 --- a/libkineto/src/GenericTraceActivity.h +++ /dev/null @@ -1,66 +0,0 @@ -/* - * Copyright (c) Facebook, Inc. and its affiliates. - * All rights reserved. - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#pragma once - -#include -#include - -#include "TraceActivity.h" - -namespace KINETO_NAMESPACE { - -// A generic trace activity that can be freely modified -struct GenericTraceActivity : libkineto::TraceActivity { - int64_t deviceId() const override { - return device; - } - - int64_t resourceId() const override { - return resource; - } - - int64_t timestamp() const override { - return startTime; - } - - int64_t duration() const override { - return endTime - startTime; - } - - int64_t correlationId() const override { - return correlation; - } - - libkineto::ActivityType type() const override { - return activityType; - } - - const std::string name() const override { - return activityName; - } - - const libkineto::TraceActivity* linkedActivity() const override { - return linked; - } - - void log(libkineto::ActivityLogger& logger) const override; - - int64_t device; - pthread_t resource; - - int64_t startTime; - int64_t endTime; - int64_t correlation; - - libkineto::ActivityType activityType; - std::string activityName; - - libkineto::TraceActivity* linked; -}; - -} // namespace KINETO_NAMESPACE diff --git a/libkineto/src/Logger.cpp b/libkineto/src/Logger.cpp index 4dac80770..a65dc67ee 100644 --- a/libkineto/src/Logger.cpp +++ b/libkineto/src/Logger.cpp @@ -9,14 +9,18 @@ #ifndef USE_GOOGLE_LOG -#include -#include #include #include #include #include +#include -namespace KINETO_NAMESPACE { +#include +#include + +#include "ThreadUtil.h" + +namespace libkineto { int Logger::severityLevel_{VERBOSE}; int Logger::verboseLogLevel_{-1}; @@ -25,6 +29,9 @@ uint64_t Logger::verboseLogModules_{~0ull}; Logger::Logger(int severity, int line, const char* filePath, int errnum) : buf_(), out_(LIBKINETO_DBG_STREAM), errnum_(errnum) { switch (severity) { + case VERBOSE: + buf_ << "V:"; + break; case INFO: buf_ << "INFO:"; break; @@ -34,9 +41,6 @@ Logger::Logger(int severity, int line, const char* filePath, int errnum) case ERROR: buf_ << "ERROR:"; break; - case VERBOSE: - buf_ << "V:"; - break; default: buf_ << "???:"; break; @@ -45,19 +49,20 @@ Logger::Logger(int severity, int line, const char* filePath, int errnum) const auto tt = std::chrono::system_clock::to_time_t(std::chrono::system_clock::now()); const char* file = strrchr(filePath, '/'); - std::tm tm; - buf_ << std::put_time(localtime_r(&tt, &tm), "%F %T") << " " << getpid() - << ":" << syscall(SYS_gettid) << " " << (file ? file + 1 : filePath) - << ":" << line << "] "; + buf_ << fmt::format("{:%Y-%m-%d %H:%M:%S}", fmt::localtime(tt)) << " " + << processId() << ":" << systemThreadId() << " " + << (file ? file + 1 : filePath) << ":" << line << "] "; } Logger::~Logger() { +#ifdef __linux__ if (errnum_ != 0) { thread_local char buf[1024]; buf_ << " : " << strerror_r(errnum_, buf, sizeof(buf)); } - buf_ << std::ends; - out_ << buf_.str() << std::endl; +#endif + buf_ << std::endl; + out_ << buf_.str(); } void Logger::setVerboseLogModules(const std::vector& modules) { @@ -71,6 +76,6 @@ void Logger::setVerboseLogModules(const std::vector& modules) { } } -} // namespace KINETO_NAMESPACE +} // namespace libkineto #endif // USE_GOOGLE_LOG diff --git a/libkineto/src/Logger.h b/libkineto/src/Logger.h index e980ecfd1..6329cb22e 100644 --- a/libkineto/src/Logger.h +++ b/libkineto/src/Logger.h @@ -35,12 +35,15 @@ #include #include -namespace KINETO_NAMESPACE { +// unset a predefined ERROR (windows) +#undef ERROR -constexpr int VERBOSE = 0; -constexpr int INFO = 1; -constexpr int WARNING = 2; -constexpr int ERROR = 3; +#define VERBOSE 0 +#define INFO 1 +#define WARNING 2 +#define ERROR 3 + +namespace libkineto { class Logger { public: @@ -111,7 +114,7 @@ class VoidLogger { void operator&(std::ostream&) {} }; -} // namespace KINETO_NAMESPACE +} // namespace libkineto #ifdef LOG // Undefine in case these are already defined (quite likely) #undef LOG @@ -136,11 +139,11 @@ class VoidLogger { #endif #define LOG_IS_ON(severity) \ - (severity >= KINETO_NAMESPACE::Logger::severityLevel()) + (severity >= libkineto::Logger::severityLevel()) #define LOG_IF(severity, condition) \ - !(LOG_IS_ON(severity) && (condition)) ? (void)0 : KINETO_NAMESPACE::VoidLogger() & \ - KINETO_NAMESPACE::Logger(severity, __LINE__, __FILE__).stream() + !(LOG_IS_ON(severity) && (condition)) ? (void)0 : libkineto::VoidLogger() & \ + libkineto::Logger(severity, __LINE__, __FILE__).stream() #define LOG(severity) LOG_IF(severity, true) @@ -160,11 +163,11 @@ struct __to_constant__ { static const uint64_t val = n; }; #define FILENAME_HASH \ - __to_constant__::val + __to_constant__::val #define VLOG_IS_ON(verbosity) \ - (KINETO_NAMESPACE::Logger::verboseLogLevel() >= verbosity && \ - (KINETO_NAMESPACE::Logger::verboseLogModules() & FILENAME_HASH) == FILENAME_HASH) + (libkineto::Logger::verboseLogLevel() >= verbosity && \ + (libkineto::Logger::verboseLogModules() & FILENAME_HASH) == FILENAME_HASH) #define VLOG_IF(verbosity, condition) \ LOG_IF(VERBOSE, VLOG_IS_ON(verbosity) && (condition)) @@ -177,13 +180,13 @@ struct __to_constant__ { << "(x" << LOG_OCCURRENCES << ") " #define PLOG(severity) \ - KINETO_NAMESPACE::Logger(severity, __LINE__, __FILE__, errno).stream() + libkineto::Logger(severity, __LINE__, __FILE__, errno).stream() #define SET_LOG_SEVERITY_LEVEL(level) \ - KINETO_NAMESPACE::Logger::setSeverityLevel(level) + libkineto::Logger::setSeverityLevel(level) #define SET_LOG_VERBOSITY_LEVEL(level, modules) \ - KINETO_NAMESPACE::Logger::setVerboseLogLevel(level); \ - KINETO_NAMESPACE::Logger::setVerboseLogModules(modules) + libkineto::Logger::setVerboseLogLevel(level); \ + libkineto::Logger::setVerboseLogModules(modules) #endif // USE_GOOGLE_LOG diff --git a/libkineto/src/ProcessInfo.cpp b/libkineto/src/ProcessInfo.cpp deleted file mode 100644 index b24affb9c..000000000 --- a/libkineto/src/ProcessInfo.cpp +++ /dev/null @@ -1,62 +0,0 @@ -/* - * Copyright (c) Facebook, Inc. and its affiliates. - * All rights reserved. - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#include "ProcessInfo.h" - -#include -#include -#include -#include - -#include "Logger.h" - -static const std::string kChronosJobIDEnvVar = "CHRONOS_JOB_INSTANCE_ID"; - -namespace KINETO_NAMESPACE { - -// Max number of parent pids to collect, just for extra safeguarding. -constexpr int kMaxParentPids = 10; - -// Return a pair of -static std::pair parentPidAndCommand(pid_t pid) { - FILE* statfile = fopen(fmt::format("/proc/{}/stat", pid).c_str(), "r"); - if (statfile == nullptr) { - return std::make_pair(0, ""); - } - pid_t parent_pid; - char* command = nullptr; - int scanned = fscanf(statfile, "%*d (%m[^)]) %*c %d", &command, &parent_pid); - fclose(statfile); - VLOG(2) << " Current PID: " << pid << " Command: " << command - << " Parent PID: " << parent_pid; - std::pair ret; - if (scanned == 2) { - ret = std::make_pair(parent_pid, std::string(command)); - } else { - LOG(ERROR) << "Failed to parse /proc/" << pid << "/stat"; - ret = std::make_pair(0, ""); - } - - // The 'm' character in the format tells fscanf to allocate memory - // for the parsed string, which we need to free here. - free(command); - return ret; -} - -std::vector> pidCommandPairsOfAncestors() { - std::vector> pairs; - pairs.reserve(kMaxParentPids + 1); - pid_t curr_pid = getpid(); - for (int i = 0; i <= kMaxParentPids && curr_pid > 1; i++) { - std::pair ppid_and_comm = parentPidAndCommand(curr_pid); - pairs.push_back(std::make_pair(curr_pid, ppid_and_comm.second)); - curr_pid = ppid_and_comm.first; - } - return pairs; -} - -} // namespace KINETO_NAMESPACE diff --git a/libkineto/src/ProcessInfo.h b/libkineto/src/ProcessInfo.h deleted file mode 100644 index b618f841e..000000000 --- a/libkineto/src/ProcessInfo.h +++ /dev/null @@ -1,35 +0,0 @@ -/* - * Copyright (c) Facebook, Inc. and its affiliates. - * All rights reserved. - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#pragma once - -#include -#include -#include -#include - -namespace KINETO_NAMESPACE { - -struct ProcessInfo { - pid_t pid; - const std::string name; - const std::string label; -}; - -struct ThreadInfo { - ThreadInfo(int64_t tid, const std::string name) : - tid(tid), name(name) {} - int32_t tid; - const std::string name; -}; - - -// Return a list of pids and process names for the current process -// and its parents. -std::vector> pidCommandPairsOfAncestors(); - -} // namespace KINETO_NAMESPACE diff --git a/libkineto/src/ThreadName.cpp b/libkineto/src/ThreadName.cpp deleted file mode 100644 index c6b905df2..000000000 --- a/libkineto/src/ThreadName.cpp +++ /dev/null @@ -1,41 +0,0 @@ -/* - * Copyright (c) Facebook, Inc. and its affiliates. - * All rights reserved. - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#include -#include -#include -#include -#include - -#include "Logger.h" - -namespace KINETO_NAMESPACE { - -static constexpr size_t kMaxThreadNameLength = 16; - -bool setThreadName(const std::string& name) { - return 0 == pthread_setname_np(pthread_self(), name.c_str()); -} - -std::string getThreadName(pid_t tid) { - char buf[kMaxThreadNameLength] = "Unknown"; - std::string filename = fmt::format("/proc/{}/task/{}/comm", getpid(), tid); - FILE* comm_file = fopen(filename.c_str(), "r"); - if (comm_file) { - size_t len = fread(buf, 1, kMaxThreadNameLength, comm_file); - fclose(comm_file); - // Remove newline - if (len > 0) { - buf[len - 1] = '\0'; - } - } else { - LOG(WARNING) << "Failed to open " << filename; - } - return buf; -} - -} // namespace KINETO_NAMESPACE diff --git a/libkineto/src/ThreadName.h b/libkineto/src/ThreadName.h deleted file mode 100644 index 35f6aa259..000000000 --- a/libkineto/src/ThreadName.h +++ /dev/null @@ -1,19 +0,0 @@ -/* - * Copyright (c) Facebook, Inc. and its affiliates. - * All rights reserved. - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -#pragma once - -#include - -#include - -namespace KINETO_NAMESPACE { - -bool setThreadName(const std::string& name); -std::string getThreadName(pid_t tid); - -} // namespace KINETO_NAMESPACE diff --git a/libkineto/src/ThreadUtil.cpp b/libkineto/src/ThreadUtil.cpp new file mode 100644 index 000000000..90d869bd9 --- /dev/null +++ b/libkineto/src/ThreadUtil.cpp @@ -0,0 +1,199 @@ +#include "ThreadUtil.h" + +#ifndef _MSC_VER +#include +#include +#include +#include +#else // _MSC_VER +#include +#include +#define WIN32_LEAN_AND_MEAN +#define NOGDI +#include +#include +#undef ERROR +#endif // _MSC_VER + +#ifdef __ANDROID__ +#include +#endif + +#include +#include +#include + +namespace libkineto { + +namespace { +thread_local int32_t _pid = 0; +thread_local int32_t _tid = 0; +thread_local int32_t _sysTid = 0; +} + +int32_t processId() { + if (!_pid) { +#ifndef _MSC_VER + _pid = (int32_t)getpid(); +#else + _pid = (int32_t)GetCurrentProcessId(); +#endif + } + return _pid; +} + +int32_t systemThreadId() { + if (!_sysTid) { +#ifdef __APPLE__ + _sysTid = (int32_t)syscall(SYS_thread_selfid); +#elif defined _MSC_VER + _sysTid = (int32_t)GetCurrentThreadId(); +#else + _sysTid = (int32_t)syscall(SYS_gettid); +#endif + } + return _sysTid; +} + +int32_t threadId() { + if (!_tid) { +#ifndef _MSC_VER + pthread_t pth = pthread_self(); + int32_t* ptr = reinterpret_cast(&pth); + _tid = *ptr; +#else + _tid = (int32_t)GetCurrentThreadId(); +#endif + } + return _tid; +} + +namespace { +static constexpr size_t kMaxThreadNameLength = 16; + +static constexpr const char* basename(const char* s, int off = 0) { + return !s[off] + ? s + : s[off] == '/' ? basename(&s[off + 1]) : basename(s, off + 1); +} +#if defined(_MSC_VER) +void *getKernel32Func(const char* procName) { + return GetProcAddress(GetModuleHandleA("KERNEL32.DLL"), procName); +} +#endif +} + +bool setThreadName(const std::string& name) { +#ifdef __APPLE__ + return 0 == pthread_setname_np(name.c_str()); +#elif defined _MSC_VER + // Per https://docs.microsoft.com/en-us/windows/win32/api/processthreadsapi/nf-processthreadsapi-setthreaddescription + // Use runtime linking to set thread description + static auto _SetThreadDescription = reinterpret_cast(getKernel32Func("SetThreadDescription")); + if (!_SetThreadDescription) { + return false; + } + std::wstring_convert> conv; + std::wstring wname = conv.from_bytes(name); + HRESULT hr = _SetThreadDescription(GetCurrentThread(), wname.c_str()); + return SUCCEEDED(hr); +#else + return 0 == pthread_setname_np(pthread_self(), name.c_str()); +#endif +} + +std::string getThreadName() { +#ifndef _MSC_VER + char buf[kMaxThreadNameLength] = ""; + if ( +#ifndef __ANDROID__ + pthread_getname_np(pthread_self(), buf, kMaxThreadNameLength) != 0 +#else + prctl(PR_GET_NAME, buf, kMaxThreadNameLength) != 0 +#endif + ) { + return "Unknown"; + } + return buf; +#else // _MSC_VER + static auto _GetThreadDescription = reinterpret_cast(getKernel32Func("GetThreadDescription")); + if (!_GetThreadDescription) { + return "Unknown"; + } + PWSTR data; + HRESULT hr = _GetThreadDescription(GetCurrentThread(), &data); + if (!SUCCEEDED(hr)) { + return ""; + } + std::wstring_convert> conv; + std::string name = conv.to_bytes(data); + LocalFree(data); + return name; +#endif +} + +// Linux: +// Extract process name from /proc/pid/cmdline. This does not have +// the 16 character limit that /proc/pid/status and /prod/pid/comm has. +std::string processName(int32_t pid) { +#ifdef __linux__ + FILE* cmdfile = fopen(fmt::format("/proc/{}/cmdline", pid).c_str(), "r"); + if (cmdfile != nullptr) { + char* command = nullptr; + int scanned = fscanf(cmdfile, "%ms", &command); + fclose(cmdfile); + if (scanned > 0 && command) { + std::string ret(basename(command)); + free(command); + return ret; + } + } + std::cerr << "Failed to read process name for pid " << pid << std::endl; +#endif + return ""; +} + +// Max number of parent pids to collect, just for extra safeguarding. +constexpr int kMaxParentPids = 10; + +// Return a pair of +static std::pair parentPidAndCommand(int32_t pid) { +#ifdef __linux__ + FILE* statfile = fopen(fmt::format("/proc/{}/stat", pid).c_str(), "r"); + if (statfile == nullptr) { + return std::make_pair(0, ""); + } + int32_t parent_pid; + char* command = nullptr; + int scanned = fscanf(statfile, "%*d (%m[^)]) %*c %d", &command, &parent_pid); + fclose(statfile); + std::pair ret; + if (scanned == 2) { + ret = std::make_pair(parent_pid, std::string(command)); + } else { + std::cerr << "Failed to parse /proc/" << pid << "/stat" << std::endl; + ret = std::make_pair(0, ""); + } + + // The 'm' character in the format tells fscanf to allocate memory + // for the parsed string, which we need to free here. + free(command); + return ret; +#else + return std::make_pair(0, ""); +#endif +} + +std::vector> pidCommandPairsOfAncestors() { + std::vector> pairs; + pairs.reserve(kMaxParentPids + 1); + int32_t curr_pid = processId(); + for (int i = 0; i <= kMaxParentPids && curr_pid > 1; i++) { + std::pair ppid_and_comm = parentPidAndCommand(curr_pid); + pairs.push_back(std::make_pair(curr_pid, ppid_and_comm.second)); + curr_pid = ppid_and_comm.first; + } + return pairs; +} + +} // namespace libkineto diff --git a/libkineto/src/init.cpp b/libkineto/src/init.cpp index c17d9a727..d12e45794 100644 --- a/libkineto/src/init.cpp +++ b/libkineto/src/init.cpp @@ -25,7 +25,6 @@ * mechanism), but allows the application to continue. */ -#include #include #include diff --git a/libkineto/src/libkineto_api.cpp b/libkineto/src/libkineto_api.cpp index 0a8f4394b..33c5b6e41 100644 --- a/libkineto/src/libkineto_api.cpp +++ b/libkineto/src/libkineto_api.cpp @@ -7,6 +7,8 @@ #include "libkineto.h" +#include "ThreadUtil.h" + namespace libkineto { LibkinetoApi& api() { @@ -16,12 +18,12 @@ LibkinetoApi& api() { void LibkinetoApi::initClientIfRegistered() { if (client_) { - if (clientRegisterThread_ != pthread_self()) { + if (clientRegisterThread_ != threadId()) { fprintf( stderr, "ERROR: External init callback must run in same thread as registerClient " "(%d != %d)\n", - (int)pthread_self(), + threadId(), (int)clientRegisterThread_); } else { client_->init(); @@ -37,7 +39,7 @@ void LibkinetoApi::registerClient(ClientInterface* client) { } // Assume here that the external init callback is *not* threadsafe // and only call it if it's the same thread that called registerClient - clientRegisterThread_ = pthread_self(); + clientRegisterThread_ = threadId(); } } // namespace libkineto diff --git a/libkineto/src/output_base.h b/libkineto/src/output_base.h index 14775b5b8..050a0828b 100644 --- a/libkineto/src/output_base.h +++ b/libkineto/src/output_base.h @@ -18,15 +18,14 @@ #include "CuptiActivity.h" #endif // HAS_CUPTI #include "ActivityBuffers.h" -#include "ClientTraceActivity.h" #include "GenericTraceActivity.h" -#include "ProcessInfo.h" +#include "ThreadUtil.h" #include "TraceSpan.h" namespace KINETO_NAMESPACE { class Config; - class RuntimeActivity; class GpuKernelActivity; + struct RuntimeActivity; } namespace libkineto { @@ -45,14 +44,8 @@ class ActivityLogger { virtual void handleTraceSpan(const TraceSpan& span) = 0; - virtual void handleIterationStart(const TraceSpan& span) = 0; - - virtual void handleCpuActivity( - const libkineto::ClientTraceActivity& activity, - const TraceSpan& span) = 0; - virtual void handleGenericActivity( - const GenericTraceActivity& activity) = 0; + const libkineto::GenericTraceActivity& activity) = 0; #ifdef HAS_CUPTI virtual void handleRuntimeActivity(const RuntimeActivity& activity) = 0; @@ -67,6 +60,13 @@ class ActivityLogger { const GpuActivity& activity) = 0; #endif // HAS_CUPTI + virtual void handleTraceStart( + const std::unordered_map& metadata) = 0; + + void handleTraceStart() { + handleTraceStart(std::unordered_map()); + } + virtual void finalizeTrace( const KINETO_NAMESPACE::Config& config, std::unique_ptr buffers, diff --git a/libkineto/src/output_csv.cpp b/libkineto/src/output_csv.cpp index a12b591fc..67868cf4b 100644 --- a/libkineto/src/output_csv.cpp +++ b/libkineto/src/output_csv.cpp @@ -11,8 +11,10 @@ #include #include -#include "Config.h" +#include +#include +#include "Config.h" #include "Logger.h" namespace KINETO_NAMESPACE { @@ -44,12 +46,11 @@ void EventCSVLogger::handleSample(int device, const Sample& sample, bool from_ne if (out_) { auto now = system_clock::now(); auto time = system_clock::to_time_t(now); - struct tm tm; for (const Stat& s : sample.stats) { if (eventNames_.find(s.name) == eventNames_.end()) { continue; } - *out_ << std::put_time(localtime_r(&time, &tm), "%F %T") << ","; + *out_ << fmt::format("{:%Y-%m-%d %H:%M:%S}", fmt::localtime(time)) << ","; *out_ << sample.deltaMsec << ","; *out_ << device << ","; *out_ << s.name; diff --git a/libkineto/src/output_json.cpp b/libkineto/src/output_json.cpp index 54eff581a..76f0bca0d 100644 --- a/libkineto/src/output_json.cpp +++ b/libkineto/src/output_json.cpp @@ -11,13 +11,13 @@ #include #include #include -#include #include "Config.h" #ifdef HAS_CUPTI #include "CuptiActivity.h" #include "CuptiActivity.tpp" #include "CuptiActivityInterface.h" +#include "CudaDeviceProperties.h" #endif // HAS_CUPTI #include "Demangle.h" #include "TraceSpan.h" @@ -30,32 +30,51 @@ using namespace libkineto; namespace KINETO_NAMESPACE { static constexpr int kSchemaVersion = 1; +#ifdef __linux__ +static const std::string kDefaultLogFileFmt = + "/tmp/libkineto_activities_{}.json"; +#else +static const std::string kDefaultLogFileFmt = "libkineto_activities_{}.json"; +#endif -static void writeHeader(std::ofstream& stream) { - stream << fmt::format(R"JSON( +void ChromeTraceLogger::handleTraceStart( + const std::unordered_map& metadata) { + traceOf_ << fmt::format(R"JSON( {{ - "schemaVersion": {}, - "traceEvents": [ - )JSON", kSchemaVersion); + "schemaVersion": {},)JSON", kSchemaVersion); + + for (const auto& kv : metadata) { + traceOf_ << fmt::format(R"JSON( + "{}": {},)JSON", kv.first, kv.second); + } + +#ifdef HAS_CUPTI + traceOf_ << fmt::format(R"JSON( + "deviceProperties": [{} + ],)JSON", devicePropertiesJson()); +#endif + + traceOf_ << R"JSON( + "traceEvents": [)JSON"; } -static void openTraceFile(std::string& name, std::ofstream& stream) { - stream.open(name, std::ofstream::out | std::ofstream::trunc); - if (!stream) { - PLOG(ERROR) << "Failed to open '" << name << "'"; +static std::string defaultFileName() { + return fmt::format(kDefaultLogFileFmt, processId()); +} + +void ChromeTraceLogger::openTraceFile() { + traceOf_.open(fileName_, std::ofstream::out | std::ofstream::trunc); + if (!traceOf_) { + PLOG(ERROR) << "Failed to open '" << fileName_ << "'"; } else { - LOG(INFO) << "Tracing to " << name; - writeHeader(stream); + LOG(INFO) << "Tracing to " << fileName_; } } -ChromeTraceLogger::ChromeTraceLogger(const std::string& traceFileName, int smCount) - : fileName_(traceFileName), pid_(getpid()) { +ChromeTraceLogger::ChromeTraceLogger(const std::string& traceFileName) { + fileName_ = traceFileName.empty() ? defaultFileName() : traceFileName; traceOf_.clear(std::ios_base::badbit); - openTraceFile(fileName_, traceOf_); -#ifdef HAS_CUPTI - smCount_ = CuptiActivityInterface::singleton().smCount(); -#endif + openTraceFile(); } static int64_t us(int64_t timestamp) { @@ -111,7 +130,7 @@ void ChromeTraceLogger::handleThreadInfo( "name": "thread {} ({})" }} }},)JSON", - time, pid_, threadInfo.tid, + time, processId(), threadInfo.tid, threadInfo.tid, threadInfo.name); // clang-format on } @@ -136,9 +155,13 @@ void ChromeTraceLogger::handleTraceSpan(const TraceSpan& span) { span.prefix, span.name, span.iteration, span.opCount); // clang-format on + + if (span.tracked) { + addIterationMarker(span); + } } -void ChromeTraceLogger::handleIterationStart(const TraceSpan& span) { +void ChromeTraceLogger::addIterationMarker(const TraceSpan& span) { if (!traceOf_) { return; } @@ -154,65 +177,72 @@ void ChromeTraceLogger::handleIterationStart(const TraceSpan& span) { // clang-format on } -static std::string traceActivityJson(const TraceActivity& activity, std::string tidPrefix) { +static std::string traceActivityJson( + const TraceActivity& activity, std::string tid) { // clang-format off return fmt::format(R"JSON( - "name": "{}", "pid": {}, "tid": "{}{}", + "name": "{}", "pid": {}, "tid": "{}", "ts": {}, "dur": {})JSON", - activity.name(), activity.deviceId(), tidPrefix, (uint32_t)activity.resourceId(), + activity.name(), activity.deviceId(), tid, activity.timestamp(), activity.duration()); // clang-format on } -void ChromeTraceLogger::handleCpuActivity( - const libkineto::ClientTraceActivity& op, - const TraceSpan& span) { +void ChromeTraceLogger::handleGenericInstantEvent( + const libkineto::GenericTraceActivity& op) { if (!traceOf_) { return; } - // clang-format off traceOf_ << fmt::format(R"JSON( {{ - "ph": "X", "cat": "Operator", {}, + "ph": "i", "s": "t", "name": "{}", + "pid": {}, "tid": {}, + "ts": {}, "args": {{ - "Input dims": {}, "Input type": {}, "Input names": {}, - "Output dims": {}, "Output type": {}, "Output names": {}, - "Device": {}, "External id": {}, "Extra arguments": {}, - "Call stack": "{}", - "Trace name": "{}", "Trace iteration": {} + {} }} }},)JSON", - traceActivityJson(op, ""), - // args - op.inputDims, op.inputTypes, op.inputNames, - op.outputDims, op.outputTypes, op.outputNames, - op.device, op.correlation, op.arguments, - op.callStack, - span.name, span.iteration); - // clang-format on + op.name(), op.deviceId(), op.resourceId(), + op.timestamp(), op.getMetadata()); } void ChromeTraceLogger::handleGenericActivity( - const GenericTraceActivity& op) { - if (!traceOf_) { + const libkineto::GenericTraceActivity& op) { + if (!traceOf_) { return; } - // FIXME: Make cat and tid customizable + if (op.activityType == ActivityType::CPU_INSTANT_EVENT) { + handleGenericInstantEvent(op); + return; + } + + auto op_metadata = op.getMetadata(); + std::string separator = ""; + if (op_metadata.find_first_not_of(" \t\n") != std::string::npos) { + separator = ","; + } + const std::string tid = + op.type() == ActivityType::GPU_USER_ANNOTATION ? + fmt::format("stream {} annotations", op.resourceId()) : + fmt::format("{}", op.resourceId()); + // clang-format off traceOf_ << fmt::format(R"JSON( {{ - "ph": "X", "cat": "User", "name": "{}", - "pid": {}, "tid": "stream {} user", - "ts": {}, "dur": {}, + "ph": "X", "cat": "{}", {}, "args": {{ - "External id": {} + "External id": {}, + "Trace name": "{}", "Trace iteration": {}{} + {} }} }},)JSON", - op.name(), op.deviceId(), op.resourceId(), - op.timestamp(), op.duration(), - op.correlationId()); + toString(op.type()), traceActivityJson(op, tid), + // args + op.id, + op.traceSpan()->name, op.traceSpan()->iteration, separator, + op_metadata); // clang-format on } @@ -228,7 +258,7 @@ void ChromeTraceLogger::handleLinkStart(const RuntimeActivity& s) { "ph": "s", "id": {}, "pid": {}, "tid": {}, "ts": {}, "cat": "async", "name": "launch" }},)JSON", - s.correlationId(), pid_, s.resourceId(), s.timestamp()); + s.correlationId(), processId(), s.resourceId(), s.timestamp()); // clang-format on } @@ -264,7 +294,7 @@ void ChromeTraceLogger::handleRuntimeActivity( "external id": {}, "external ts": {} }} }},)JSON", - traceActivityJson(activity, ""), + traceActivityJson(activity, fmt::format("{}", activity.resourceId())), // args cbid, activity.raw().correlationId, ext.correlationId(), ext.timestamp()); @@ -283,6 +313,10 @@ void ChromeTraceLogger::handleRuntimeActivity( } } +static std::string streamName(const TraceActivity& act) { + return fmt::format("stream {}", act.resourceId()); +} + // GPU side kernel activity void ChromeTraceLogger::handleGpuActivity( const GpuActivity& activity) { @@ -292,12 +326,28 @@ void ChromeTraceLogger::handleGpuActivity( const CUpti_ActivityKernel4* kernel = &activity.raw(); const TraceActivity& ext = *activity.linkedActivity(); constexpr int threads_per_warp = 32; + float blocks_per_sm = -1.0; float warps_per_sm = -1.0; - if (smCount_) { - warps_per_sm = (kernel->gridX * kernel->gridY * kernel->gridZ) * - (kernel->blockX * kernel->blockY * kernel->blockZ) / - (float) threads_per_warp / smCount_; + int sm_count = smCount(kernel->deviceId); + if (sm_count) { + blocks_per_sm = + (kernel->gridX * kernel->gridY * kernel->gridZ) / (float) sm_count; + warps_per_sm = + blocks_per_sm * (kernel->blockX * kernel->blockY * kernel->blockZ) + / threads_per_warp; } + + // Calculate occupancy + float occupancy = KINETO_NAMESPACE::kernelOccupancy( + kernel->deviceId, + kernel->registersPerThread, + kernel->staticSharedMemory, + kernel->dynamicSharedMemory, + kernel->blockX, + kernel->blockY, + kernel->blockZ, + blocks_per_sm); + // clang-format off traceOf_ << fmt::format(R"JSON( {{ @@ -307,20 +357,24 @@ void ChromeTraceLogger::handleGpuActivity( "stream": {}, "correlation": {}, "external id": {}, "registers per thread": {}, "shared memory": {}, + "blocks per SM": {}, "warps per SM": {}, "grid": [{}, {}, {}], - "block": [{}, {}, {}] + "block": [{}, {}, {}], + "est. achieved occupancy %": {} }} }},)JSON", - traceActivityJson(activity, "stream "), + traceActivityJson(activity, streamName(activity)), // args us(kernel->queued), kernel->deviceId, kernel->contextId, kernel->streamId, kernel->correlationId, ext.correlationId(), kernel->registersPerThread, kernel->staticSharedMemory + kernel->dynamicSharedMemory, + blocks_per_sm, warps_per_sm, kernel->gridX, kernel->gridY, kernel->gridZ, - kernel->blockX, kernel->blockY, kernel->blockZ); + kernel->blockX, kernel->blockY, kernel->blockZ, + (int) (0.5 + occupancy * 100.0)); // clang-format on handleLinkEnd(activity); @@ -349,7 +403,7 @@ void ChromeTraceLogger::handleGpuActivity( "bytes": {}, "memory bandwidth (GB/s)": {} }} }},)JSON", - traceActivityJson(activity, "stream "), + traceActivityJson(activity, streamName(activity)), // args memcpy.deviceId, memcpy.contextId, memcpy.streamId, memcpy.correlationId, ext.correlationId(), @@ -378,7 +432,7 @@ void ChromeTraceLogger::handleGpuActivity( "bytes": {}, "memory bandwidth (GB/s)": {} }} }},)JSON", - traceActivityJson(activity, "stream "), + traceActivityJson(activity, streamName(activity)), // args memcpy.srcDeviceId, memcpy.deviceId, memcpy.dstDeviceId, memcpy.srcContextId, memcpy.contextId, memcpy.dstContextId, @@ -406,7 +460,7 @@ void ChromeTraceLogger::handleGpuActivity( "bytes": {}, "memory bandwidth (GB/s)": {} }} }},)JSON", - traceActivityJson(activity, "stream "), + traceActivityJson(activity, streamName(activity)), // args memset.deviceId, memset.contextId, memset.streamId, memset.correlationId, ext.correlationId(), diff --git a/libkineto/src/output_json.h b/libkineto/src/output_json.h index 84ac673c9..a0f2e6ae0 100644 --- a/libkineto/src/output_json.h +++ b/libkineto/src/output_json.h @@ -16,7 +16,7 @@ #ifdef HAS_CUPTI #include #endif -#include "ClientTraceActivity.h" +#include "GenericTraceActivity.h" #include "output_base.h" namespace libkineto { @@ -29,7 +29,7 @@ class Config; class ChromeTraceLogger : public libkineto::ActivityLogger { public: - explicit ChromeTraceLogger(const std::string& traceFileName, int smCount); + explicit ChromeTraceLogger(const std::string& traceFileName); // Note: the caller of these functions should handle concurrency // i.e., we these functions are not thread-safe @@ -41,14 +41,7 @@ class ChromeTraceLogger : public libkineto::ActivityLogger { void handleTraceSpan(const TraceSpan& span) override; - void handleIterationStart(const TraceSpan& span) override; - - void handleCpuActivity( - const libkineto::ClientTraceActivity& activity, - const TraceSpan& span) override; - - void handleGenericActivity( - const GenericTraceActivity& activity) override; + void handleGenericActivity(const GenericTraceActivity& activity) override; #ifdef HAS_CUPTI void handleRuntimeActivity( @@ -60,11 +53,18 @@ class ChromeTraceLogger : public libkineto::ActivityLogger { void handleGpuActivity(const GpuActivity& activity) override; #endif // HAS_CUPTI + void handleTraceStart( + const std::unordered_map& metadata) override; + void finalizeTrace( const Config& config, std::unique_ptr buffers, int64_t endTime) override; + std::string traceFileName() const { + return fileName_; + } + private: #ifdef HAS_CUPTI @@ -73,16 +73,14 @@ class ChromeTraceLogger : public libkineto::ActivityLogger { void handleLinkEnd(const TraceActivity& e); #endif // HAS_CUPTI - std::string fileName_; - std::ofstream traceOf_; + void addIterationMarker(const TraceSpan& span); - // Cache pid to avoid repeated calls to getpid() - pid_t pid_; + void openTraceFile(); -#ifdef HAS_CUPTI - // Number of SMs on current device - int smCount_{0}; -#endif + void handleGenericInstantEvent(const GenericTraceActivity& op); + + std::string fileName_; + std::ofstream traceOf_; }; } // namespace KINETO_NAMESPACE diff --git a/libkineto/src/output_membuf.h b/libkineto/src/output_membuf.h index 3e3c76e64..5c7ed36cd 100644 --- a/libkineto/src/output_membuf.h +++ b/libkineto/src/output_membuf.h @@ -17,7 +17,7 @@ #endif #include "Config.h" -#include "ClientTraceActivity.h" +#include "GenericTraceActivity.h" #ifdef HAS_CUPTI #include "CuptiActivity.h" #include "CuptiActivity.tpp" @@ -47,22 +47,10 @@ class MemoryTraceLogger : public ActivityLogger { } void handleTraceSpan(const TraceSpan& span) override { - traceSpanList_.push_back(span); + // Handled separately } - void handleIterationStart(const TraceSpan& span) override { - iterationList_.push_back(span); - } - - void handleCpuActivity( - const libkineto::ClientTraceActivity& activity, - const TraceSpan& span) override { - activities_.push_back( - std::make_unique(activity, span)); - } - - void handleGenericActivity( - const GenericTraceActivity& activity) override { + void handleGenericActivity(const GenericTraceActivity& activity) override { activities_.push_back( std::make_unique(activity)); } @@ -87,6 +75,11 @@ class MemoryTraceLogger : public ActivityLogger { } #endif // HAS_CUPTI + void handleTraceStart( + const std::unordered_map& metadata) override { + metadata_ = metadata; + } + void finalizeTrace( const Config& config, std::unique_ptr buffers, @@ -100,6 +93,7 @@ class MemoryTraceLogger : public ActivityLogger { } void log(ActivityLogger& logger) { + logger.handleTraceStart(metadata_); for (auto& activity : activities_) { activity->log(logger); } @@ -109,11 +103,8 @@ class MemoryTraceLogger : public ActivityLogger { for (auto& p : threadInfoList_) { logger.handleThreadInfo(p.first, p.second); } - for (auto& span : traceSpanList_) { - logger.handleTraceSpan(span); - } - for (auto& it : iterationList_) { - logger.handleIterationStart(it); + for (auto& cpu_trace_buffer : buffers_->cpu) { + logger.handleTraceSpan(cpu_trace_buffer->span); } // Hold on to the buffers logger.finalizeTrace(*config_, nullptr, endTime_); @@ -121,36 +112,13 @@ class MemoryTraceLogger : public ActivityLogger { private: - struct CpuActivityDecorator : public libkineto::TraceActivity { - CpuActivityDecorator( - const libkineto::ClientTraceActivity& activity, - const TraceSpan& span) - : wrappee_(activity), span_(span) {} - int64_t deviceId() const override {return wrappee_.deviceId();} - int64_t resourceId() const override {return wrappee_.resourceId();} - int64_t timestamp() const override {return wrappee_.timestamp();} - int64_t duration() const override {return wrappee_.duration();} - int64_t correlationId() const override {return wrappee_.correlationId();} - ActivityType type() const override {return wrappee_.type();} - const std::string name() const override {return wrappee_.name();} - const TraceActivity* linkedActivity() const override { - return wrappee_.linkedActivity(); - } - void log(ActivityLogger& logger) const override { - logger.handleCpuActivity(wrappee_, span_); - } - const libkineto::ClientTraceActivity& wrappee_; - const TraceSpan span_; - }; - std::unique_ptr config_; // Optimization: Remove unique_ptr by keeping separate vector per type std::vector> activities_; std::vector> processInfoList_; std::vector> threadInfoList_; - std::vector traceSpanList_; - std::vector iterationList_; std::unique_ptr buffers_; + std::unordered_map metadata_; int64_t endTime_{0}; }; diff --git a/libkineto/test/ActivityProfilerTest.cpp b/libkineto/test/ActivityProfilerTest.cpp index b99d4ef0f..053c4e610 100644 --- a/libkineto/test/ActivityProfilerTest.cpp +++ b/libkineto/test/ActivityProfilerTest.cpp @@ -5,16 +5,19 @@ * LICENSE file in the root directory of this source tree. */ -#include #include #include #include #include -#include -#include #include #include +#ifdef __linux__ +#include +#include +#include +#endif + #include "include/libkineto.h" #include "src/ActivityProfiler.h" #include "src/ActivityTrace.h" @@ -25,6 +28,7 @@ #include "src/output_membuf.h" #include "src/Logger.h" +#include "test/MockActivitySubProfiler.h" using namespace std::chrono; using namespace KINETO_NAMESPACE; @@ -32,22 +36,26 @@ using namespace KINETO_NAMESPACE; #define CUDA_LAUNCH_KERNEL CUPTI_RUNTIME_TRACE_CBID_cudaLaunchKernel_v7000 #define CUDA_MEMCPY CUPTI_RUNTIME_TRACE_CBID_cudaMemcpy_v3020 +namespace { +const TraceSpan& defaultTraceSpan() { + static TraceSpan span(0, 0, "Unknown", ""); + return span; +} +} + // Provides ability to easily create a few test CPU-side ops struct MockCpuActivityBuffer : public CpuTraceBuffer { MockCpuActivityBuffer(int64_t startTime, int64_t endTime) { - span = {startTime, endTime, 0, 1, "Test trace", ""}; + span = TraceSpan(startTime, endTime,"Test trace"); gpuOpCount = 0; } void addOp(std::string name, int64_t startTime, int64_t endTime, int64_t correlation) { - ClientTraceActivity op; - op.opType = name; + GenericTraceActivity op(span, ActivityType::CPU_OP, name); op.startTime = startTime; op.endTime = endTime; - op.device = 0; - op.pthreadId = pthread_self(); - op.sysThreadId = 123; - op.correlation = correlation; + op.resource = systemThreadId(); + op.id = correlation; activities.push_back(std::move(op)); span.opCount++; } @@ -71,7 +79,7 @@ struct MockCuptiActivityBuffer { start_us, end_us, correlation); act.kind = CUPTI_ACTIVITY_KIND_RUNTIME; act.cbid = cbid; - act.threadId = pthread_self(); + act.threadId = threadId(); activities.push_back(reinterpret_cast(&act)); } @@ -129,7 +137,7 @@ class MockCuptiActivities : public CuptiActivityInterface { } virtual const std::pair processActivities( - std::list& /*unused*/, + CuptiActivityBufferMap&, /*unused*/ std::function handler) override { for (CUpti_Activity* act : activityBuffer->activities) { handler(act); @@ -137,11 +145,13 @@ class MockCuptiActivities : public CuptiActivityInterface { return {activityBuffer->activities.size(), 100}; } - virtual std::unique_ptr> + virtual std::unique_ptr activityBuffers() override { - auto list = std::make_unique>(); - list->emplace_back(nullptr, 100); - return list; + auto map = std::make_unique(); + auto buf = std::make_unique(100); + uint8_t* addr = buf->data(); + (*map)[addr] = std::move(buf); + return map; } void bufferRequestedOverride(uint8_t** buffer, size_t* size, size_t* maxNumRecords) { @@ -159,17 +169,16 @@ class ActivityProfilerTest : public ::testing::Test { profiler_ = std::make_unique( cuptiActivities_, /*cpu only*/ false); cfg_ = std::make_unique(); - } - - std::list createCuptiActivityBuffers() { - std::list res; - res.emplace_back(nullptr, 100); - return res; + cfg_->validate(); + loggerFactory.addProtocol("file", [](const std::string& url) { + return std::unique_ptr(new ChromeTraceLogger(url)); + }); } std::unique_ptr cfg_; MockCuptiActivities cuptiActivities_; std::unique_ptr profiler_; + ActivityLoggerFactory loggerFactory; }; @@ -194,7 +203,7 @@ TEST(ActivityProfiler, AsyncTrace) { EXPECT_TRUE(success); EXPECT_FALSE(profiler.isActive()); - auto logger = std::make_unique(cfg.activitiesLogFile(), 10); + auto logger = std::make_unique(cfg.activitiesLogFile()); auto now = system_clock::now(); profiler.configure(cfg, now); profiler.setLogger(logger.get()); @@ -222,6 +231,7 @@ TEST(ActivityProfiler, AsyncTrace) { // Assert that tracing has completed EXPECT_FALSE(profiler.isActive()); +#ifdef __linux__ // Check that the expected file was written and that it has some content int fd = open(filename, O_RDONLY); if (!fd) { @@ -232,6 +242,7 @@ TEST(ActivityProfiler, AsyncTrace) { struct stat buf{}; fstat(fd, &buf); EXPECT_GT(buf.st_size, 100); +#endif } @@ -253,6 +264,8 @@ TEST_F(ActivityProfilerTest, SyncTrace) { profiler.startTrace(start_time); profiler.stopTrace(start_time + microseconds(duration_us)); + profiler.recordThreadInfo(); + // Log some cpu ops auto cpuOps = std::make_unique( start_time_us, start_time_us + duration_us); @@ -279,7 +292,7 @@ TEST_F(ActivityProfilerTest, SyncTrace) { profiler_->reset(); // Wrapper that allows iterating over the activities - ActivityTrace trace(std::move(logger), cuptiActivities_); + ActivityTrace trace(std::move(logger), loggerFactory); EXPECT_EQ(trace.activities()->size(), 9); std::map activityCounts; std::map resourceIds; @@ -298,12 +311,14 @@ TEST_F(ActivityProfilerTest, SyncTrace) { EXPECT_EQ(activityCounts["kernel"], 2); EXPECT_EQ(activityCounts["Memcpy HtoD (Pinned -> Device)"], 1); - // Ops and runtime events are on thread 123 - EXPECT_EQ(resourceIds[123], 6); + auto sysTid = systemThreadId(); + // Ops and runtime events are on thread sysTid + EXPECT_EQ(resourceIds[sysTid], 6); // Kernels are on stream 1, memcpy on stream 2 EXPECT_EQ(resourceIds[1], 2); EXPECT_EQ(resourceIds[2], 1); +#ifdef __linux__ char filename[] = "/tmp/libkineto_testXXXXXX.json"; mkstemps(filename, 5); trace.save(filename); @@ -317,6 +332,7 @@ TEST_F(ActivityProfilerTest, SyncTrace) { struct stat buf{}; fstat(fd, &buf); EXPECT_GT(buf.st_size, 100); +#endif } TEST_F(ActivityProfilerTest, CorrelatedTimestampTest) { @@ -338,6 +354,8 @@ TEST_F(ActivityProfilerTest, CorrelatedTimestampTest) { // When launching kernel, the CPU event should always precede the GPU event. int64_t kernelLaunchTime = 120; + profiler.recordThreadInfo(); + // set up CPU event auto cpuOps = std::make_unique( start_time_us, start_time_us + duration_us); @@ -354,7 +372,7 @@ TEST_F(ActivityProfilerTest, CorrelatedTimestampTest) { auto logger = std::make_unique(*cfg_); profiler.processTrace(*logger); - ActivityTrace trace(std::move(logger), cuptiActivities_); + ActivityTrace trace(std::move(logger), loggerFactory); std::map counts; for (auto& activity : *trace.activities()) { counts[activity->name()]++; @@ -365,6 +383,78 @@ TEST_F(ActivityProfilerTest, CorrelatedTimestampTest) { EXPECT_EQ(counts["launchKernel"], 1); } +TEST_F(ActivityProfilerTest, SubActivityProfilers) { + using ::testing::Return; + using ::testing::ByMove; + + // Verbose logging is useful for debugging + std::vector log_modules( + {"ActivityProfiler.cpp"}); + SET_LOG_VERBOSITY_LEVEL(2, log_modules); + + // Setup example events to test + GenericTraceActivity ev{defaultTraceSpan(), ActivityType::GLOW_RUNTIME, ""}; + ev.device = 1; + ev.resource = 0; + + int64_t start_time_us = 100; + int64_t duration_us = 1000; + auto start_time = time_point(microseconds(start_time_us)); + + std::vector test_activities{3, ev}; + test_activities[0].startTime = start_time_us; + test_activities[0].endTime = start_time_us + 5000; + test_activities[0].activityName = "SubGraph A execution"; + test_activities[1].startTime = start_time_us; + test_activities[1].endTime = start_time_us + 2000; + test_activities[1].activityName = "Operator foo"; + test_activities[2].startTime = start_time_us + 2500; + test_activities[2].endTime = start_time_us + 2900; + test_activities[2].activityName = "Operator bar"; + + auto mock_activity_profiler = + std::make_shared(test_activities); + + MockCuptiActivities activities; + ActivityProfiler profiler(activities, /*cpu only*/ true); + profiler.addActivityProfiler(mock_activity_profiler); + + profiler.configure(*cfg_, start_time); + profiler.startTrace(start_time); + EXPECT_TRUE(profiler.isActive()); + + profiler.stopTrace(start_time + microseconds(duration_us)); + EXPECT_TRUE(profiler.isActive()); + + char filename[] = "/tmp/libkineto_testXXXXXX.json"; + mkstemps(filename, 5); + LOG(INFO) << "Logging to tmp file " << filename; + + // process trace + auto logger = std::make_unique(*cfg_); + profiler.processTrace(*logger); + profiler.setLogger(logger.get()); + + ActivityTrace trace(std::move(logger), loggerFactory); + trace.save(filename); + const auto& traced_activites = trace.activities(); + + // Test we have all the events + EXPECT_EQ(traced_activites->size(), test_activities.size()); + + // Check that the expected file was written and that it has some content + int fd = open(filename, O_RDONLY); + if (!fd) { + perror(filename); + } + EXPECT_TRUE(fd); + + // Should expect at least 100 bytes + struct stat buf{}; + fstat(fd, &buf); + EXPECT_GT(buf.st_size, 100); +} + TEST_F(ActivityProfilerTest, BufferSizeLimitTestWarmup) { ActivityProfiler profiler(cuptiActivities_, /*cpu only*/ false); @@ -383,9 +473,6 @@ TEST_F(ActivityProfilerTest, BufferSizeLimitTestWarmup) { size_t gpuBufferSize; size_t maxNumRecords; cuptiActivities_.bufferRequestedOverride(&buf, &gpuBufferSize, &maxNumRecords); - - // we don't actually do anything with the buf so just free it to prevent leaks in tests - free(buf); } profiler.performRunLoopStep(now, now); diff --git a/libkineto/test/ConfigTest.cpp b/libkineto/test/ConfigTest.cpp index e0c469b08..81f09cb25 100644 --- a/libkineto/test/ConfigTest.cpp +++ b/libkineto/test/ConfigTest.cpp @@ -72,12 +72,9 @@ TEST(ParseTest, Format) { TEST(ParseTest, DefaultActivityTypes) { Config cfg; cfg.validate(); + auto all_activities = activityTypes(); EXPECT_EQ(cfg.selectedActivityTypes(), - std::set({ActivityType::GPU_MEMCPY, - ActivityType::GPU_MEMSET, - ActivityType::CONCURRENT_KERNEL, - ActivityType::EXTERNAL_CORRELATION, - ActivityType::CUDA_RUNTIME})); + std::set(all_activities.begin(), all_activities.end())); } TEST(ParseTest, ActivityTypes) { @@ -87,14 +84,19 @@ TEST(ParseTest, ActivityTypes) { EXPECT_FALSE(cfg.parse("=ACTIVITY_TYPES=")); EXPECT_EQ(cfg.selectedActivityTypes(), - std::set({ActivityType::GPU_MEMCPY, + std::set({ActivityType::CPU_OP, + ActivityType::CPU_INSTANT_EVENT, + ActivityType::USER_ANNOTATION, + ActivityType::GPU_USER_ANNOTATION, + ActivityType::GPU_MEMCPY, ActivityType::GPU_MEMSET, ActivityType::CONCURRENT_KERNEL, ActivityType::EXTERNAL_CORRELATION, + ActivityType::GLOW_RUNTIME, ActivityType::CUDA_RUNTIME})); Config cfg2; - EXPECT_TRUE(cfg2.parse("ACTIVITY_TYPES=gpu_memcpy,gpu_MeMsEt,concurrent_kernel")); + EXPECT_TRUE(cfg2.parse("ACTIVITY_TYPES=gpu_memcpy,gpu_MeMsEt,kernel")); EXPECT_EQ(cfg2.selectedActivityTypes(), std::set({ActivityType::GPU_MEMCPY, ActivityType::GPU_MEMSET, @@ -102,21 +104,14 @@ TEST(ParseTest, ActivityTypes) { EXPECT_TRUE(cfg2.parse("ACTIVITY_TYPES = cuda_Runtime,")); EXPECT_EQ(cfg2.selectedActivityTypes(), - std::set({ActivityType::GPU_MEMCPY, - ActivityType::GPU_MEMSET, - ActivityType::CUDA_RUNTIME, - ActivityType::CONCURRENT_KERNEL})); + std::set({ActivityType::CUDA_RUNTIME})); // Should throw an exception because incorrect activity name EXPECT_FALSE(cfg2.parse("ACTIVITY_TYPES = memcopy,cuda_runtime")); - EXPECT_TRUE(cfg2.parse("ACTIVITY_TYPES = external_correlation")); + EXPECT_TRUE(cfg2.parse("ACTIVITY_TYPES = cpu_op")); EXPECT_EQ(cfg2.selectedActivityTypes(), - std::set({ActivityType::GPU_MEMCPY, - ActivityType::GPU_MEMSET, - ActivityType::CONCURRENT_KERNEL, - ActivityType::EXTERNAL_CORRELATION, - ActivityType::CUDA_RUNTIME})); + std::set({ActivityType::CPU_OP})); } TEST(ParseTest, SamplePeriod) { diff --git a/libkineto/test/EventProfilerTest.cpp b/libkineto/test/EventProfilerTest.cpp index ef3f46142..ca1f20087 100644 --- a/libkineto/test/EventProfilerTest.cpp +++ b/libkineto/test/EventProfilerTest.cpp @@ -59,7 +59,7 @@ TEST(PercentileTest, Normalize) { TEST(EventTest, SumSamples) { Event ev; ev.instanceCount = 4; - auto t = high_resolution_clock::now(); + auto t = system_clock::now(); ev.addSample(t, {1, 2, 3, 4}); ev.addSample(t, {10, 20, 30, 40}); ev.addSample(t, {100, 200, 300, 400}); @@ -94,7 +94,7 @@ TEST(EventTest, SumSamples) { TEST(EventTest, Percentiles) { Event ev; ev.instanceCount = 4; - auto t = high_resolution_clock::now(); + auto t = system_clock::now(); ev.addSample(t, {3, 2, 1, 4}); ev.addSample(t, {30, 20, 10, 40}); ev.addSample(t, {300, 200, 100, 400}); diff --git a/libkineto/test/MockActivitySubProfiler.cpp b/libkineto/test/MockActivitySubProfiler.cpp new file mode 100644 index 000000000..cecae5cee --- /dev/null +++ b/libkineto/test/MockActivitySubProfiler.cpp @@ -0,0 +1,54 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * All rights reserved. + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include +#include +#include + +#include "test/MockActivitySubProfiler.h" + +namespace libkineto { + +const std::set supported_activities {ActivityType::CPU_OP}; +const std::string profile_name{"MockProfiler"}; + +void MockProfilerSession::processTrace(ActivityLogger& logger) { + for (const auto& activity: activities()) { + activity.log(logger); + } +} + +const std::string& MockActivityProfiler::name() const { + return profile_name; +} + +const std::set& MockActivityProfiler::availableActivities() const { + return supported_activities; +} + +MockActivityProfiler::MockActivityProfiler( + std::vector& activities) : + test_activities_(activities) {}; + +std::unique_ptr MockActivityProfiler::configure( + const std::set& /*activity_types*/, + const std::string& /*config*/) { + auto session = std::make_unique(); + session->set_test_activities(std::move(test_activities_)); + return session; +}; + +std::unique_ptr MockActivityProfiler::configure( + int64_t /*ts_ms*/, + int64_t /*duration_ms*/, + const std::set& activity_types, + const std::string& config) { + return configure(activity_types, config); +}; + +} // namespace libkineto + diff --git a/libkineto/test/MockActivitySubProfiler.h b/libkineto/test/MockActivitySubProfiler.h new file mode 100644 index 000000000..10af468ab --- /dev/null +++ b/libkineto/test/MockActivitySubProfiler.h @@ -0,0 +1,77 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * All rights reserved. + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include +#include +#include + +#include "include/IActivityProfiler.h" + +namespace libkineto { + +class MockProfilerSession: public IActivityProfilerSession { + + public: + explicit MockProfilerSession() {} + + void start() override { + start_count++; + status_ = TraceStatus::RECORDING; + } + + void stop() override { + stop_count++; + status_ = TraceStatus::PROCESSING; + } + + std::vector& activities() override { + return test_activities_; + } + + std::vector errors() override { + return {}; + } + + void processTrace(ActivityLogger& logger) override; + + void set_test_activities(std::vector&& acs) { + test_activities_ = std::move(acs); + } + + int start_count = 0; + int stop_count = 0; + private: + std::vector test_activities_; +}; + + +class MockActivityProfiler: public IActivityProfiler { + + public: + explicit MockActivityProfiler(std::vector& activities); + + const std::string& name() const override; + + const std::set& availableActivities() const override; + + std::unique_ptr configure( + const std::set& activity_types, + const std::string& config = "") override; + + std::unique_ptr configure( + int64_t ts_ms, + int64_t duration_ms, + const std::set& activity_types, + const std::string& config = "") override; + + private: + std::vector test_activities_; +}; + +} // namespace libkineto diff --git a/libkineto/test/PidInfoTest.cpp b/libkineto/test/PidInfoTest.cpp index c3a04342e..324b45e40 100644 --- a/libkineto/test/PidInfoTest.cpp +++ b/libkineto/test/PidInfoTest.cpp @@ -5,10 +5,9 @@ * LICENSE file in the root directory of this source tree. */ -#include "src/ThreadName.h" +#include "include/ThreadUtil.h" #include -#include #include #include @@ -18,54 +17,16 @@ using namespace KINETO_NAMESPACE; TEST(ThreadNameTest, setAndGet) { setThreadName("ThreadNameTest"); - EXPECT_EQ(getThreadName(getpid()), "ThreadNameTest"); + EXPECT_EQ(getThreadName(), "ThreadNameTest"); setThreadName(""); - EXPECT_EQ(getThreadName(getpid()), ""); + EXPECT_EQ(getThreadName(), ""); // Spaces etc are ok setThreadName("Name w/ spaces"); - EXPECT_EQ(getThreadName(getpid()), "Name w/ spaces"); + EXPECT_EQ(getThreadName(), "Name w/ spaces"); // More than 16 chars is not OK setThreadName("More than 16 characters"); - EXPECT_EQ(getThreadName(getpid()), "Name w/ spaces"); + EXPECT_EQ(getThreadName(), "Name w/ spaces"); } - -TEST(ThreadNameTest, invalidThread) { - EXPECT_EQ(getThreadName(123456789), "Unknown"); -} - -TEST(ThreadNameTest, otherThread) { - std::atomic_bool stop_flag; - std::atomic_int tid = 0; - std::thread thread([&stop_flag, &tid]() { - setThreadName("New Thread"); - tid = syscall(SYS_gettid); - while (!stop_flag) {} - }); - while (!tid) {} - EXPECT_EQ(getThreadName(tid), "New Thread"); - stop_flag = true; - thread.join(); -} - -TEST(ThreadNameTest, deadThread) { - std::atomic_bool stop_flag; - std::atomic_int tid = 0; - std::thread thread([&stop_flag, &tid]() { - setThreadName("New Thread"); - tid = syscall(SYS_gettid); - while (!stop_flag) {} - }); - while (!tid) {} - stop_flag = true; - thread.join(); - // There appears to be a delay before the thread info is - // removed from proc - we can therefore expect either - // "Unknown" or "New Thread" to be returned. - std::string name = getThreadName(tid); - EXPECT_TRUE(name == "Unknown" || name == "New Thread") - << "Where name = " << name; -} -