diff --git a/libkineto/include/ActivityType.h b/libkineto/include/ActivityType.h index 729f2d5b2..1ecf1f222 100644 --- a/libkineto/include/ActivityType.h +++ b/libkineto/include/ActivityType.h @@ -7,16 +7,29 @@ #pragma once +#include +#include + namespace libkineto { enum class ActivityType { CPU_OP = 0, // cpu side ops + USER_ANNOTATION, + GPU_USER_ANNOTATION, GPU_MEMCPY, GPU_MEMSET, CONCURRENT_KERNEL, // on-device kernels EXTERNAL_CORRELATION, CUDA_RUNTIME, // host side cuda runtime events CPU_INSTANT_EVENT, // host side point-like events + ENUM_COUNT }; +const char* toString(ActivityType t); +ActivityType toActivityType(const std::string& str); + +// Return an array of all activity types except COUNT +constexpr int activityTypeCount = (int)ActivityType::ENUM_COUNT; +const std::array activityTypes(); + } // namespace libkineto diff --git a/libkineto/include/GenericTraceActivity.h b/libkineto/include/GenericTraceActivity.h index 0ffdaad9a..9eb7a6cb9 100644 --- a/libkineto/include/GenericTraceActivity.h +++ b/libkineto/include/GenericTraceActivity.h @@ -14,19 +14,28 @@ #include "ThreadUtil.h" #include "TraceActivity.h" +#include "TraceSpan.h" namespace libkineto { // @lint-ignore-every CLANGTIDY cppcoreguidelines-non-private-member-variables-in-classes // @lint-ignore-every CLANGTIDY cppcoreguidelines-pro-type-member-init -struct GenericTraceActivity : TraceActivity { +class GenericTraceActivity : public TraceActivity { + + public: + GenericTraceActivity() = delete; + + GenericTraceActivity( + const TraceSpan& trace, ActivityType type, const std::string& name) + : activityType(type), activityName(name), traceSpan_(&trace) { + } int64_t deviceId() const override { return device; } int64_t resourceId() const override { - return sysThreadId; + return resource; } int64_t timestamp() const override { @@ -38,7 +47,7 @@ struct GenericTraceActivity : TraceActivity { } int64_t correlationId() const override { - return correlation; + return id; } ActivityType type() const override { @@ -53,6 +62,10 @@ struct GenericTraceActivity : TraceActivity { return nullptr; } + const TraceSpan* traceSpan() const override { + return traceSpan_; + } + void log(ActivityLogger& logger) const override; //Encode client side metadata as a key/value string. @@ -68,13 +81,14 @@ struct GenericTraceActivity : TraceActivity { int64_t startTime{0}; int64_t endTime{0}; - int64_t correlation{0}; - int device{-1}; - int32_t sysThreadId{0}; - std::string activityName; + int32_t id{0}; + int32_t device{0}; + int32_t resource{0}; ActivityType activityType; + std::string activityName; private: + const TraceSpan* traceSpan_; std::vector metadata_; }; diff --git a/libkineto/include/TraceActivity.h b/libkineto/include/TraceActivity.h index 5ac17ab25..fbf0dc673 100644 --- a/libkineto/include/TraceActivity.h +++ b/libkineto/include/TraceActivity.h @@ -14,6 +14,7 @@ namespace libkineto { class ActivityLogger; +struct TraceSpan; // Generic activity interface is borrowed from tensorboard protobuf format. struct TraceActivity { @@ -33,6 +34,8 @@ struct TraceActivity { virtual const std::string name() const = 0; // Optional linked activity virtual const TraceActivity* linkedActivity() const = 0; + // Optional containing trace object + virtual const TraceSpan* traceSpan() const = 0; // Log activity virtual void log(ActivityLogger& logger) const = 0; diff --git a/libkineto/include/TraceSpan.h b/libkineto/include/TraceSpan.h index eac3e793a..f33e18fb9 100644 --- a/libkineto/include/TraceSpan.h +++ b/libkineto/include/TraceSpan.h @@ -7,12 +7,26 @@ #pragma once +#include #include #include namespace libkineto { struct TraceSpan { + TraceSpan() = delete; + TraceSpan( + int64_t startTime, int64_t endTime, std::string name) + : startTime(startTime), endTime(endTime), name(std::move(name)) { + } + TraceSpan( + int opCount, int it, std::string name, std::string prefix) + : opCount(opCount), + iteration(it), + name(std::move(name)), + prefix(std::move(prefix)) { + } + // FIXME: change to duration? int64_t startTime{0}; int64_t endTime{0}; @@ -20,8 +34,10 @@ struct TraceSpan { int iteration{-1}; // Name is used to identify timeline std::string name; - // Prefix used to distinguish sub-nets on the same timeline + // Prefix used to distinguish trace spans on the same timeline std::string prefix; + // Tracked by profiler for iteration trigger + bool tracked{false}; }; } // namespace libkineto diff --git a/libkineto/include/libkineto.h b/libkineto/include/libkineto.h index e07b68aa0..f0ab1d619 100644 --- a/libkineto/include/libkineto.h +++ b/libkineto/include/libkineto.h @@ -38,7 +38,7 @@ namespace libkineto { class Config; struct CpuTraceBuffer { - TraceSpan span; + TraceSpan span{0, 0, "none"}; int gpuOpCount; std::vector activities; }; diff --git a/libkineto/src/ActivityProfiler.cpp b/libkineto/src/ActivityProfiler.cpp index 34716cf45..81f779f2f 100644 --- a/libkineto/src/ActivityProfiler.cpp +++ b/libkineto/src/ActivityProfiler.cpp @@ -38,7 +38,7 @@ using std::string; namespace KINETO_NAMESPACE { bool ActivityProfiler::iterationTargetMatch( - const libkineto::CpuTraceBuffer& trace) { + libkineto::CpuTraceBuffer& trace) { const string& name = trace.span.name; bool match = (name == netIterationsTarget_); if (!match && applyNetFilterInternal(name) && @@ -53,6 +53,7 @@ bool ActivityProfiler::iterationTargetMatch( } if (match) { netIterationsTarget_ = name; + trace.span.tracked = true; LOG(INFO) << "Tracking net " << name << " for " << netIterationsTargetCount_ << " iterations"; } @@ -163,8 +164,7 @@ void ActivityProfiler::processTraceInternal(ActivityLogger& logger) { ActivityProfiler::CpuGpuSpanPair& ActivityProfiler::recordTraceSpan( TraceSpan& span, int gpuOpCount) { - TraceSpan gpu_span{ - 0, 0, gpuOpCount, span.iteration, span.name, "GPU: "}; + TraceSpan gpu_span(gpuOpCount, span.iteration, span.name, "GPU: "); auto& iterations = traceSpans_[span.name]; iterations.push_back({span, gpu_span}); return iterations.back(); @@ -183,8 +183,8 @@ void ActivityProfiler::processCpuTrace( TraceSpan& cpu_span = span_pair.first; for (auto const& act : cpuTrace.activities) { VLOG(2) << act.correlationId() << ": OP " << act.activityName; - if (logTrace) { - logger.handleCpuActivity(act, cpu_span); + if (logTrace && config_->selectedActivityTypes().count(act.type())) { + act.log(logger); } // Stash event so we can look it up later when processing GPU trace externalEvents_.insertEvent(&act); @@ -192,9 +192,6 @@ void ActivityProfiler::processCpuTrace( } if (logTrace) { logger.handleTraceSpan(cpu_span); - if (cpu_span.name == netIterationsTarget_) { - logger.handleIterationStart(cpu_span); - } } else { disabledTraceSpans_.insert(cpu_span.name); } @@ -203,41 +200,21 @@ void ActivityProfiler::processCpuTrace( #ifdef HAS_CUPTI inline void ActivityProfiler::handleCorrelationActivity( const CUpti_ActivityExternalCorrelation* correlation) { - switch(correlation->externalKind) { - case CUPTI_EXTERNAL_CORRELATION_KIND_CUSTOM0: - externalEvents_.addCorrelation( - correlation->externalId, - correlation->correlationId, - ExternalEventMap::CorrelationFlowType::Default); - break; - case CUPTI_EXTERNAL_CORRELATION_KIND_CUSTOM1: - externalEvents_.addCorrelation( - correlation->externalId, - correlation->correlationId, - ExternalEventMap::CorrelationFlowType::User); - break; - default: - LOG(ERROR) << "Received correlation activity with undefined kind: " - << correlation->externalKind; - break; - } - VLOG(2) << correlation->correlationId - << ": CUPTI_ACTIVITY_KIND_EXTERNAL_CORRELATION"; + externalEvents_.addCorrelation( + correlation->externalId, correlation->correlationId); } #endif // HAS_CUPTI const libkineto::GenericTraceActivity& -ActivityProfiler::ExternalEventMap::getGenericTraceActivity( - uint32_t id, CorrelationFlowType flowType) { - static const libkineto::GenericTraceActivity nullOp_{}; +ActivityProfiler::ExternalEventMap::correlatedActivity(uint32_t id) { + static const libkineto::GenericTraceActivity nullOp_( + defaultTraceSpan().first, ActivityType::CPU_OP, "NULL"); - auto& correlationMap = getCorrelationMap(flowType); - - auto* res = events_[correlationMap[id]]; + auto* res = events_[correlationMap_[id]]; if (res == nullptr) { // Entry may be missing because cpu trace hasn't been processed yet // Insert a dummy element so that we can check for this in insertEvent - events_[correlationMap[id]] = &nullOp_; + events_[correlationMap_[id]] = &nullOp_; res = &nullOp_; } return *res; @@ -253,52 +230,59 @@ void ActivityProfiler::ExternalEventMap::insertEvent( } void ActivityProfiler::ExternalEventMap::addCorrelation( - uint64_t external_id, uint32_t cuda_id, CorrelationFlowType flowType) { - switch(flowType){ - case Default: - defaultCorrelationMap_[cuda_id] = external_id; - break; - case User: - userCorrelationMap_[cuda_id] = external_id; - break; - } + uint64_t external_id, uint32_t cuda_id) { + correlationMap_[cuda_id] = external_id; } -static void initUserGpuSpan(GenericTraceActivity& userTraceActivity, - const libkineto::TraceActivity& cpuTraceActivity, - const libkineto::TraceActivity& gpuTraceActivity) { - userTraceActivity.device = gpuTraceActivity.deviceId(); - userTraceActivity.startTime = gpuTraceActivity.timestamp(); - userTraceActivity.sysThreadId = gpuTraceActivity.resourceId(); - userTraceActivity.endTime = gpuTraceActivity.timestamp() + gpuTraceActivity.duration(); - userTraceActivity.correlation = cpuTraceActivity.correlationId(); - userTraceActivity.activityType = cpuTraceActivity.type(); - userTraceActivity.activityName = cpuTraceActivity.name(); +static GenericTraceActivity createUserGpuSpan( + const libkineto::TraceActivity& cpuTraceActivity, + const libkineto::TraceActivity& gpuTraceActivity) { + GenericTraceActivity res( + *cpuTraceActivity.traceSpan(), + ActivityType::GPU_USER_ANNOTATION, + cpuTraceActivity.name()); + res.startTime = gpuTraceActivity.timestamp(); + res.device = gpuTraceActivity.deviceId(); + res.resource = gpuTraceActivity.resourceId(); + res.endTime = + gpuTraceActivity.timestamp() + gpuTraceActivity.duration(); + res.id = cpuTraceActivity.correlationId(); + return res; } void ActivityProfiler::GpuUserEventMap::insertOrExtendEvent( - const TraceActivity& cpuTraceActivity, - const TraceActivity& gpuTraceActivity) { - StreamKey key(gpuTraceActivity.deviceId(), gpuTraceActivity.resourceId()); - CorrelationSpanMap& correlationSpanMap = streamSpanMap[key]; - if (correlationSpanMap.count(cpuTraceActivity.correlationId()) == 0) { - GenericTraceActivity& userTraceActivity = correlationSpanMap[cpuTraceActivity.correlationId()]; - initUserGpuSpan(userTraceActivity, cpuTraceActivity, gpuTraceActivity); - } - GenericTraceActivity& userTraceActivity = correlationSpanMap[cpuTraceActivity.correlationId()]; - if (gpuTraceActivity.timestamp() < userTraceActivity.startTime || userTraceActivity.startTime == 0) { - userTraceActivity.startTime = gpuTraceActivity.timestamp(); - } - if ((gpuTraceActivity.timestamp() + gpuTraceActivity.duration()) > userTraceActivity.endTime) { - userTraceActivity.endTime = gpuTraceActivity.timestamp() + gpuTraceActivity.duration(); + const TraceActivity&, + const TraceActivity& gpuActivity) { + const TraceActivity& cpuActivity = *gpuActivity.linkedActivity(); + StreamKey key(gpuActivity.deviceId(), gpuActivity.resourceId()); + CorrelationSpanMap& correlationSpanMap = streamSpanMap_[key]; + auto it = correlationSpanMap.find(cpuActivity.correlationId()); + if (it == correlationSpanMap.end()) { + auto it_success = correlationSpanMap.insert({ + cpuActivity.correlationId(), createUserGpuSpan(cpuActivity, gpuActivity) + }); + it = it_success.first; + } + GenericTraceActivity& span = it->second; + if (gpuActivity.timestamp() < span.startTime || span.startTime == 0) { + span.startTime = gpuActivity.timestamp(); + } + int64_t gpu_activity_end = gpuActivity.timestamp() + gpuActivity.duration(); + if (gpu_activity_end > span.endTime) { + span.endTime = gpu_activity_end; } } +const ActivityProfiler::CpuGpuSpanPair& ActivityProfiler::defaultTraceSpan() { + static TraceSpan span(0, 0, "Unknown", ""); + static CpuGpuSpanPair span_pair(span, span); + return span_pair; +} + void ActivityProfiler::GpuUserEventMap::logEvents(ActivityLogger *logger) { - for (auto const& streamMapPair : streamSpanMap) { + for (auto const& streamMapPair : streamSpanMap_) { for (auto const& correlationSpanPair : streamMapPair.second) { - logger->handleGenericActivity( - correlationSpanPair.second); + correlationSpanPair.second.log(*logger); } } } @@ -330,8 +314,7 @@ inline void ActivityProfiler::handleRuntimeActivity( << ": CUPTI_ACTIVITY_KIND_RUNTIME, cbid=" << activity->cbid << " tid=" << activity->threadId; const GenericTraceActivity& ext = - externalEvents_.getGenericTraceActivity(activity->correlationId, - ExternalEventMap::CorrelationFlowType::Default); + externalEvents_.correlatedActivity(activity->correlationId); int32_t tid = activity->threadId; const auto& it = threadInfo_.find(tid); if (it != threadInfo_.end()) { @@ -395,26 +378,31 @@ inline void ActivityProfiler::handleGpuActivity( if (!loggingDisabled(ext)) { act.log(*logger); updateGpuNetSpan(act); + /* const GenericTraceActivity& extUser = - externalEvents_.getGenericTraceActivity(act.correlationId(), - ExternalEventMap::CorrelationFlowType::User); + externalEvents_.correlatedActivity(act.correlationId()); // Correlated CPU activity cannot have timestamp greater than the GPU activity's if (!timestampsInCorrectOrder(extUser, act)) { return; } - if (extUser.correlationId() != 0) { VLOG(2) << extUser.correlationId() << "," << act.correlationId() << " (user): "<< act.name(); - gpuUserEventMap_.insertOrExtendEvent(extUser, act); +*/ + if (config_->selectedActivityTypes().count(ActivityType::GPU_USER_ANNOTATION) && + act.linkedActivity() && + act.linkedActivity()->type() == ActivityType::USER_ANNOTATION) { + //gpuUserEventMap_.insertOrExtendEvent(act, act); } +// } } } template -inline void ActivityProfiler::handleGpuActivity(const T* act, ActivityLogger* logger) { - const GenericTraceActivity& extDefault = externalEvents_.getGenericTraceActivity(act->correlationId, - ExternalEventMap::CorrelationFlowType::Default); +inline void ActivityProfiler::handleGpuActivity( + const T* act, ActivityLogger* logger) { + const GenericTraceActivity& extDefault = + externalEvents_.correlatedActivity(act->correlationId); handleGpuActivity(GpuActivity(act, extDefault), logger); } diff --git a/libkineto/src/ActivityProfiler.h b/libkineto/src/ActivityProfiler.h index 653f62c35..057888b2a 100644 --- a/libkineto/src/ActivityProfiler.h +++ b/libkineto/src/ActivityProfiler.h @@ -117,27 +117,25 @@ class ActivityProfiler { metadata_[key] = value; } + protected: + + using CpuGpuSpanPair = std::pair; + static const CpuGpuSpanPair& defaultTraceSpan(); + private: class ExternalEventMap { public: - enum CorrelationFlowType { - // Default flow type - Default, - // User annotated flow type - User - }; // The correlation id of the GPU activity - const libkineto::GenericTraceActivity& getGenericTraceActivity( - uint32_t correlation_id, CorrelationFlowType flowType); + const libkineto::GenericTraceActivity& correlatedActivity( + uint32_t correlation_id); void insertEvent(const libkineto::GenericTraceActivity* op); - void addCorrelation(uint64_t external_id, uint32_t cuda_id, CorrelationFlowType flowType); + void addCorrelation(uint64_t external_id, uint32_t cuda_id); void clear() { events_.clear(); - defaultCorrelationMap_.clear(); - userCorrelationMap_.clear(); + correlationMap_.clear(); } private: @@ -146,7 +144,7 @@ class ActivityProfiler { // but this class also fully owns the objects it is pointing to so // it's not so bad. This is done for performance reasons and is an // implementation detail of this class that might change. - std::unordered_map + std::unordered_map events_; // Cuda correlation id -> external correlation id for default events @@ -157,23 +155,7 @@ class ActivityProfiler { std::unordered_map< uint32_t, // Cuda correlation ID uint64_t> // External correlation ID - defaultCorrelationMap_; - - // Cuda correlation id -> external correlation id for user annotated - // events - // CUPTI provides a mechanism for correlating Cuda events to arbitrary - // external events, e.g.operator events from Caffe2. - // It also marks GPU activities with the Cuda event correlation ID. - // So by connecting the two, we get the complete picture. - std::unordered_map< - uint32_t, // Cuda correlation ID - uint64_t> // External correlation ID - userCorrelationMap_; - - std::unordered_map& - getCorrelationMap(CorrelationFlowType flowType) { - return flowType == User ? userCorrelationMap_ : defaultCorrelationMap_; - } + correlationMap_; }; // Map of gpu activities to user defined events @@ -188,16 +170,17 @@ class ActivityProfiler { void logEvents(ActivityLogger *logger); void clear() { - streamSpanMap.clear(); + streamSpanMap_.clear(); } private: // device id and stream name - typedef std::pair StreamKey; + using StreamKey = std::pair; // map of correlation id to TraceSpan - typedef std::unordered_map CorrelationSpanMap; - std::map streamSpanMap; + using CorrelationSpanMap = + std::unordered_map; + std::map streamSpanMap_; }; GpuUserEventMap gpuUserEventMap_; @@ -234,13 +217,11 @@ class ActivityProfiler { // Record client trace span for subsequent lookups from activities // Also creates a corresponding GPU-side span. - using CpuGpuSpanPair = std::pair; CpuGpuSpanPair& recordTraceSpan(TraceSpan& span, int gpuOpCount); // Returns true if net name is to be tracked for a specified number of // iterations. - bool iterationTargetMatch( - const libkineto::CpuTraceBuffer& trace); + bool iterationTargetMatch(libkineto::CpuTraceBuffer& trace); // net name to id int netId(const std::string& netName); @@ -264,7 +245,7 @@ class ActivityProfiler { // Is logging disabled for this event? // Logging can be disabled due to operator count, net name filter etc. - inline bool loggingDisabled(const libkineto::TraceActivity& act) { + inline bool loggingDisabled(const libkineto::TraceActivity& act) const { const auto& it = clientActivityTraceMap_.find(act.correlationId()); return it != clientActivityTraceMap_.end() && disabledTraceSpans_.find(it->second->first.name) != @@ -314,7 +295,8 @@ class ActivityProfiler { // Maintain a map of client trace activity to trace span. // Maps correlation id -> TraceSpan* held by traceSpans_. - std::unordered_map clientActivityTraceMap_; + using ActivityTraceMap = std::unordered_map; + ActivityTraceMap clientActivityTraceMap_; // Cache thread names and system thread ids for pthread ids // Note we're using the lower 32 bits of the (opaque) pthread id diff --git a/libkineto/src/ActivityType.cpp b/libkineto/src/ActivityType.cpp new file mode 100644 index 000000000..5fbb276e2 --- /dev/null +++ b/libkineto/src/ActivityType.cpp @@ -0,0 +1,59 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * All rights reserved. + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include "ActivityType.h" + +#include + +namespace libkineto { + +struct ActivityTypeName { + const char* name; + ActivityType type; +}; + +static constexpr std::array map{{ + {"cpu_op", ActivityType::CPU_OP}, + {"user_annotation", ActivityType::USER_ANNOTATION}, + {"gpu_user_Annotation", ActivityType::GPU_USER_ANNOTATION}, + {"gpu_memcpy", ActivityType::GPU_MEMCPY}, + {"gpu_memset", ActivityType::GPU_MEMSET}, + {"kernel", ActivityType::CONCURRENT_KERNEL}, + {"external_correlation", ActivityType::EXTERNAL_CORRELATION}, + {"cuda_runtime", ActivityType::CUDA_RUNTIME}, + {"cpu_instant_event", ActivityType::CPU_INSTANT_EVENT}, + {"ENUM_COUNT", ActivityType::ENUM_COUNT} +}}; + +static constexpr bool matchingOrder(int idx = 0) { + return map[idx].type == ActivityType::ENUM_COUNT || + ((idx == (int) map[idx].type) && matchingOrder(idx + 1)); +} +static_assert(matchingOrder(), "ActivityTypeName map is out of order"); + +const char* toString(ActivityType t) { + return map[(int)t].name; +} + +ActivityType toActivityType(const std::string& str) { + for (int i = 0; i < activityTypeCount; i++) { + if (str == map[i].name) { + return map[i].type; + } + } + throw std::invalid_argument(fmt::format("Invalid activity type: {}", str)); +} + +const std::array activityTypes() { + std::array res; + for (int i = 0; i < activityTypeCount; i++) { + res[i] = map[i].type; + } + return res; +} + +} // namespace libkineto diff --git a/libkineto/src/Config.cpp b/libkineto/src/Config.cpp index bb3927bc4..154ed5ac4 100644 --- a/libkineto/src/Config.cpp +++ b/libkineto/src/Config.cpp @@ -73,14 +73,6 @@ const string kActivitiesWarmupDurationSecsKey = "ACTIVITIES_WARMUP_PERIOD_SECS"; const string kActivitiesMaxGpuBufferSizeKey = "ACTIVITIES_MAX_GPU_BUFFER_SIZE_MB"; -// Valid configuration file entries for activity types -const string kActivityCpuOp = "cpu_op"; -const string kActivityMemcpy = "gpu_memcpy"; -const string kActivityMemset = "gpu_memset"; -const string kActivityConcurrentKernel = "concurrent_kernel"; -const string kActivityExternalCorrelation = "external_correlation"; -const string kActivityRuntime = "cuda_runtime"; - const string kDefaultLogFileFmt = "/tmp/libkineto_activities_{}.json"; // Common @@ -206,24 +198,8 @@ void Config::setActivityTypes( for (const auto& activity : selected_activities) { if (activity == "") { continue; - } else if (activity == kActivityCpuOp) { - selectedActivityTypes_.insert(ActivityType::CPU_OP); - } else if (activity == kActivityMemcpy) { - selectedActivityTypes_.insert(ActivityType::GPU_MEMCPY); - } else if (activity == kActivityMemset) { - selectedActivityTypes_.insert(ActivityType::GPU_MEMSET); - } else if (activity == kActivityConcurrentKernel) { - selectedActivityTypes_.insert(ActivityType::CONCURRENT_KERNEL); - } else if (activity == kActivityExternalCorrelation) { - selectedActivityTypes_.insert(ActivityType::EXTERNAL_CORRELATION); - } else if (activity == kActivityRuntime) { - selectedActivityTypes_.insert(ActivityType::CUDA_RUNTIME); - } else { - throw std::invalid_argument(fmt::format( - "Invalid activity type selected: {}", - activity - )); } + selectedActivityTypes_.insert(toActivityType(activity)); } } } @@ -236,9 +212,6 @@ bool Config::handleOption(const std::string& name, std::string& val) { } else if (name == kMetricsKey) { vector metric_names = splitAndTrim(val, ','); metricNames_.insert(metric_names.begin(), metric_names.end()); - } else if (name == kActivityTypesKey) { - vector activity_types = splitAndTrim(toLower(val), ','); - setActivityTypes(activity_types); } else if (name == kSamplePeriodKey) { samplePeriod_ = milliseconds(toInt32(val)); } else if (name == kMultiplexPeriodKey) { @@ -265,6 +238,9 @@ bool Config::handleOption(const std::string& name, std::string& val) { activitiesOnDemandDuration_ = duration_cast(seconds(toInt32(val))); activitiesOnDemandTimestamp_ = timestamp(); + } else if (name == kActivityTypesKey) { + vector activity_types = splitAndTrim(toLower(val), ','); + setActivityTypes(activity_types); } else if (name == kActivitiesDurationMsecsKey) { activitiesOnDemandDuration_ = milliseconds(toInt32(val)); activitiesOnDemandTimestamp_ = timestamp(); @@ -404,33 +380,12 @@ void Config::printActivityProfilerConfig(std::ostream& s) const { s << "Max GPU buffer size: " << activitiesMaxGpuBufferSize() / 1024 / 1024 << "MB" << std::endl; - s << "Enabled activities: "; + std::vector activities; for (const auto& activity : selectedActivityTypes_) { - switch(activity){ - case ActivityType::CPU_OP: - s << kActivityCpuOp << " "; - break; - case ActivityType::GPU_MEMCPY: - s << kActivityMemcpy << " "; - break; - case ActivityType::GPU_MEMSET: - s << kActivityMemset << " "; - break; - case ActivityType::CONCURRENT_KERNEL: - s << kActivityConcurrentKernel << " "; - break; - case ActivityType::EXTERNAL_CORRELATION: - s << kActivityExternalCorrelation << " "; - break; - case ActivityType::CUDA_RUNTIME: - s << kActivityRuntime << " "; - break; - default: - s << "UNKNOWN_ACTIVITY_NAME" << " "; - break; - } + activities.push_back(toString(activity)); } - s << std::endl; + s << "Enabled activities: " + << fmt::format("{}", fmt::join(activities, ",")) << std::endl; AbstractConfig::printActivityProfilerConfig(s); } diff --git a/libkineto/src/Config.h b/libkineto/src/Config.h index 7cde942ec..e8c94631f 100644 --- a/libkineto/src/Config.h +++ b/libkineto/src/Config.h @@ -303,12 +303,9 @@ class Config : public AbstractConfig { // Sets the default activity types to be traced void selectDefaultActivityTypes() { // If the user has not specified an activity list, add all types - selectedActivityTypes_.insert(ActivityType::CPU_OP); - selectedActivityTypes_.insert(ActivityType::GPU_MEMCPY); - selectedActivityTypes_.insert(ActivityType::GPU_MEMSET); - selectedActivityTypes_.insert(ActivityType::CONCURRENT_KERNEL); - selectedActivityTypes_.insert(ActivityType::EXTERNAL_CORRELATION); - selectedActivityTypes_.insert(ActivityType::CUDA_RUNTIME); + for (ActivityType t : activityTypes()) { + selectedActivityTypes_.insert(t); + } } int verboseLogLevel_; diff --git a/libkineto/src/CuptiActivity.h b/libkineto/src/CuptiActivity.h index fc5ce59f5..4ac38fe62 100644 --- a/libkineto/src/CuptiActivity.h +++ b/libkineto/src/CuptiActivity.h @@ -20,6 +20,7 @@ namespace libkineto { namespace KINETO_NAMESPACE { using namespace libkineto; +struct TraceSpan; // These classes wrap the various CUPTI activity types // into subclasses of TraceActivity so that they can all be accessed @@ -39,6 +40,7 @@ struct CuptiActivity : public TraceActivity { int64_t correlationId() const override {return activity_.correlationId;} const T& raw() const {return activity_;} const TraceActivity* linkedActivity() const override {return &linked_;} + const TraceSpan* traceSpan() const override {return nullptr;} protected: const T& activity_; diff --git a/libkineto/src/CuptiActivityInterface.cpp b/libkineto/src/CuptiActivityInterface.cpp index 979682eb3..987653964 100644 --- a/libkineto/src/CuptiActivityInterface.cpp +++ b/libkineto/src/CuptiActivityInterface.cpp @@ -30,6 +30,9 @@ CuptiActivityInterface& CuptiActivityInterface::singleton() { void CuptiActivityInterface::pushCorrelationID(int id, CorrelationFlowType type) { #ifdef HAS_CUPTI + if (!singleton().externalCorrelationEnabled_) { + return; + } VLOG(2) << "pushCorrelationID(" << id << ")"; switch(type) { case Default: @@ -45,6 +48,9 @@ void CuptiActivityInterface::pushCorrelationID(int id, CorrelationFlowType type) void CuptiActivityInterface::popCorrelationID(CorrelationFlowType type) { #ifdef HAS_CUPTI + if (!singleton().externalCorrelationEnabled_) { + return; + } switch(type) { case Default: CUPTI_CALL(cuptiActivityPopExternalCorrelationId( @@ -276,6 +282,7 @@ void CuptiActivityInterface::enableCuptiActivities( cuptiActivityRegisterCallbacks(bufferRequestedTrampoline, bufferCompletedTrampoline)); } + externalCorrelationEnabled_ = false; for (const auto& activity : selected_activities) { if (activity == ActivityType::GPU_MEMCPY) { CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_MEMCPY)); @@ -288,6 +295,7 @@ void CuptiActivityInterface::enableCuptiActivities( } if (activity == ActivityType::EXTERNAL_CORRELATION) { CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_EXTERNAL_CORRELATION)); + externalCorrelationEnabled_ = true; } if (activity == ActivityType::CUDA_RUNTIME) { CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_RUNTIME)); @@ -319,6 +327,7 @@ void CuptiActivityInterface::disableCuptiActivities( CUPTI_CALL(cuptiActivityDisable(CUPTI_ACTIVITY_KIND_RUNTIME)); } } + externalCorrelationEnabled_ = false; #endif } diff --git a/libkineto/src/CuptiActivityInterface.h b/libkineto/src/CuptiActivityInterface.h index 28392ae3c..af02570df 100644 --- a/libkineto/src/CuptiActivityInterface.h +++ b/libkineto/src/CuptiActivityInterface.h @@ -84,6 +84,7 @@ class CuptiActivityInterface { CuptiActivityBufferMap allocatedGpuTraceBuffers_; std::unique_ptr readyGpuTraceBuffers_; std::mutex mutex_; + bool externalCorrelationEnabled_{false}; protected: #ifdef HAS_CUPTI diff --git a/libkineto/src/GenericTraceActivity.cpp b/libkineto/src/GenericTraceActivity.cpp index 56e361776..8df6581a6 100644 --- a/libkineto/src/GenericTraceActivity.cpp +++ b/libkineto/src/GenericTraceActivity.cpp @@ -10,11 +10,6 @@ namespace libkineto { void GenericTraceActivity::log(ActivityLogger& logger) const { - // TODO(T89833634): Merge handleGenericTraceActivity and handleCpuActivity - if (activityType == ActivityType::CPU_OP) { - return; - } - logger.handleGenericActivity(*this); } } // namespace libkineto diff --git a/libkineto/src/output_base.h b/libkineto/src/output_base.h index 3121e2d0f..050a0828b 100644 --- a/libkineto/src/output_base.h +++ b/libkineto/src/output_base.h @@ -44,14 +44,8 @@ class ActivityLogger { virtual void handleTraceSpan(const TraceSpan& span) = 0; - virtual void handleIterationStart(const TraceSpan& span) = 0; - - virtual void handleCpuActivity( - const libkineto::GenericTraceActivity& activity, - const TraceSpan& span) = 0; - virtual void handleGenericActivity( - const GenericTraceActivity& activity) = 0; + const libkineto::GenericTraceActivity& activity) = 0; #ifdef HAS_CUPTI virtual void handleRuntimeActivity(const RuntimeActivity& activity) = 0; diff --git a/libkineto/src/output_json.cpp b/libkineto/src/output_json.cpp index de6fa9832..76f0bca0d 100644 --- a/libkineto/src/output_json.cpp +++ b/libkineto/src/output_json.cpp @@ -155,9 +155,13 @@ void ChromeTraceLogger::handleTraceSpan(const TraceSpan& span) { span.prefix, span.name, span.iteration, span.opCount); // clang-format on + + if (span.tracked) { + addIterationMarker(span); + } } -void ChromeTraceLogger::handleIterationStart(const TraceSpan& span) { +void ChromeTraceLogger::addIterationMarker(const TraceSpan& span) { if (!traceOf_) { return; } @@ -173,12 +177,13 @@ void ChromeTraceLogger::handleIterationStart(const TraceSpan& span) { // clang-format on } -static std::string traceActivityJson(const TraceActivity& activity, std::string tidPrefix) { +static std::string traceActivityJson( + const TraceActivity& activity, std::string tid) { // clang-format off return fmt::format(R"JSON( - "name": "{}", "pid": {}, "tid": "{}{}", + "name": "{}", "pid": {}, "tid": "{}", "ts": {}, "dur": {})JSON", - activity.name(), activity.deviceId(), tidPrefix, (uint32_t)activity.resourceId(), + activity.name(), activity.deviceId(), tid, activity.timestamp(), activity.duration()); // clang-format on } @@ -202,9 +207,8 @@ void ChromeTraceLogger::handleGenericInstantEvent( op.timestamp(), op.getMetadata()); } -void ChromeTraceLogger::handleCpuActivity( - const libkineto::GenericTraceActivity& op, - const TraceSpan& span) { +void ChromeTraceLogger::handleGenericActivity( + const libkineto::GenericTraceActivity& op) { if (!traceOf_) { return; } @@ -219,48 +223,29 @@ void ChromeTraceLogger::handleCpuActivity( if (op_metadata.find_first_not_of(" \t\n") != std::string::npos) { separator = ","; } + const std::string tid = + op.type() == ActivityType::GPU_USER_ANNOTATION ? + fmt::format("stream {} annotations", op.resourceId()) : + fmt::format("{}", op.resourceId()); + // clang-format off traceOf_ << fmt::format(R"JSON( {{ - "ph": "X", "cat": "Operator", {}, + "ph": "X", "cat": "{}", {}, "args": {{ - "Device": {}, "External id": {}, - "Trace name": "{}", "Trace iteration": {} {} - {} + "External id": {}, + "Trace name": "{}", "Trace iteration": {}{} + {} }} }},)JSON", - traceActivityJson(op, ""), + toString(op.type()), traceActivityJson(op, tid), // args - op.device, op.correlation, - span.name, span.iteration, - separator, + op.id, + op.traceSpan()->name, op.traceSpan()->iteration, separator, op_metadata); // clang-format on } -void ChromeTraceLogger::handleGenericActivity( - const GenericTraceActivity& op) { - if (!traceOf_) { - return; - } - - // FIXME: Make cat and tid customizable - // clang-format off - traceOf_ << fmt::format(R"JSON( - {{ - "ph": "X", "cat": "User", "name": "{}", - "pid": {}, "tid": "stream {} user", - "ts": {}, "dur": {}, - "args": {{ - "External id": {} - }} - }},)JSON", - op.name(), op.deviceId(), op.resourceId(), - op.timestamp(), op.duration(), - op.correlationId()); - // clang-format on -} - #ifdef HAS_CUPTI void ChromeTraceLogger::handleLinkStart(const RuntimeActivity& s) { if (!traceOf_) { @@ -309,7 +294,7 @@ void ChromeTraceLogger::handleRuntimeActivity( "external id": {}, "external ts": {} }} }},)JSON", - traceActivityJson(activity, ""), + traceActivityJson(activity, fmt::format("{}", activity.resourceId())), // args cbid, activity.raw().correlationId, ext.correlationId(), ext.timestamp()); @@ -328,6 +313,10 @@ void ChromeTraceLogger::handleRuntimeActivity( } } +static std::string streamName(const TraceActivity& act) { + return fmt::format("stream {}", act.resourceId()); +} + // GPU side kernel activity void ChromeTraceLogger::handleGpuActivity( const GpuActivity& activity) { @@ -375,7 +364,7 @@ void ChromeTraceLogger::handleGpuActivity( "est. achieved occupancy %": {} }} }},)JSON", - traceActivityJson(activity, "stream "), + traceActivityJson(activity, streamName(activity)), // args us(kernel->queued), kernel->deviceId, kernel->contextId, kernel->streamId, kernel->correlationId, ext.correlationId(), @@ -414,7 +403,7 @@ void ChromeTraceLogger::handleGpuActivity( "bytes": {}, "memory bandwidth (GB/s)": {} }} }},)JSON", - traceActivityJson(activity, "stream "), + traceActivityJson(activity, streamName(activity)), // args memcpy.deviceId, memcpy.contextId, memcpy.streamId, memcpy.correlationId, ext.correlationId(), @@ -443,7 +432,7 @@ void ChromeTraceLogger::handleGpuActivity( "bytes": {}, "memory bandwidth (GB/s)": {} }} }},)JSON", - traceActivityJson(activity, "stream "), + traceActivityJson(activity, streamName(activity)), // args memcpy.srcDeviceId, memcpy.deviceId, memcpy.dstDeviceId, memcpy.srcContextId, memcpy.contextId, memcpy.dstContextId, @@ -471,7 +460,7 @@ void ChromeTraceLogger::handleGpuActivity( "bytes": {}, "memory bandwidth (GB/s)": {} }} }},)JSON", - traceActivityJson(activity, "stream "), + traceActivityJson(activity, streamName(activity)), // args memset.deviceId, memset.contextId, memset.streamId, memset.correlationId, ext.correlationId(), diff --git a/libkineto/src/output_json.h b/libkineto/src/output_json.h index a675321cd..a0f2e6ae0 100644 --- a/libkineto/src/output_json.h +++ b/libkineto/src/output_json.h @@ -41,14 +41,7 @@ class ChromeTraceLogger : public libkineto::ActivityLogger { void handleTraceSpan(const TraceSpan& span) override; - void handleIterationStart(const TraceSpan& span) override; - - void handleCpuActivity( - const libkineto::GenericTraceActivity& activity, - const TraceSpan& span) override; - - void handleGenericActivity( - const GenericTraceActivity& activity) override; + void handleGenericActivity(const GenericTraceActivity& activity) override; #ifdef HAS_CUPTI void handleRuntimeActivity( @@ -80,6 +73,8 @@ class ChromeTraceLogger : public libkineto::ActivityLogger { void handleLinkEnd(const TraceActivity& e); #endif // HAS_CUPTI + void addIterationMarker(const TraceSpan& span); + void openTraceFile(); void handleGenericInstantEvent(const GenericTraceActivity& op); diff --git a/libkineto/src/output_membuf.h b/libkineto/src/output_membuf.h index 541c0c9df..5c7ed36cd 100644 --- a/libkineto/src/output_membuf.h +++ b/libkineto/src/output_membuf.h @@ -47,22 +47,10 @@ class MemoryTraceLogger : public ActivityLogger { } void handleTraceSpan(const TraceSpan& span) override { - traceSpanList_.push_back(span); + // Handled separately } - void handleIterationStart(const TraceSpan& span) override { - iterationList_.push_back(span); - } - - void handleCpuActivity( - const libkineto::GenericTraceActivity& activity, - const TraceSpan& span) override { - activities_.push_back( - std::make_unique(activity, span)); - } - - void handleGenericActivity( - const GenericTraceActivity& activity) override { + void handleGenericActivity(const GenericTraceActivity& activity) override { activities_.push_back( std::make_unique(activity)); } @@ -115,11 +103,8 @@ class MemoryTraceLogger : public ActivityLogger { for (auto& p : threadInfoList_) { logger.handleThreadInfo(p.first, p.second); } - for (auto& span : traceSpanList_) { - logger.handleTraceSpan(span); - } - for (auto& it : iterationList_) { - logger.handleIterationStart(it); + for (auto& cpu_trace_buffer : buffers_->cpu) { + logger.handleTraceSpan(cpu_trace_buffer->span); } // Hold on to the buffers logger.finalizeTrace(*config_, nullptr, endTime_); @@ -127,35 +112,11 @@ class MemoryTraceLogger : public ActivityLogger { private: - struct CpuActivityDecorator : public libkineto::TraceActivity { - CpuActivityDecorator( - const libkineto::GenericTraceActivity& activity, - const TraceSpan& span) - : wrappee_(activity), span_(span) {} - int64_t deviceId() const override {return wrappee_.deviceId();} - int64_t resourceId() const override {return wrappee_.resourceId();} - int64_t timestamp() const override {return wrappee_.timestamp();} - int64_t duration() const override {return wrappee_.duration();} - int64_t correlationId() const override {return wrappee_.correlationId();} - ActivityType type() const override {return wrappee_.type();} - const std::string name() const override {return wrappee_.name();} - const TraceActivity* linkedActivity() const override { - return wrappee_.linkedActivity(); - } - void log(ActivityLogger& logger) const override { - logger.handleCpuActivity(wrappee_, span_); - } - const libkineto::GenericTraceActivity& wrappee_; - const TraceSpan span_; - }; - std::unique_ptr config_; // Optimization: Remove unique_ptr by keeping separate vector per type std::vector> activities_; std::vector> processInfoList_; std::vector> threadInfoList_; - std::vector traceSpanList_; - std::vector iterationList_; std::unique_ptr buffers_; std::unordered_map metadata_; int64_t endTime_{0}; diff --git a/libkineto/test/ActivityProfilerTest.cpp b/libkineto/test/ActivityProfilerTest.cpp index 4bab8d679..1ea8608d8 100644 --- a/libkineto/test/ActivityProfilerTest.cpp +++ b/libkineto/test/ActivityProfilerTest.cpp @@ -38,19 +38,16 @@ using namespace KINETO_NAMESPACE; // Provides ability to easily create a few test CPU-side ops struct MockCpuActivityBuffer : public CpuTraceBuffer { MockCpuActivityBuffer(int64_t startTime, int64_t endTime) { - span = {startTime, endTime, 0, 1, "Test trace", ""}; + span = TraceSpan(startTime, endTime,"Test trace"); gpuOpCount = 0; } void addOp(std::string name, int64_t startTime, int64_t endTime, int64_t correlation) { - GenericTraceActivity op; - op.activityName = name; - op.activityType = ActivityType::CPU_OP; + GenericTraceActivity op(span, ActivityType::CPU_OP, name); op.startTime = startTime; op.endTime = endTime; - op.device = 0; - op.sysThreadId = systemThreadId(); - op.correlation = correlation; + op.resource = systemThreadId(); + op.id = correlation; activities.push_back(std::move(op)); span.opCount++; } @@ -164,6 +161,7 @@ class ActivityProfilerTest : public ::testing::Test { profiler_ = std::make_unique( cuptiActivities_, /*cpu only*/ false); cfg_ = std::make_unique(); + cfg_->validate(); loggerFactory.addProtocol("file", [](const std::string& url) { return std::unique_ptr(new ChromeTraceLogger(url)); }); diff --git a/libkineto/test/ConfigTest.cpp b/libkineto/test/ConfigTest.cpp index 77b6da367..7c325f14b 100644 --- a/libkineto/test/ConfigTest.cpp +++ b/libkineto/test/ConfigTest.cpp @@ -72,13 +72,9 @@ TEST(ParseTest, Format) { TEST(ParseTest, DefaultActivityTypes) { Config cfg; cfg.validate(); + auto all_activities = activityTypes(); EXPECT_EQ(cfg.selectedActivityTypes(), - std::set({ActivityType::CPU_OP, - ActivityType::GPU_MEMCPY, - ActivityType::GPU_MEMSET, - ActivityType::CONCURRENT_KERNEL, - ActivityType::EXTERNAL_CORRELATION, - ActivityType::CUDA_RUNTIME})); + std::set(all_activities.begin(), all_activities.end())); } TEST(ParseTest, ActivityTypes) { @@ -89,6 +85,9 @@ TEST(ParseTest, ActivityTypes) { EXPECT_EQ(cfg.selectedActivityTypes(), std::set({ActivityType::CPU_OP, + ActivityType::CPU_INSTANT_EVENT, + ActivityType::USER_ANNOTATION, + ActivityType::GPU_USER_ANNOTATION, ActivityType::GPU_MEMCPY, ActivityType::GPU_MEMSET, ActivityType::CONCURRENT_KERNEL, @@ -96,7 +95,7 @@ TEST(ParseTest, ActivityTypes) { ActivityType::CUDA_RUNTIME})); Config cfg2; - EXPECT_TRUE(cfg2.parse("ACTIVITY_TYPES=gpu_memcpy,gpu_MeMsEt,concurrent_kernel")); + EXPECT_TRUE(cfg2.parse("ACTIVITY_TYPES=gpu_memcpy,gpu_MeMsEt,kernel")); EXPECT_EQ(cfg2.selectedActivityTypes(), std::set({ActivityType::GPU_MEMCPY, ActivityType::GPU_MEMSET,