Skip to content

Commit

Permalink
Safely teardown CUPTI (#686)
Browse files Browse the repository at this point in the history
Summary:
Pull Request resolved: #686

The cuptiFinalize() call is dangerous as the calling thread will race with other  threads calling CUPTI APIs. This resulted in a SIGSEGV during cuptiActivityFlushAll. This patch will teardown CUPTI appropriately after a profiling session via the CUPTI API exit callsite.
- Teardown at the end of processing to ensure the Activity Buffers are forcefully flushed, and buffers have transferred ownership.
- Uses the recommendation from CUPTI on teardown through callback APIs' exit callsite: https://docs.nvidia.com/cupti/r_main.html#r_dynamic_detach
  - Using mutex and cond var to synchronize finalize for runtime and driver domains.
  - Follows the flow: flush forcefully, all runtime domain apis hits callback, cuptiFinalize is called, cv notifies finalize is complete.
  - Teardown is on a separate thread because PyTorch Profiler forces CUPTI Runtime and Driver calls to be synchronous. Therefore we need a teardown thread to wait on the CUPTI callback api's cuptiFinalize.
- Added tracingEnabled flag to early return if already enabled or disabled.
- Renamed clearActivities to clearCuptiActivities for readability and align with existing API names. This is used to flush buffers after warmup.
- Added logs for CuptiActivityApi to monitor Cupti teardown start / end.

The CuptiActivityProfiler will follow this flow:
- Waiting for request
- Request comes in, enables cupti activities (attaches to cupti), starts warmup
- When warmup is over, clear (ie. flush) cupti activities
- Collection starts and finishes
- In Post Processing, activity buffers are processed (force flush here)
- After Post Processing, trace data reset will trigger cupti flush and finalize. (effectively detaching from cupti)

Test Plan: CI Tests

Reviewed By: chaekit

Differential Revision: D40964557

Pulled By: aaronenyeshi

fbshipit-source-id: 066ccddecdf31fc537716450f97dbe1ea3d0c7a1
  • Loading branch information
aaronenyeshi authored and facebook-github-bot committed Nov 15, 2022
1 parent 14b9e08 commit c7c76c3
Show file tree
Hide file tree
Showing 5 changed files with 91 additions and 15 deletions.
61 changes: 56 additions & 5 deletions libkineto/src/CuptiActivityApi.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@

#include <assert.h>
#include <chrono>
#include <mutex>

#include "cupti_call.h"
#include "Logger.h"
Expand Down Expand Up @@ -108,7 +109,6 @@ void CuptiActivityApi::forceLoadCupti() {
#endif
}


void CuptiActivityApi::preConfigureCUPTI() {
#ifdef HAS_CUPTI
if (!isGpuAvailable()) {
Expand Down Expand Up @@ -206,7 +206,7 @@ const std::pair<int, int> CuptiActivityApi::processActivities(
return res;
}

void CuptiActivityApi::clearActivities() {
void CuptiActivityApi::clearCuptiActivities() {
{
std::lock_guard<std::mutex> guard(mutex_);
if (allocatedGpuTraceBuffers_.empty()) {
Expand Down Expand Up @@ -278,8 +278,8 @@ void CuptiActivityApi::enableCuptiActivities(
#ifdef HAS_CUPTI
static bool registered = false;
if (!registered) {
CUPTI_CALL(
cuptiActivityRegisterCallbacks(bufferRequestedTrampoline, bufferCompletedTrampoline));
CUPTI_CALL(
cuptiActivityRegisterCallbacks(bufferRequestedTrampoline, bufferCompletedTrampoline));
}

externalCorrelationEnabled_ = false;
Expand All @@ -304,6 +304,8 @@ void CuptiActivityApi::enableCuptiActivities(
CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_OVERHEAD));
}
}

tracingEnabled_ = 1;
#endif

// Explicitly enabled, so reset this flag if set
Expand Down Expand Up @@ -334,9 +336,58 @@ void CuptiActivityApi::disableCuptiActivities(
}
}
externalCorrelationEnabled_ = false;
#endif
}

void CuptiActivityApi::teardownCuptiContext() {
#ifdef HAS_CUPTI
if (!tracingEnabled_) {
return;
}
if (getenv("TEARDOWN_CUPTI") != nullptr) {
CUPTI_CALL(cuptiFinalize());
LOG(INFO) << "teardownCupti starting";

// PyTorch Profiler is synchronous, so teardown needs to be run async in this thread.
std::thread teardownThread([&] {
auto cbapi_ = CuptiCallbackApi::singleton();
if (!cbapi_->initSuccess()) {
cbapi_->initCallbackApi();
if (!cbapi_->initSuccess()) {
LOG(WARNING) << "CUPTI Callback failed to init, skipping teardown";
return;
}
}
// Subscribe callbacks to call cuptiFinalize in the exit callback of these APIs
bool status = cbapi_->enableCallbackDomain(CUPTI_CB_DOMAIN_RUNTIME_API);
status = status && cbapi_->enableCallbackDomain(CUPTI_CB_DOMAIN_DRIVER_API);
if (!status) {
LOG(WARNING) << "CUPTI Callback failed to enable for domain, skipping teardown";
return;
}

// Force Flush before finalize
CUPTI_CALL(cuptiActivityFlushAll(CUPTI_ACTIVITY_FLAG_FLUSH_FORCED));

teardownCupti_ = 1;
std::unique_lock lck(finalizeMutex_);
finalizeCond_.wait(lck, [&]{return teardownCupti_ == 0;});
lck.unlock();
LOG(INFO) << "teardownCupti complete";

teardownCupti_ = 0;
tracingEnabled_ = 0;

// Re-enable callbacks from the past.
cbapi_->initCallbackApi();
cbapi_->reenableCallbacks();
status = cbapi_->disableCallbackDomain(CUPTI_CB_DOMAIN_RUNTIME_API);
status = status && cbapi_->disableCallbackDomain(CUPTI_CB_DOMAIN_DRIVER_API);
if (!status) {
LOG(WARNING) << "CUPTI Callback failed to disable for domain";
}
cbapi_.reset();
});
teardownThread.detach();
}
#endif
}
Expand Down
24 changes: 17 additions & 7 deletions libkineto/src/CuptiActivityApi.h
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
#pragma once

#include <atomic>
#include <condition_variable>
#include <functional>
#include <list>
#include <memory>
Expand All @@ -19,8 +20,11 @@
#include <cupti.h>
#endif

// TODO(T90238193)
// @lint-ignore-every CLANGTIDY facebook-hte-RelativeInclude
#include "ActivityType.h"
#include "CuptiActivityBuffer.h"
#include "CuptiCallbackApi.h"


namespace KINETO_NAMESPACE {
Expand All @@ -37,6 +41,10 @@ class CuptiActivityApi {
Default,
User
};
// Control Variables shared with CuptiCallbackApi for teardown
std::atomic<uint32_t> teardownCupti_{0};
std::mutex finalizeMutex_;
std::condition_variable finalizeCond_;

CuptiActivityApi() = default;
CuptiActivityApi(const CuptiActivityApi&) = delete;
Expand All @@ -53,7 +61,8 @@ class CuptiActivityApi {
const std::set<ActivityType>& selected_activities);
void disableCuptiActivities(
const std::set<ActivityType>& selected_activities);
void clearActivities();
void clearCuptiActivities();
void teardownCuptiContext();

virtual std::unique_ptr<CuptiActivityBufferMap> activityBuffers();

Expand All @@ -74,6 +83,13 @@ class CuptiActivityApi {
static void preConfigureCUPTI();

private:
int maxGpuBufferCount_{0};
CuptiActivityBufferMap allocatedGpuTraceBuffers_;
std::unique_ptr<CuptiActivityBufferMap> readyGpuTraceBuffers_;
std::mutex mutex_;
std::atomic<uint32_t> tracingEnabled_{0};
bool externalCorrelationEnabled_{false};

#ifdef HAS_CUPTI
int processActivitiesForBuffer(
uint8_t* buf,
Expand All @@ -89,12 +105,6 @@ class CuptiActivityApi {
size_t validSize);
#endif // HAS_CUPTI

int maxGpuBufferCount_{0};
CuptiActivityBufferMap allocatedGpuTraceBuffers_;
std::unique_ptr<CuptiActivityBufferMap> readyGpuTraceBuffers_;
std::mutex mutex_;
bool externalCorrelationEnabled_{false};

protected:
#ifdef HAS_CUPTI
void bufferRequested(uint8_t** buffer, size_t* size, size_t* maxNumRecords);
Expand Down
5 changes: 3 additions & 2 deletions libkineto/src/CuptiActivityProfiler.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -759,7 +759,7 @@ const time_point<system_clock> CuptiActivityProfiler::performRunLoopStep(
if (!cpuOnly_ && currentIter < 0 &&
(derivedConfig_->isProfilingByIteration() ||
nextWakeupTime < derivedConfig_->profileStartTime())) {
cupti_.clearActivities();
cupti_.clearCuptiActivities();
}

if (cupti_.stopCollection) {
Expand Down Expand Up @@ -931,7 +931,8 @@ void CuptiActivityProfiler::finalizeTrace(const Config& config, ActivityLogger&
void CuptiActivityProfiler::resetTraceData() {
#if defined(HAS_CUPTI) || defined(HAS_ROCTRACER)
if (!cpuOnly_) {
cupti_.clearActivities();
cupti_.clearCuptiActivities();
cupti_.teardownCuptiContext();
}
#endif // HAS_CUPTI || HAS_ROCTRACER
activityMap_.clear();
Expand Down
14 changes: 14 additions & 0 deletions libkineto/src/CuptiCallbackApi.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,10 @@
* LICENSE file in the root directory of this source tree.
*/

// TODO(T90238193)
// @lint-ignore-every CLANGTIDY facebook-hte-RelativeInclude
#include "CuptiCallbackApi.h"
#include "CuptiActivityApi.h"

#include <assert.h>
#include <chrono>
Expand Down Expand Up @@ -97,6 +100,17 @@ void CuptiCallbackApi::__callback_switchboard(
default:
break;
}
// This is required to teardown cupti after profiling to prevent QPS slowdown.
if (CuptiActivityApi::singleton().teardownCupti_) {
if (cbInfo->callbackSite == CUPTI_API_EXIT) {
// Teardown CUPTI calling cuptiFinalize()
CUPTI_CALL(cuptiFinalize());
initSuccess_ = false;
CuptiActivityApi::singleton().teardownCupti_ = 0;
CuptiActivityApi::singleton().finalizeCond_.notify_all();
return;
}
}
break;

case CUPTI_CB_DOMAIN_RESOURCE:
Expand Down
2 changes: 1 addition & 1 deletion libkineto/src/init.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -126,7 +126,7 @@ void libkineto_init(bool cpuOnly, bool logOnError) {
domain, CuptiCallbackApi::RESOURCE_CONTEXT_CREATED);
status = status && cbapi->enableCallback(
domain, CuptiCallbackApi::RESOURCE_CONTEXT_DESTROYED);
}
}
}

if (!cbapi->initSuccess() || !status) {
Expand Down

0 comments on commit c7c76c3

Please sign in to comment.