diff --git a/lib/Backends/NNPI/NNPIDeviceManager.cpp b/lib/Backends/NNPI/NNPIDeviceManager.cpp index 91cdbc30d6..4922f12b96 100644 --- a/lib/Backends/NNPI/NNPIDeviceManager.cpp +++ b/lib/Backends/NNPI/NNPIDeviceManager.cpp @@ -324,7 +324,8 @@ Error NNPIDeviceManager::startDeviceTrace(TraceContext *traceContext) { traceContext, device_, true /* Software traces are always enabled. */, deviceOptions_->hardwareTraces, deviceOptions_->softwareTracesMaxBuffer, - deviceOptions_->hardwareTracesMaxBuffer)) { + deviceOptions_->hardwareTracesMaxBuffer, + deviceOptions_->rawTracesDumpPath)) { return MAKE_ERR("Failed to start NNPI device trace."); } return Error::success(); diff --git a/lib/Backends/NNPI/NNPIMLTraceWrapper.cpp b/lib/Backends/NNPI/NNPIMLTraceWrapper.cpp index 10b4f6d544..1da6ffdc95 100644 --- a/lib/Backends/NNPI/NNPIMLTraceWrapper.cpp +++ b/lib/Backends/NNPI/NNPIMLTraceWrapper.cpp @@ -28,6 +28,16 @@ #include #include +/// Dump event entry to stream. +static void dumpTraceEntry(const NNPITraceEntry &entry, + std::ostream &outstream) { + outstream << "HTime:" << entry.hostTime << " ETime:" << entry.engineTime; + for (const auto ¶m : entry.params) { + outstream << " " << param.first << ":" << param.second; + } + outstream << std::endl; +} + /// Set default Hardware buffer size to 3GB. #define MAX_HW_TRACE_BUFFER_SIZE_MB (1024 * 3) /// Set default Software buffer size to 400MB. @@ -58,20 +68,21 @@ static eIceCapsSwTraceEvent swEventTypes[] = { eIceCapsSwTraceEvent::ICE_CAPS_SW_EVENT_USER_DATA}; NNPITraceContext::NNPITraceContext(unsigned devID) - : capsSession_(0), devID_(devID), devIDSet_(false) {} + : capsSession_(0), devID_(devID), devIDSet_(false), dumpRawEventsPath_() {} NNPITraceContext::~NNPITraceContext() { destroyInternalContext(); } bool NNPITraceContext::startCapture(NNPIDeviceContext deviceContext, bool swTraces, bool hwTraces, uint32_t softwareBufferSizeMB, - uint32_t hardwareBufferSizeMB) { + uint32_t hardwareBufferSizeMB, + const std::string &dumpRawEventsPath) { if (!createInternalContext(swTraces, hwTraces, softwareBufferSizeMB, hardwareBufferSizeMB)) { LOG(WARNING) << "nnpi_trace: Failed to create trace device context."; return false; } - + dumpRawEventsPath_ = dumpRawEventsPath; nnpimlStatus mlStatus = nnpiIceCapsStart(capsSession_); if (mlStatus != NNPIML_SUCCESS) { LOG(WARNING) << "nnpi_trace: Failed to start trace, err=" << mlStatus; @@ -127,6 +138,14 @@ bool NNPITraceContext::readTraceOutput() { return false; } + std::ofstream outputFile; + if (!dumpRawEventsPath_.empty()) { + outputFile.open(dumpRawEventsPath_); + if (!outputFile.is_open()) { + LOG(WARNING) << "Failed to open fail for raw events dump"; + } + } + bool started = false; uint64_t glowStart = 0; uint64_t glowEnd = 0; @@ -149,6 +168,10 @@ bool NNPITraceContext::readTraceOutput() { // Set parameters. if (entry.event_name) { traceEntry.params["name"] = std::string(entry.event_name); + if (traceEntry.params["name"] == "MASTER_FIRMWARE") { + // Skip MASTER_FIRMWARE events. + continue; + } } else { traceEntry.params["name"] = "NA"; } @@ -198,9 +221,20 @@ bool NNPITraceContext::readTraceOutput() { params << param.name << ":" << param.value << ", "; } + traceEntry.params["org_state"] = state; if (state == "created" || state == "queued" || state == "req" || - state == "add") { + state == "cbnwc" || state == "po" || state == "a" || + state == "destroyed") { state = "q"; + } else if (traceEntry.params["name"] == "icedrvEventGeneration") { + // Exception for Event Generation. + if (state == "s") { + state = "q"; + } else if (state == "add" || state == "start") { + state = "s"; + } else if (state == "c") { + state = "c"; + } } else if (state == "executed" || state == "cbs" || state == "start") { state = "s"; } else if (state == "completed" || state == "cbc") { @@ -208,6 +242,9 @@ bool NNPITraceContext::readTraceOutput() { } traceEntry.params["state"] = state; entries_.push_back(traceEntry); + if (outputFile.is_open()) { + dumpTraceEntry(traceEntry, outputFile); + } if (traceEntry.params["name"] == "user_data") { if (!started && traceEntry.params["key"] == "BG") { glowStart = std::stol(traceEntry.params["user_data"]); diff --git a/lib/Backends/NNPI/NNPIMLTraceWrapper.h b/lib/Backends/NNPI/NNPIMLTraceWrapper.h index fdc01d8aa5..558153ee9b 100644 --- a/lib/Backends/NNPI/NNPIMLTraceWrapper.h +++ b/lib/Backends/NNPI/NNPIMLTraceWrapper.h @@ -35,7 +35,8 @@ class NNPITraceContext { /// Start capturing traces from the HW device. bool startCapture(NNPIDeviceContext deviceContext, bool swTraces, bool hwTraces, uint32_t softwareBufferSizeMB, - uint32_t hardwareBufferSizeMB); + uint32_t hardwareBufferSizeMB, + const std::string &dumpRawEventsPath); /// Start capturing. bool stopCapture(NNPIDeviceContext deviceContext) const; /// Load traces (valid only after stopCapture()). @@ -64,6 +65,7 @@ class NNPITraceContext { unsigned devID_{0}; bool devIDSet_{false}; std::vector entries_; + std::string dumpRawEventsPath_; }; #endif // NNPI_NNPITRACING_ML_WRAPPER_H diff --git a/lib/Backends/NNPI/NNPIOptions.h b/lib/Backends/NNPI/NNPIOptions.h index 2a83341d2f..ca427ecb6d 100644 --- a/lib/Backends/NNPI/NNPIOptions.h +++ b/lib/Backends/NNPI/NNPIOptions.h @@ -400,6 +400,12 @@ class NNPIDeviceOptions : public NNPIOptions { "Set the max internal buffer size for device hardware traces." "Enabled only when hardwareTraces=1 (use 0 for hard coded default).", "NNPI_HW_TRACES_BUFFER_SIZE", "0"); + /// Path to dump raw trace events from NNP-I. + DECLARE_NNPI_OPTION( + rawTracesDumpPath, std::string, "RawTracesDumpPath", + "Set a path (including a file name) to dump raw device events into a " + "file. If empty, raw events are not dumped.", + "NNPI_DEVICE_TRACES_DUMP_PATH", ""); /// Override the max NNPI device memory. DECLARE_NNPI_OPTION( deviceMemory, unsigned, "DeviceMemory", @@ -451,6 +457,7 @@ class NNPIDeviceOptions : public NNPIOptions { INIT_NNPI_OPTIONS(hardwareTraces, parameters); INIT_NNPI_OPTIONS(softwareTracesMaxBuffer, parameters); INIT_NNPI_OPTIONS(hardwareTracesMaxBuffer, parameters); + INIT_NNPI_OPTIONS(rawTracesDumpPath, parameters); INIT_NNPI_OPTIONS(deviceMemory, parameters); INIT_NNPI_OPTIONS(dumpIOtoFiles, parameters); INIT_NNPI_OPTIONS(avxType, parameters); diff --git a/lib/Backends/NNPI/NNPITracing.cpp b/lib/Backends/NNPI/NNPITracing.cpp index c7b5a83392..9eb3cae133 100644 --- a/lib/Backends/NNPI/NNPITracing.cpp +++ b/lib/Backends/NNPI/NNPITracing.cpp @@ -32,7 +32,8 @@ NNPIDeviceTracing::NNPIDeviceTracing(unsigned deviceId) : deviceId_(deviceId) { bool NNPIDeviceTracing::start(TraceContext *traceContext, NNPIDeviceContext deviceContext, bool swTraces, bool hwTraces, uint32_t softwareBufferSizeMB, - uint32_t hardwareBufferSizeMB) { + uint32_t hardwareBufferSizeMB, + const std::string &dumpRawEventsPath) { if (!traceContext || !traceContext->shouldLog(TraceEvent::TraceLevel::OPERATOR)) { return false; @@ -43,7 +44,8 @@ bool NNPIDeviceTracing::start(TraceContext *traceContext, } bool isFirstToStart = NNPIDeviceTracing::isFirstToChangeCaptureStart(true); if (!traceCtx_->startCapture(deviceContext, swTraces, hwTraces, - softwareBufferSizeMB, hardwareBufferSizeMB)) { + softwareBufferSizeMB, hardwareBufferSizeMB, + dumpRawEventsPath)) { LOG(WARNING) << "Failed to start trace capture for device " << deviceId_ << " is first = " << (isFirstToStart); return false; @@ -73,9 +75,15 @@ std::string NNPIDeviceTracing::getEntryName(NNPITraceEntry &entry) { } } auto params = entry.params; + if (entry.params.count("context_id") > 0) { + name << " CTX 0x" << std::hex << std::stol(entry.params["context_id"]); + } if (entry.params.count("ice_id") > 0) { name << " ICE_" << entry.params["ice_id"]; } + if (entry.params.count("core_id") > 0) { + name << " CORE_" << entry.params["core_id"]; + } if (entry.params.count("network_id") > 0) { name << " Net " << entry.params["network_id"]; } @@ -83,9 +91,6 @@ std::string NNPIDeviceTracing::getEntryName(NNPITraceEntry &entry) { entry.params["network_name"] != "NA") { name << " NetName " << entry.params["network_name"]; } - if (entry.params.count("context_id") > 0) { - name << " CTX 0x" << std::hex << std::stol(entry.params["context_id"]); - } if (entry.params.count("subNetId") > 0) { name << " Subnet " << entry.params["subNetId"]; } @@ -134,7 +139,9 @@ int NNPIDeviceTracing::getAffinityID(NNPITraceEntry &entry, std::string name, } // Use the op name. affinityNameStuct << " " << name.substr(0, name.find(' ')); - if (entry.params.count("state") && entry.params["state"] == "q") { + if (entry.params.count("org_state") && + (entry.params["org_state"] == "q" || + entry.params["org_state"] == "queued")) { affinityNameStuct << " Queue"; } @@ -165,6 +172,11 @@ bool NNPIDeviceTracing::addTrace( if (entry.params.count("state") <= 0) { return false; } + + std::string eventKey = name; + if (entry.params.count("event_key") > 0) { + eventKey += (std::string(" : ") + entry.params["event_key"]); + } std::string state = entry.params["state"]; // Calculate affinity - use the trace thread id to make sections in the @@ -177,30 +189,38 @@ bool NNPIDeviceTracing::addTrace( } // Add events. if (state == "q") { - name += "-Queue"; traceContext->logTraceEvent(name, TraceLevel::OPERATOR, TraceEvent::InstantType, entry.hostTime, entry.params, affinId); - } else if (state == "s" && inflight.count(name) <= 0) { - inflight[name] = entry; - } else if (state == "c" && inflight.count(name) > 0) { + } else if (state == "s" && inflight.count(eventKey) <= 0) { + inflight[eventKey] = entry; + } else if (state == "c" && inflight.count(eventKey) > 0) { // Add only complate events. - if (entry.hostTime >= inflight[name].hostTime) { + if (entry.hostTime >= inflight[eventKey].hostTime) { traceContext->logTraceEvent( name, TraceLevel::OPERATOR, TraceEvent::BeginType, - inflight[name].hostTime, inflight[name].params, affinId); + inflight[eventKey].hostTime, inflight[eventKey].params, affinId); traceContext->logTraceEvent(name, TraceLevel::OPERATOR, TraceEvent::EndType, entry.hostTime, entry.params, affinId); } else { - LOG(WARNING) << "Found incomplete trace event " << name << ": start time " - << inflight[name].hostTime << " end time " << entry.hostTime; + LOG(WARNING) << "[INCOMPLETE EVENT] Found incomplete trace event " + << eventKey << ": start time " << inflight[eventKey].hostTime + << " end time " << entry.hostTime; } - inflight.erase(name); + inflight.erase(eventKey); } else if (state == "po") { traceContext->logTraceEvent(name, TraceLevel::OPERATOR, TraceEvent::InstantType, entry.hostTime, entry.params, affinId); + } else if (entry.params.count("engine") > 0 && + entry.params["engine"] != "HW") { + // Notifies only software events that are incomplete since HW events are + // much more likely to be lost. + LOG(WARNING) << "[INCOMPLETE EVENT] " + << " event key:" << eventKey << " state:" << state + << " inflight: " << (inflight.count(eventKey) > 0) + << " time: " << entry.hostTime; } return true; @@ -228,6 +248,19 @@ bool NNPIDeviceTracing::stopAndUpdate(TraceContext *traceContext, for (auto entry : traceCtx_->getEntries()) { addTrace(entry, inflight, traceContext); } + if (inflight.size() > 0) { + LOG(WARNING) << "[INCOMPLETE EVENT] " << inflight.size() + << " events not logged (still in flight/incomplate)"; + for (const auto &event : inflight) { + if (event.second.params.at("engine") != "HW") { + // Notifies only software events that are incomplete since HW events are + // much more likely to be lost. + LOG(WARNING) << "[INCOMPLETE EVENT] " << event.first << " " + << event.second.params.at("name") + << " state:" << event.second.params.at("state"); + } + } + } started_.clear(); return true; } diff --git a/lib/Backends/NNPI/NNPITracing.h b/lib/Backends/NNPI/NNPITracing.h index 3e24227443..8d4afc9dc4 100644 --- a/lib/Backends/NNPI/NNPITracing.h +++ b/lib/Backends/NNPI/NNPITracing.h @@ -63,7 +63,8 @@ class NNPIDeviceTracing { /// Start recording events. bool start(TraceContext *traceContext, NNPIDeviceContext deviceContext, bool swTraces, bool hwTraces, uint32_t softwareBufferSizeMB, - uint32_t hardwareBufferSizeMB); + uint32_t hardwareBufferSizeMB, + const std::string &dumpRawEventsPath); /// Stop recording, read and update trace context. bool stopAndUpdate(TraceContext *traceContext, NNPIDeviceContext deviceContext); diff --git a/lib/Backends/NNPI/tests/NNPIOperatorTest.cpp b/lib/Backends/NNPI/tests/NNPIOperatorTest.cpp index a2ed0ef583..1d1e92f00d 100644 --- a/lib/Backends/NNPI/tests/NNPIOperatorTest.cpp +++ b/lib/Backends/NNPI/tests/NNPIOperatorTest.cpp @@ -96,8 +96,6 @@ struct BlacklistInitializer { {"LayerNorm_BFloat16/0", TestBlacklist::AnyDeviceHWEngine}, {"LengthsSum/0", TestBlacklist::AnyDeviceAnyEngine}, {"LengthsToRanges/0", TestBlacklist::AnyDeviceAnyEngine}, - {"ModuloInt32NoSignFollow/0", TestBlacklist::AnyDeviceAnyEngine}, - {"ModuloInt32SignFollow/0", TestBlacklist::AnyDeviceAnyEngine}, {"ModuloInt64NoSignFollow/0", TestBlacklist::AnyDeviceAnyEngine}, {"ModuloInt64SignFollow/0", TestBlacklist::AnyDeviceAnyEngine}, {"mul_int32/0", TestBlacklist::AnyDeviceAnyEngine}, @@ -191,7 +189,6 @@ struct BlacklistInitializer { {"ArithMul_int64_t/0", TestBlacklist::AnyDeviceAnyEngine}, {"ArithSub_int32_t/0", TestBlacklist::AnyDeviceHWEngine}, {"ArithSub_int64_t/0", TestBlacklist::AnyDeviceHWEngine}, - {"BatchedGather/0", TestBlacklist::AnyDeviceHWEngine}, {"BatchNorm_Float/0", TestBlacklist::AnyDeviceHWEngine}, {"batchedReduceMin_Int32/0", TestBlacklist::AnyDeviceHWEngine}, {"batchedReduceMin_Int64/0", TestBlacklist::AnyDeviceHWEngine}, @@ -278,8 +275,6 @@ struct BlacklistInitializer { {"convTransposeConvolutionCompareSimpleK5S1P2I3/0", TestBlacklist::AnyDeviceAnyEngine}, {"ChannelwiseQuantizedConv3D/0", TestBlacklist::AnyDeviceAnyEngine}, - {"ChannelwiseQuantizedConv2D_NonZero_FloatBias/0", - TestBlacklist::AnyDeviceSWEngine}, {"ChannelwiseQuantizedConv2D_NonZero_QuantizedBias/0", TestBlacklist::AnyDeviceAnyEngine}, {"ChannelwiseQuantizedConv2D_Int8_BiasInt8_TFT/0", @@ -321,14 +316,10 @@ struct BlacklistInitializer { {"And/0", TestBlacklist::AnyDeviceAnyEngine}, {"Ceil_FloatTy/0", TestBlacklist::AnyDeviceAnyEngine}, {"Ceil_Int8QTy/0", TestBlacklist::AnyDeviceAnyEngine}, - {"CmpGTE_FloatTy/0", TestBlacklist::AnyDeviceAnyEngine}, {"CmpGTE_Int32ITy/0", TestBlacklist::AnyDeviceAnyEngine}, {"CmpGTE_Int64ITy/0", TestBlacklist::AnyDeviceAnyEngine}, - {"CmpGTE_Int8QTy/0", TestBlacklist::AnyDeviceAnyEngine}, - {"CmpGT_FloatTy/0", TestBlacklist::AnyDeviceAnyEngine}, {"CmpGT_Int32ITy/0", TestBlacklist::AnyDeviceAnyEngine}, {"CmpGT_Int64ITy/0", TestBlacklist::AnyDeviceAnyEngine}, - {"CmpGT_Int8QTy/0", TestBlacklist::AnyDeviceAnyEngine}, {"CmpNEQ_FloatTy/0", TestBlacklist::AnyDeviceAnyEngine}, {"CmpNEQ_Int32ITy/0", TestBlacklist::AnyDeviceAnyEngine}, {"CmpNEQ_Int64ITy/0", TestBlacklist::AnyDeviceAnyEngine},