diff --git a/lib/Backends/NNPI/CMakeLists.txt b/lib/Backends/NNPI/CMakeLists.txt index b57cace4c3..95c5297e31 100644 --- a/lib/Backends/NNPI/CMakeLists.txt +++ b/lib/Backends/NNPI/CMakeLists.txt @@ -83,14 +83,14 @@ if(NOT NNPI_INFERENCE_API) message(FATAL_ERROR "nnpi_inference include files not found at ${NNPI_INF_LIB_DIR}") endif() -find_path(NNPI_MG_API nnpiml.h ${NNPI_MG_SEARCH_PATH}) +find_path(NNPI_MG_API nnpi_ice_caps.h ${NNPI_MG_SEARCH_PATH}) if(NOT NNPI_MG_API) - message(FATAL_ERROR "nnpiml include files not found at ${NNPI_MG_API_DIR}") + message(FATAL_ERROR "nnpi_ice_caps include files not found at ${NNPI_MG_API_DIR}") endif() -find_library(NNPI_MG_LIB nnpiml ${NNPI_MG_LIB_SEARCH_PATH}) +find_library(NNPI_MG_LIB nnpi_icecaps ${NNPI_MG_LIB_SEARCH_PATH}) if(NOT NNPI_MG_LIB) - message(FATAL_ERROR "nnpiml library not found at ${NNPI_MG_LIB_SEARCH_PATH}") + message(FATAL_ERROR "nnpi_icecaps library not found at ${NNPI_MG_LIB_SEARCH_PATH}") endif() message(STATUS "[NNPI] NNPI_API_DIR = ${NNPI_API_DIR}") diff --git a/lib/Backends/NNPI/ClassGen/NNPISpecificNodes.h b/lib/Backends/NNPI/ClassGen/NNPISpecificNodes.h index 1ec7c80421..39638f3a6c 100644 --- a/lib/Backends/NNPI/ClassGen/NNPISpecificNodes.h +++ b/lib/Backends/NNPI/ClassGen/NNPISpecificNodes.h @@ -26,6 +26,13 @@ BB.newNode("NNPICustomDSP") .setDocstring("This is an experimental NNPI-specific node representing a " "custom DSP op"); -BB.includeBackendSpecificVerification("glow/NNPISpecificNodesVerification.h"); +BB.newNode("NNPICustomIA") + .addMember(MemberType::VectorNodeValue, "Inputs") + .addResultFromCtorArg() // for now use single output + .addMember(MemberType::String, "KernelName") + .addMember(MemberType::String, "IAPath") + .setDocstring("This is an experimental NNPI-specific node representing a " + "custom IA op"); +BB.includeBackendSpecificVerification("glow/NNPISpecificNodesVerification.h"); #endif // GLOW_WITH_NNPI diff --git a/lib/Backends/NNPI/ClassGen/NNPISpecificNodesVerification.h 
b/lib/Backends/NNPI/ClassGen/NNPISpecificNodesVerification.h index 31c6442c29..97cf81d1a1 100644 --- a/lib/Backends/NNPI/ClassGen/NNPISpecificNodesVerification.h +++ b/lib/Backends/NNPI/ClassGen/NNPISpecificNodesVerification.h @@ -19,4 +19,8 @@ bool NNPICustomDSPNode::verify() const { return true; // actual verification to happen in the backend } +bool NNPICustomIANode::verify() const { + return true; // actual verification to happen in the backend +} + #endif // GLOW_WITH_NNPI diff --git a/lib/Backends/NNPI/DebugMacros.h b/lib/Backends/NNPI/DebugMacros.h index 9f6988dc36..83c97b1dfe 100644 --- a/lib/Backends/NNPI/DebugMacros.h +++ b/lib/Backends/NNPI/DebugMacros.h @@ -21,6 +21,7 @@ #include "nnpi_transformer.h" #include #include +#include #include // Macro for memory instrumentation. @@ -273,4 +274,14 @@ GetNNPIInferenceErrorDesc(NNPIInferenceErrorCode err) { "\n"; \ } +// Break long log messages to individual lines (Glog limits to 30k chars). +#define LONG_LOG(level, msg) \ + { \ + std::istringstream iss(msg); \ + std::string line; \ + while (std::getline(iss, line)) { \ + LOG(level) << line; \ + } \ + } + #endif // GLOW_NNPI_DEBUG_MACROS_H diff --git a/lib/Backends/NNPI/Importer.cpp b/lib/Backends/NNPI/Importer.cpp index 6b4578d30b..c5b45475bd 100644 --- a/lib/Backends/NNPI/Importer.cpp +++ b/lib/Backends/NNPI/Importer.cpp @@ -23,6 +23,7 @@ #include "nnpi_transformer.h" #include #include +#include #include using namespace glow; @@ -215,7 +216,7 @@ void glow::NNPIImporter::updateDescDimsFromGlow( desc.layout = NNPI_LAYOUT_ANY; break; case 5: - desc.layout = NNPI_LAYOUT_ANY; + desc.layout = alternativeLayout ? NNPI_LAYOUT_NDHWC : NNPI_LAYOUT_ANY; break; case 4: desc.layout = alternativeLayout ? 
NNPI_LAYOUT_NHWC : NNPI_LAYOUT_ANY; @@ -403,6 +404,7 @@ bool glow::NNPIImporter::isVariableUsingAlternativeLayout(Storage *v) { for (const auto &user : v->getUsers()) { switch (user.getUser()->getKind()) { case Kinded::Kind::ConvolutionNodeKind: + case Kinded::Kind::Convolution3DNodeKind: case Kinded::Kind::AvgPoolNodeKind: case Kinded::Kind::MaxPoolNodeKind: return true; @@ -415,6 +417,17 @@ bool glow::NNPIImporter::isVariableUsingAlternativeLayout(Storage *v) { return false; } +NNPIErrorCode +glow::NNPIImporter::addIAExtentionPath(const std::string &extPath) { + LOG_AND_RETURN_IF(ERROR, extPath.empty(), "Check if empty IA extension path.", + NNPI_INVALID_PARAM); + std::ifstream extensionFile(extPath.c_str()); + LOG_AND_RETURN_IF_NOT(ERROR, extensionFile, "IA extension path not found.", + NNPI_INVALID_RESOURCE_NAME); + iaExtensionPaths_.push_back(extPath); + return NNPI_NO_ERROR; +} + NNPINetwork glow::NNPIImporter::importFunction(Function *F, const BackendOptions &opts) { // Clear internals. @@ -507,33 +520,45 @@ NNPINetwork glow::NNPIImporter::importFunction(Function *F, } // Node Importers //////////////////////////////////////////////////////// +template class ConvolutionNodeImporter : public INNPINodeImporter { public: NNPIErrorCode importNode(Node *n, NNPIImporter &importer) override { - auto *glowConv = llvm::dyn_cast(n); + auto *glowConv = llvm::dyn_cast(n); + + std::string convStr = (convDims == 2) ? 
"Conv" : "Conv3D"; LOG_AND_RETURN_IF_NOT(ERROR, glowConv, "Bad node type", NNPI_INVALID_PARAM); - const uint32_t SPATIAL_DIMS2 = 2; - LOG_AND_RETURN_IF_NOT(ERROR, glowConv->getKernels().size() == SPATIAL_DIMS2, - "[Conv] Invalid number of kernel sizes", + LOG_AND_RETURN_IF_NOT(ERROR, glowConv->getKernels().size() == convDims, + "[" + convStr + "] Invalid number of kernel sizes", NNPI_INVALID_PARAM); - LOG_AND_RETURN_IF_NOT(ERROR, - glowConv->getPads().size() == 2 * SPATIAL_DIMS2, - "[Conv] Invalid number of pads", NNPI_INVALID_PARAM); - LOG_AND_RETURN_IF_NOT(ERROR, glowConv->getStrides().size() == SPATIAL_DIMS2, - "[Conv] Invalid number of strides", + LOG_AND_RETURN_IF_NOT(ERROR, glowConv->getPads().size() == 2 * convDims, + "[" + convStr + "] Invalid number of pads", + NNPI_INVALID_PARAM); + LOG_AND_RETURN_IF_NOT(ERROR, glowConv->getStrides().size() == convDims, + "[" + convStr + "] Invalid number of strides", NNPI_INVALID_PARAM); - uint32_t kernel[SPATIAL_DIMS2] = {glowConv->getKernels()[0], - glowConv->getKernels()[1]}; - uint32_t paddingStart[SPATIAL_DIMS2] = {glowConv->getPads()[0], - glowConv->getPads()[1]}; - uint32_t paddingEnd[SPATIAL_DIMS2] = {glowConv->getPads()[2], - glowConv->getPads()[3]}; - uint32_t stride[SPATIAL_DIMS2] = {glowConv->getStrides()[0], - glowConv->getStrides()[1]}; - uint32_t dilation[SPATIAL_DIMS2] = {glowConv->getDilation(), - glowConv->getDilation()}; + uint32_t kernel[convDims]; + uint32_t paddingStart[convDims]; + uint32_t paddingEnd[convDims]; + uint32_t stride[convDims]; + uint32_t dilation[convDims]; + + ConvolutionNode *conv2DNode = llvm::dyn_cast(glowConv); + for (size_t i = 0; i < convDims; i++) { + kernel[i] = glowConv->getKernels()[i]; + stride[i] = glowConv->getStrides()[i]; + if (conv2DNode) { + paddingStart[i] = glowConv->getPads()[i]; + paddingEnd[i] = glowConv->getPads()[convDims + i]; + dilation[i] = conv2DNode->getDilation(); + } else { + paddingStart[i] = glowConv->getPads()[i * 2]; + paddingEnd[i] = 
glowConv->getPads()[i * 2 + 1]; + dilation[i] = 1; + } + } LOG_NNPI_IF_ERROR_RETURN_VALUE( importer.addTensor(nodeValueName(glowConv->getFilter()), @@ -567,7 +592,7 @@ class ConvolutionNodeImporter : public INNPINodeImporter { nodeValueName(glowConv->getFilter()).c_str(), glowConv->getBias() ? nodeValueName(glowConv->getBias()).c_str() : nullptr, - kernel, paddingStart, paddingEnd, stride, dilation, SPATIAL_DIMS2, + kernel, paddingStart, paddingEnd, stride, dilation, convDims, glowConv->getGroup()); } }; @@ -1430,74 +1455,12 @@ class RQFCNodeImporter : public INNPINodeImporter { auto *glowRowwiseFC = llvm::dyn_cast(n); LOG_AND_RETURN_IF_NOT(ERROR, glowRowwiseFC, "Bad node type", NNPI_INVALID_PARAM); - LOG_AND_RETURN_IF_NOT( - ERROR, glowRowwiseFC->getInput().getType()->getOffset() == 0.f, - (std::string("Bad input offset value") + - std::to_string(glowRowwiseFC->getInput().getType()->getOffset())), - NNPI_INVALID_PARAM); - LOG_AND_RETURN_IF_NOT( - ERROR, glowRowwiseFC->getResult().getType()->getOffset() == 0.f, - (std::string("Bad result offset value") + - std::to_string(glowRowwiseFC->getResult().getType()->getOffset())), - NNPI_INVALID_PARAM); LOG_AND_RETURN_IF_NOT( ERROR, !(glowRowwiseFC->getOffsets()) || importer.zeroes(nodeValueName(glowRowwiseFC->getOffsets()).c_str()), "Bad offset value", NNPI_INVALID_PARAM); - // Add internal tensor for Symlowp input. - std::string symlowpInputName = - NNPIImporter::internalName_ + - nodeValueName(glowRowwiseFC->getInput()).c_str() + "_symlowp"; - auto *inType = glowRowwiseFC->getInput().getType(); - LOG_NNPI_IF_ERROR_RETURN_VALUE( - importer.addValue(symlowpInputName, inType, - /* alternativeLayout */ inType->dims().size() == 4, - /* input */ false, /* output */ false, {}, {}, - /* forceSymlowp */ true), - "Failed to add value"); - - // Add internal tensor for Symlowp output. 
- std::string symlowpOutputName = - NNPIImporter::internalName_ + - nodeValueName(glowRowwiseFC->getResult()).c_str() + "_symlowp"; - auto *outType = glowRowwiseFC->getResult().getType(); - LOG_NNPI_IF_ERROR_RETURN_VALUE( - importer.addValue(symlowpOutputName, outType, - /* alternativeLayout */ outType->dims().size() == 4, - /* input */ false, /* output */ false, {}, {}, - /* forceSymlowp */ true), - "Failed to add value"); - - // Add convert op from Gemmlowp input to Symlowp. - std::string convertInputName = NNPIImporter::internalName_ + - glowRowwiseFC->getName().begin() + - "_convert_input"; - std::string convertInputInputName = - nodeValueName(glowRowwiseFC->getInput()); - if (!importer.hasChannelWiseConverter(convertInputInputName)) { - LOG_NNPI_IF_ERROR_RETURN_VALUE( - nnpiNetworkAddConvertOp( - importer.getNetwork(), convertInputName.c_str(), - convertInputInputName.c_str(), symlowpInputName.c_str()), - "Failed to add layer"); - importer.addChannelWiseConverter(convertInputInputName); - } - - // Add convert op from Symlowp output to Gemmlowp. - std::string convertOutputName = NNPIImporter::internalName_ + - glowRowwiseFC->getName().begin() + - "_convert_output"; - std::string convertOutputOutputName = - nodeValueName(glowRowwiseFC->getResult()); - LOG_NNPI_IF_ERROR_RETURN_VALUE( - nnpiNetworkAddConvertOp( - importer.getNetwork(), convertOutputName.c_str(), - symlowpOutputName.c_str(), convertOutputOutputName.c_str()), - "Failed to add layer"); - importer.addChannelWiseConverter(convertOutputOutputName); - // Create the weights with no offset tensor. // Assert weights & biases have no offset or all zeroes. 
@@ -1534,17 +1497,14 @@ class RQFCNodeImporter : public INNPINodeImporter { nodeValueName(glowRowwiseFC->getInput()), nodeValueName(glowRowwiseFC->getWeights()), nodeValueName(glowRowwiseFC->getBias()), - symlowpInputName, - symlowpOutputName, }, { nodeValueName(glowRowwiseFC->getResult()), - symlowpInputName, - symlowpOutputName, }); return nnpiNetworkAddFullyConnectedOp( importer.getNetwork(), glowRowwiseFC->getName().begin(), - symlowpInputName.c_str(), symlowpOutputName.c_str(), + nodeValueName(glowRowwiseFC->getInput()).c_str(), + nodeValueName(glowRowwiseFC->getResult()).c_str(), nodeValueName(glowRowwiseFC->getWeights()).c_str(), glowRowwiseFC->getBias() ? nodeValueName(glowRowwiseFC->getBias()).c_str() @@ -1560,7 +1520,6 @@ class ChannelwiseQuantizedConvolutionNodeImporter : public INNPINodeImporter { llvm::dyn_cast(n); LOG_AND_RETURN_IF_NOT(ERROR, glowChannelwiseQuantizedConv, "Bad node type", NNPI_INVALID_PARAM); - LOG_AND_RETURN_IF_NOT( ERROR, !(glowChannelwiseQuantizedConv->getFilterOffsets()) || @@ -1597,60 +1556,6 @@ class ChannelwiseQuantizedConvolutionNodeImporter : public INNPINodeImporter { glowChannelwiseQuantizedConv->getStrides()[1]}; uint32_t dilation[SPATIAL_DIMS2] = {1, 1}; // No dilation, default values - // Add internal tensor for Symlowp input. - std::string symlowpInputName = - NNPIImporter::internalName_ + - nodeValueName(glowChannelwiseQuantizedConv->getInput()).c_str() + - "_symlowp"; - auto *inType = glowChannelwiseQuantizedConv->getInput().getType(); - LOG_NNPI_IF_ERROR_RETURN_VALUE( - importer.addValue(symlowpInputName, inType, - /* alternativeLayout */ inType->dims().size() == 4, - /* input */ false, /* output */ false, {}, {}, - /* forceSymlowp */ true), - "Failed to add value"); - - // Add internal tensor for Symlowp output. 
- std::string symlowpOutputName = - NNPIImporter::internalName_ + - nodeValueName(glowChannelwiseQuantizedConv->getResult()).c_str() + - "_symlowp"; - auto *outType = glowChannelwiseQuantizedConv->getResult().getType(); - LOG_NNPI_IF_ERROR_RETURN_VALUE( - importer.addValue(symlowpOutputName, outType, - /* alternativeLayout */ outType->dims().size() == 4, - /* input */ false, /* output */ false, {}, {}, - /* forceSymlowp */ true), - "Failed to add value"); - - // Add convert op from Gemmlowp input to Symlowp. - std::string convertInputName = - NNPIImporter::internalName_ + - glowChannelwiseQuantizedConv->getName().begin() + "_convert_input"; - std::string convertInputInputName = - nodeValueName(glowChannelwiseQuantizedConv->getInput()); - if (!importer.hasChannelWiseConverter(convertInputInputName)) { - LOG_NNPI_IF_ERROR_RETURN_VALUE( - nnpiNetworkAddConvertOp( - importer.getNetwork(), convertInputName.c_str(), - convertInputInputName.c_str(), symlowpInputName.c_str()), - "Failed to add layer"); - importer.addChannelWiseConverter(convertInputInputName); - } - - // Add convert op from Symlowp output to Gemmlowp. - std::string convertOutputName = - NNPIImporter::internalName_ + - glowChannelwiseQuantizedConv->getName().begin() + "_convert_output"; - std::string convertOutputOutputName = - nodeValueName(glowChannelwiseQuantizedConv->getResult()); - LOG_NNPI_IF_ERROR_RETURN_VALUE( - nnpiNetworkAddConvertOp( - importer.getNetwork(), convertOutputName.c_str(), - symlowpOutputName.c_str(), convertOutputOutputName.c_str()), - "Failed to add layer"); - importer.addChannelWiseConverter(convertOutputOutputName); - // Create the weights with no offset tensor. // Assert weights & biases have no offset or all zeroes. 
@@ -1694,18 +1599,15 @@ class ChannelwiseQuantizedConvolutionNodeImporter : public INNPINodeImporter { nodeValueName(glowChannelwiseQuantizedConv->getInput()), nodeValueName(glowChannelwiseQuantizedConv->getFilter()), nodeValueName(glowChannelwiseQuantizedConv->getBias()), - symlowpInputName, - symlowpOutputName, }, { nodeValueName(glowChannelwiseQuantizedConv->getResult()), - symlowpInputName, - symlowpOutputName, }); return nnpiNetworkAddConvolutionOp( importer.getNetwork(), glowChannelwiseQuantizedConv->getName().begin(), - symlowpInputName.c_str(), symlowpOutputName.c_str(), + nodeValueName(glowChannelwiseQuantizedConv->getInput()).c_str(), + nodeValueName(glowChannelwiseQuantizedConv->getResult()).c_str(), nodeValueName(glowChannelwiseQuantizedConv->getFilter()).c_str(), glowChannelwiseQuantizedConv->getBias() ? nodeValueName(glowChannelwiseQuantizedConv->getBias()).c_str() @@ -1956,6 +1858,42 @@ class BatchOneHotNodeImporter : public INNPINodeImporter { } }; +class NNPICustomIANodeImporter : public INNPINodeImporter { +public: + NNPIErrorCode importNode(Node *n, NNPIImporter &importer) override { + auto *glowIA = llvm::dyn_cast(n); + LOG_AND_RETURN_IF_NOT(ERROR, glowIA, "Bad node type", NNPI_INVALID_PARAM); + + auto numInputs = glowIA->getInputs().size(); + NNPIObjectName inputs[numInputs]; + LOG_AND_RETURN_IF_NOT(ERROR, inputs, "No inputs", NNPI_INVALID_PARAM); + std::unordered_set inputTensors; + uint32_t i = 0; + for (const auto &nv : glowIA->getInputs()) { + auto nvName = nodeValueName(nv); + strncpy(inputs[i++], nvName.c_str(), sizeof(NNPIObjectName)); + inputTensors.insert(nvName); + } + + uint32_t numOutputs = 1; + NNPIObjectName outputs[numOutputs]; + LOG_AND_RETURN_IF_NOT(ERROR, outputs, "No outputs", NNPI_INVALID_PARAM); + std::unordered_set outputTensors; + auto nvName = nodeValueName(glowIA->getResult()); + strncpy(outputs[0], nvName.c_str(), sizeof(NNPIObjectName)); + outputTensors.insert(nvName); + + importer.setUsedTensors(inputTensors, 
outputTensors); + NNPIErrorCode error = importer.addIAExtentionPath(glowIA->getIAPath()); + LOG_AND_RETURN_IF_NOT(ERROR, error == NNPI_NO_ERROR, + "Failed to store IA extension", NNPI_INVALID_PARAM); + + auto res = nnpiNetworkAddCustomIAOp( + importer.getNetwork(), glowIA->getName().begin(), numInputs, inputs, + numOutputs, outputs, glowIA->getKernelName().c_str()); + return res; + } +}; class NNPICustomDSPNodeImporter : public INNPINodeImporter { public: NNPIErrorCode importNode(Node *n, NNPIImporter &importer) override { @@ -2079,7 +2017,10 @@ std::unordered_map< std::string, std::unique_ptr>::value_type importerInit[] = { {"", nullptr}, - {"Convolution", glow::make_unique()}, + {"Convolution", + glow::make_unique>()}, + {"Convolution3D", + glow::make_unique>()}, {"Transpose", glow::make_unique()}, {"MaxPool", glow::make_unique>()}, @@ -2160,6 +2101,7 @@ std::unordered_map< {"LengthsRangeFill", glow::make_unique()}, {"BatchOneHot", glow::make_unique()}, {"NNPICustomDSP", glow::make_unique()}, + {"NNPICustomIA", glow::make_unique()}, {"SpaceToDepth", glow::make_unique()}, {"Clip", glow::make_unique()}, {"BatchNormalization", glow::make_unique()}, diff --git a/lib/Backends/NNPI/Importer.h b/lib/Backends/NNPI/Importer.h index 539b7d8887..712bb1c1c8 100644 --- a/lib/Backends/NNPI/Importer.h +++ b/lib/Backends/NNPI/Importer.h @@ -106,6 +106,16 @@ class NNPIImporter { channelwiseConverters_.emplace(s); } + /// Add a path to IA extension (that will be loaded by the inference API). + /// Will fail if a file does not exist at this path, validity of the file is + /// checked only when the extension is loaded. + NNPIErrorCode addIAExtentionPath(const std::string &extPath); + + /// Get IA extension paths. + const std::vector &getIAExtensionPaths() const { + return iaExtensionPaths_; + } + private: /// Map of named external tensors (inputs, outputs, weights, etc...). 
std::unordered_map constants_; @@ -135,6 +145,9 @@ class NNPIImporter { /// an input is feeding into more than one channelwise ops. 2. an output of /// one channelwise op is consumed by another channelwise op. std::unordered_set channelwiseConverters_; + + /// A list of IA extensions that need to be loaded by the device. + std::vector iaExtensionPaths_; }; /// Interface class for all node specific importers. diff --git a/lib/Backends/NNPI/InferenceContext.cpp b/lib/Backends/NNPI/InferenceContext.cpp index 6790ebe3ce..68c0d0604c 100644 --- a/lib/Backends/NNPI/InferenceContext.cpp +++ b/lib/Backends/NNPI/InferenceContext.cpp @@ -30,7 +30,7 @@ namespace runtime { InferenceContext::InferenceContext() : nnpiNetwork_(NNPI_INVALID_NNPIHANDLE), device_(NNPI_INVALID_NNPIHANDLE), inferCmd_(NNPI_INVALID_NNPIHANDLE), commandList_(NNPI_INVALID_NNPIHANDLE), - deviceTracing_(nullptr), deviceOptions_(nullptr) {} + deviceOptions_(nullptr) {} InferenceContext::~InferenceContext() { if (deviceOptions_ && deviceOptions_->inferOnDevice) { @@ -52,7 +52,6 @@ bool InferenceContext::init( NNPIDeviceContext device, const std::unordered_set &partialInputs, const std::unordered_set &staticInputs, - std::shared_ptr deviceTracing, StaticPlaceholderMap *staticPlaceholderMap, std::shared_ptr deviceOptions, const std::string &functionName, unsigned deviceId, @@ -63,7 +62,6 @@ bool InferenceContext::init( device_ = device; compilationConfig_ = config; partialInputs_ = &partialInputs; - deviceTracing_ = deviceTracing; functionName_ = functionName; // Initialize trace context titles with device ID. 
@@ -309,6 +307,8 @@ bool InferenceContext::init( void InferenceContext::execute(RunIdentifierTy runId, std::unique_ptr ctx, runtime::ResultCBTy resultCB) { + std::string traceBackendExecuteStr = + llvm::formatv("{0} {1:x}", traceBackendExecuteContextName_, ctx.get()); std::map attributes; @@ -319,7 +319,7 @@ void InferenceContext::execute(RunIdentifierTy runId, } TRACE_EVENT_SCOPE_NAMED(ctx->getTraceContext(), TraceLevel::REQUEST, - traceBackendExecuteContextName_, traceBlock); + traceBackendExecuteStr, traceBlock); for (const auto &iter : attributes) { traceBlock.addArg(iter.first, iter.second); } @@ -329,9 +329,6 @@ void InferenceContext::execute(RunIdentifierTy runId, llvm::formatv("Inf ctx - device: {0}: {1}", deviceId_, functionName_) .str()); } - if (deviceTracing_) { - deviceTracing_->start(ctx->getTraceContext(), device_); - } // Pre inference input preparation. PlaceholderBindings &bindings = *ctx->getPlaceholderBindings(); @@ -393,15 +390,16 @@ void InferenceContext::execute(RunIdentifierTy runId, } rawInputs.push_back(in->getHostPtr()); } - + std::string inferContext = traceInferenceContextName_; // Inference. if (deviceOptions_->inferOnDevice) { if (deviceOptions_->enabledCommandLists < 1) { // No command lists (schedule individual commands). + inferContext = llvm::formatv("{0} {1:x}", inferContext, inferCmd_); TRACE_EVENT_END(ctx->getTraceContext(), TraceLevel::COPY, tracePreProcessContextName_); TRACE_EVENT_BEGIN_ATTR(ctx->getTraceContext(), TraceLevel::OPERATOR, - traceInferenceContextName_, attributes); + inferContext, attributes); // Queue inference. 
LOG_AND_CALLBACK_EXECUTE_NNPI_INF_IF_ERROR( nnpiInferCommandQueue(inferCmd_, 0), "Failed to queue infer command.", @@ -429,11 +427,11 @@ void InferenceContext::execute(RunIdentifierTy runId, usedConfigs++; } } - + inferContext = llvm::formatv("{0} {1:x}", inferContext, commandList_); TRACE_EVENT_END(ctx->getTraceContext(), TraceLevel::COPY, tracePreProcessContextName_); TRACE_EVENT_BEGIN_ATTR(ctx->getTraceContext(), TraceLevel::OPERATOR, - traceInferenceContextName_, attributes); + inferContext, attributes); // Queue Command list LOG_AND_CALLBACK_EXECUTE_NNPI_INF_IF_ERROR( nnpiCommandListQueue(commandList_, &(cmdConfigs_.at(0)), usedConfigs), @@ -482,15 +480,18 @@ void InferenceContext::execute(RunIdentifierTy runId, } else if (!deviceOptions_->useIceT) { // Infer on ice-ref. + // Ice-ref not re-entrant - To be removed once ICE-29869 is implemented + static std::mutex icerefMutex; + std::lock_guard guard(icerefMutex); + for (auto &out : outputResources_) { // Collect output ptrs for ICE-Ref rawOutputs.push_back(out->getHostPtr()); } - TRACE_EVENT_END(ctx->getTraceContext(), TraceLevel::COPY, - TRACING_PRE_PROCESS); + tracePreProcessContextName_); TRACE_EVENT_BEGIN_ATTR(ctx->getTraceContext(), TraceLevel::OPERATOR, - TRACING_INFERENCE, attributes); + inferContext, attributes); LOG_AND_CALLBACK_EXECUTE_NNPI_IF_ERROR( nnpiNetworkInferOnHost(nnpiNetwork_, &(rawInputs[0]), rawInputs.size(), &(rawOutputs[0]), rawOutputs.size(), @@ -501,11 +502,9 @@ void InferenceContext::execute(RunIdentifierTy runId, // Nothing else to do here. } - TRACE_EVENT_END(ctx->getTraceContext(), TraceLevel::OPERATOR, - traceInferenceContextName_); + TRACE_EVENT_END(ctx->getTraceContext(), TraceLevel::OPERATOR, inferContext); TRACE_EVENT_BEGIN_ATTR(ctx->getTraceContext(), TraceLevel::COPY, tracePostProcessContextName_, attributes); - // Post inference output handling. 
for (unsigned i = 0, e = outputResources_.size(); i < e; ++i) { auto *t = bindings.get(netOutputPlaceholders_[i]); @@ -518,9 +517,6 @@ void InferenceContext::execute(RunIdentifierTy runId, TRACE_EVENT_END(ctx->getTraceContext(), TraceLevel::COPY, tracePostProcessContextName_); - if (deviceTracing_) { - deviceTracing_->stopAndUpdate(ctx->getTraceContext(), device_); - } TRACE_EVENT_SCOPE_END_NAMED(traceBlock); // we move context in the line below // Invoke CB. diff --git a/lib/Backends/NNPI/InferenceContext.h b/lib/Backends/NNPI/InferenceContext.h index eae9dee660..a67a3d69de 100644 --- a/lib/Backends/NNPI/InferenceContext.h +++ b/lib/Backends/NNPI/InferenceContext.h @@ -52,9 +52,6 @@ class InferenceContext { /// Set of inputs that are static tensors. std::unordered_set staticInputs_; - /// Device tracing handler. - std::shared_ptr deviceTracing_; - /// NNPI Device configuration. std::shared_ptr deviceOptions_; @@ -95,7 +92,6 @@ class InferenceContext { NNPIDeviceContext device, const std::unordered_set &partialInputs, const std::unordered_set &staticInputs, - std::shared_ptr deviceTracing, StaticPlaceholderMap *staticPlaceholderMap, std::shared_ptr deviceOptions, const std::string &functionName, unsigned deviceId, diff --git a/lib/Backends/NNPI/InferencePool.cpp b/lib/Backends/NNPI/InferencePool.cpp index 4311dc6057..387ecf775b 100644 --- a/lib/Backends/NNPI/InferencePool.cpp +++ b/lib/Backends/NNPI/InferencePool.cpp @@ -27,6 +27,38 @@ namespace glow { namespace runtime { +static bool isEmptyDeviceNetworkConfig(const NNPIDeviceNetworkConfig &cfg) { + if (cfg.disableECC != 0) { + return false; + } + + if (cfg.pnpHints.ringFrequencyPrio != 0.f) { + return false; + } + + const int numIceBO = sizeof(cfg.pnpHints.iceBOFrequencyPrio) / + sizeof(cfg.pnpHints.iceBOFrequencyPrio[0]); + for (int i = 0; i < numIceBO; i++) { + if (cfg.pnpHints.iceBOFrequencyPrio[i] != 0.f) { + return false; + } + } + + const int numIA = sizeof(cfg.pnpHints.IAFrequencyPrio) / + 
sizeof(cfg.pnpHints.IAFrequencyPrio[0]); + for (unsigned i = 0; i < numIA; i++) { + if (cfg.pnpHints.IAFrequencyPrio[i] != 0.f) { + return false; + } + } + + if (cfg.pnpHints.DDRBandwidth != 0.f) { + return false; + } + + return true; +} + InferencePoolEnv::InferencePoolEnv() : deviceOptions_(nullptr), nnpiCompiledFunction_(nullptr), staticPlaceholderMap_(nullptr) {} @@ -42,7 +74,6 @@ InferencePoolEnv::~InferencePoolEnv() { } Error InferencePoolEnv::init(NNPIAdapter adapter, NNPIDeviceContext device, - std::shared_ptr deviceTracing, CompiledFunction *compiledFunction, StaticPlaceholderMap *staticPlaceholderMap, std::shared_ptr deviceOptions, @@ -64,7 +95,6 @@ Error InferencePoolEnv::init(NNPIAdapter adapter, NNPIDeviceContext device, size_t numWorkers = deviceOptions_->inferOnDevice ? optionsNumWorkers : 1; workersPool_ = glow::make_unique( numWorkers, std::make_shared("NNPI-worker")); - deviceTracing_ = deviceTracing; staticPlaceholderMap_ = staticPlaceholderMap; inferenceContexts_.resize(numWorkers); @@ -76,6 +106,16 @@ Error InferencePoolEnv::init(NNPIAdapter adapter, NNPIDeviceContext device, // Create host network. NNPIHostNetwork hostNetwork(NNPI_INVALID_NNPIHANDLE); if (deviceOptions_->inferOnDevice) { + // Load IA extensions. + for (auto &extensionPath : nnpiCompiledFunction_->getIAExtensionPaths()) { + NNPIExtension ext; + LOG_NNPI_INF_IF_ERROR_RETURN_LLVMERROR( + nnpiExtensionCreate(extensionPath.c_str(), &ext), + "Failed to create NNPI IA Extension object"); + LOG_NNPI_INF_IF_ERROR_RETURN_LLVMERROR( + nnpiDeviceContextLoadExtension(device, ext), + "Failed to load NNPI IA Extension object"); + } // Create NNPI host network (load compiled binary). auto filename = nnpiCompiledFunction_->getCompilationFilename(); if (filename.empty()) // Create network from memory. 
@@ -106,9 +146,32 @@ Error InferencePoolEnv::init(NNPIAdapter adapter, NNPIDeviceContext device, } DBG_MEM_USAGE("call nnpiDeviceNetworkCreate"); + NNPIDeviceNetworkConfig cfg = + nnpiCompiledFunction_->getDeviceNetworkConfig(); + NNPIDeviceNetworkConfig *pCfg = nullptr; + if (!isEmptyDeviceNetworkConfig(cfg)) { + pCfg = &cfg; + LOG(INFO) << "DeviceNetwork PnP: " + << "\n"; + LOG(INFO) << " Ring: " << cfg.pnpHints.ringFrequencyPrio << "\n"; + LOG(INFO) << " ICEBO 0: " << cfg.pnpHints.iceBOFrequencyPrio[0] << "\n"; + LOG(INFO) << " ICEBO 1: " << cfg.pnpHints.iceBOFrequencyPrio[1] << "\n"; + LOG(INFO) << " ICEBO 2: " << cfg.pnpHints.iceBOFrequencyPrio[2] << "\n"; + LOG(INFO) << " ICEBO 3: " << cfg.pnpHints.iceBOFrequencyPrio[3] << "\n"; + LOG(INFO) << " ICEBO 4: " << cfg.pnpHints.iceBOFrequencyPrio[4] << "\n"; + LOG(INFO) << " ICEBO 5: " << cfg.pnpHints.iceBOFrequencyPrio[5] << "\n"; + LOG(INFO) << " IA 0: " << cfg.pnpHints.IAFrequencyPrio[0] << "\n"; + LOG(INFO) << " IA 1: " << cfg.pnpHints.IAFrequencyPrio[1] << "\n"; + LOG(INFO) << " DDR: " << cfg.pnpHints.DDRBandwidth << "\n"; + LOG(INFO) + << " Resource reservation: " + << nnpiCompiledFunction_->getCompilationOptions().reserveResources + << "\n"; + } + // Create NNPI device network (deploy to device). 
LOG_NNPI_INF_IF_ERROR_RETURN_LLVMERROR( - nnpiDeviceNetworkCreate(device, hostNetwork, nullptr, &deviceNetwork_), + nnpiDeviceNetworkCreate(device, hostNetwork, pCfg, &deviceNetwork_), "Failed to create NNPI device network"); DBG_MEM_USAGE("done nnpiDeviceNetworkCreate"); if (nnpiCompiledFunction_->getCompilationOptions().reserveResources) { @@ -187,8 +250,8 @@ Error InferencePoolEnv::init(NNPIAdapter adapter, NNPIDeviceContext device, nnpiCompiledFunction_->getCompiledNetworkHandle(), nnpiCompiledFunction_->getCompilationConfig(), deviceNetwork_, adapter, device, nnpiCompiledFunction_->getPartialInputs(), - nnpiCompiledFunction_->getStaticInputs(), deviceTracing_, - staticPlaceholderMap_, deviceOptions_, functionName_, deviceId_); + nnpiCompiledFunction_->getStaticInputs(), staticPlaceholderMap_, + deviceOptions_, functionName_, deviceId_); if (!success) { return MAKE_ERR("Failed to initialize inferece context"); } @@ -264,14 +327,13 @@ InferencePoolEnv::createDetachedInferenceContext(PlaceholderUsageMap &phUsage) { InferenceContext *infCtx = new InferenceContext(); - if (!infCtx->init(inputDesc_, outputDesc_, - nnpiCompiledFunction_->getCompiledNetworkHandle(), - nnpiCompiledFunction_->getCompilationConfig(), - deviceNetwork_, adapter_, device_, - nnpiCompiledFunction_->getPartialInputs(), - nnpiCompiledFunction_->getStaticInputs(), deviceTracing_, - staticPlaceholderMap_, deviceOptions_, functionName_, - deviceId_, &phUsage)) { + if (!infCtx->init( + inputDesc_, outputDesc_, + nnpiCompiledFunction_->getCompiledNetworkHandle(), + nnpiCompiledFunction_->getCompilationConfig(), deviceNetwork_, + adapter_, device_, nnpiCompiledFunction_->getPartialInputs(), + nnpiCompiledFunction_->getStaticInputs(), staticPlaceholderMap_, + deviceOptions_, functionName_, deviceId_, &phUsage)) { delete infCtx; ASSERT_WITH_MSG(infCtx, "Failed to initialize detached inference context"); return nullptr; diff --git a/lib/Backends/NNPI/InferencePool.h b/lib/Backends/NNPI/InferencePool.h 
index 2510f96f85..2aa32c45ea 100644 --- a/lib/Backends/NNPI/InferencePool.h +++ b/lib/Backends/NNPI/InferencePool.h @@ -38,7 +38,6 @@ class InferencePoolEnv { std::vector freeContexts_; std::mutex freeContextsLock_; NNPIDeviceNetwork deviceNetwork_; - std::shared_ptr deviceTracing_; std::shared_ptr deviceOptions_; unsigned deviceId_; ResourceDescVec inputDesc_; @@ -53,7 +52,6 @@ class InferencePoolEnv { InferencePoolEnv(); ~InferencePoolEnv(); Error init(NNPIAdapter adapter, NNPIDeviceContext device, - std::shared_ptr deviceTracing, CompiledFunction *compiledFunction, StaticPlaceholderMap *staticPlaceholderMap, std::shared_ptr deviceOptions, diff --git a/lib/Backends/NNPI/NNPI.cpp b/lib/Backends/NNPI/NNPI.cpp index 1863c0768b..543c6e5c19 100644 --- a/lib/Backends/NNPI/NNPI.cpp +++ b/lib/Backends/NNPI/NNPI.cpp @@ -465,6 +465,7 @@ bool NNPIBackend::isOpSupported(const NodeInfo &NI) const { (NI.getInElemTy(BatchOneHotNode::LengthsIdx) == ElemKind::Int32ITy); case Kinded::Kind::NNPICustomDSPNodeKind: + case Kinded::Kind::NNPICustomIANodeKind: return true; case Kinded::Kind::SpaceToDepthNodeKind: @@ -499,6 +500,7 @@ bool NNPIBackend::shouldLower(const Node *N) const { case Kinded::Kind::TanhNodeKind: case Kinded::Kind::ReluNodeKind: case Kinded::Kind::ConvolutionNodeKind: + case Kinded::Kind::Convolution3DNodeKind: case Kinded::Kind::TileNodeKind: case Kinded::Kind::LogNodeKind: case Kinded::Kind::ReplaceNaNNodeKind: diff --git a/lib/Backends/NNPI/NNPICompiledFunction.cpp b/lib/Backends/NNPI/NNPICompiledFunction.cpp index 68320e8a7c..e67dfe9edd 100644 --- a/lib/Backends/NNPI/NNPICompiledFunction.cpp +++ b/lib/Backends/NNPI/NNPICompiledFunction.cpp @@ -61,6 +61,24 @@ static void trySetDeviceVersion(NNPICompilationOptions &compilationOptions) { compilationOptions.deviceVersion.setVal(*devVerOrErr + 1); } +/// Update device network config from the compilation config +static NNPIDeviceNetworkConfig parseDeviceNetworkConfig( + const glow::NNPICompilationOptions 
&compilationOptions) { + NNPIDeviceNetworkConfig cfg; + std::memset(&cfg, 0, sizeof(cfg)); + cfg.pnpHints.ringFrequencyPrio = compilationOptions.ringPrio; + cfg.pnpHints.iceBOFrequencyPrio[0] = compilationOptions.iceBOPrio0; + cfg.pnpHints.iceBOFrequencyPrio[1] = compilationOptions.iceBOPrio1; + cfg.pnpHints.iceBOFrequencyPrio[2] = compilationOptions.iceBOPrio2; + cfg.pnpHints.iceBOFrequencyPrio[3] = compilationOptions.iceBOPrio3; + cfg.pnpHints.iceBOFrequencyPrio[4] = compilationOptions.iceBOPrio4; + cfg.pnpHints.iceBOFrequencyPrio[5] = compilationOptions.iceBOPrio5; + cfg.pnpHints.IAFrequencyPrio[0] = compilationOptions.iaPrio0; + cfg.pnpHints.IAFrequencyPrio[1] = compilationOptions.iaPrio1; + cfg.pnpHints.DDRBandwidth = compilationOptions.ddrBandwidth; + return cfg; +} + Error NNPICompiledFunction::updateCompilationConfigFromOptions( NNPICompilationOptions &compilationOptions) { if (compilationOptions.showVars) { @@ -103,6 +121,11 @@ Error NNPICompiledFunction::updateCompilationConfigFromOptions( compilationOptions.debugCompileConfigFile.get().c_str(), sizeof(config_.debugConfigFile)); } + + config_.disableSLSOnIA = compilationOptions.disableSLSOnIA; + config_.enableLightweightCompilation = compilationOptions.lightCompilation; + config_.dumpDotFiles = compilationOptions.dumpDotFiles; + return Error::success(); } @@ -254,6 +277,7 @@ Error NNPICompiledFunction::compile(Function *F, const BackendOptions &opts) { NNPIImporter importer(compilationOptions_); network_ = importer.importFunction(F, newOpts); + iaExtensionPaths_ = importer.getIAExtensionPaths(); LOG_IF_INVALID_HANDLE_RETURN_LLVMERROR(network_, "Failed to import function"); // Setting the network name. @@ -350,6 +374,18 @@ Error NNPICompiledFunction::compile(Function *F, const BackendOptions &opts) { compilationFileName_.c_str(), NULL), "Failed NNPI Compile"); } + + // Update compilation info after NNPI compilation. 
+ if (compilationOptions_.dumpCompilationInfo || + compilationOptions_.lightCompilation) { + if (!updateCompilationInfo()) { + // Only issuing a warning (soft fail) + LOG(WARNING) << "Failed to update NNPI compilation info"; + } else if (compilationOptions_.dumpCompilationInfo) { + LONG_LOG(INFO, compilationInfo_.dump(networkName)); + } + } + if (compilationOptions_.inferOnDevice) { DBG_MEM_USAGE("NNPICompiledFunction destroy network"); // NNPINetwork is not needed anymore on the inferfence api path. @@ -375,6 +411,10 @@ Error NNPICompiledFunction::compile(Function *F, const BackendOptions &opts) { staticInputs_.insert(P); } } + + // Update device network config. + devNetConfig_ = parseDeviceNetworkConfig(compilationOptions_); + return Error::success(); } @@ -382,6 +422,7 @@ NNPICompiledFunction::NNPICompiledFunction(Function *F) : CompiledFunction(runtime::RuntimeBundle::create(*F)), compilationOptions_({}) { std::memset(&config_, 0, sizeof(config_)); + std::memset(&devNetConfig_, 0, sizeof(devNetConfig_)); }; NNPICompiledFunction::~NNPICompiledFunction() { @@ -413,3 +454,165 @@ void NNPICompiledFunction::freeCompilationResources() { unlockCompiledStream(); DBG_MEM_USAGE("[After] freeCompilationResources "); } + +bool NNPICompiledFunction::updateCompilationInfo() { + // Clear existing info. + compilationInfo_.clear(); + + if (network_ == NNPI_INVALID_NNPIHANDLE) { + LOG(ERROR) << "Invalid NNPINetwork"; + return false; + } + + // Collect operators. 
+ uint64_t numOps = 0; + LOG_NNPI_IF_ERROR_RETURN_FALSE(nnpiNetworkGetOpNum(network_, &numOps), + "Failed to get num ops"); + for (uint64_t op = 0; op < numOps; op++) { + NNPIOpInfo opInfo; + LOG_NNPI_IF_ERROR_RETURN_FALSE(nnpiNetworkGetOpInfo(network_, op, &opInfo), + "Failed to get op info"); + NNPICompiledOp compiledOp; + compiledOp.name = std::string(opInfo.name); + compiledOp.type = std::string(opInfo.type); + compiledOp.coreIndex = opInfo.coreIndex; + compiledOp.iceBo = opInfo.iceBo; + compiledOp.execType = opInfo.executionType; + for (uint32_t t = 0; t < opInfo.numTensors; t++) { + NNPITensorInfo tensorInfo; + LOG_NNPI_IF_ERROR_RETURN_FALSE( + nnpiNetworkGetOpTensorInfo(network_, op, t, &tensorInfo), + "Failed to get tensor info"); + NNPICompiledTensor compiledTensor; + compiledTensor.name = std::string(tensorInfo.name); + compiledTensor.type = std::string(tensorInfo.type); + compiledTensor.allocType = tensorInfo.allocation; + for (uint32_t d = 0; d < tensorInfo.numDims; d++) { + compiledTensor.shape.push_back(tensorInfo.dims[d]); + } + switch (tensorInfo.usage) { + case NNPI_TENSOR_USAGE_INPUT: + compiledOp.inputs.push_back(compiledTensor); + break; + case NNPI_TENSOR_USAGE_OUTPUT: + compiledOp.outputs.push_back(compiledTensor); + break; + default: + LOG(WARNING) << "Invalid tensor usage"; + break; + } + } + compilationInfo_.ops.insert({compiledOp.name, compiledOp}); + } + + // Collect dependencies. 
+ uint64_t numDeps = 0; + LOG_NNPI_IF_ERROR_RETURN_FALSE( + nnpiNetworkGetOpDependenciesNum(network_, &numDeps), + "Failed to get num dependencies"); + + for (uint64_t dep = 0; dep < numDeps; dep++) { + NNPIObjectName src; + NNPIObjectName dst; + LOG_NNPI_IF_ERROR_RETURN_FALSE( + nnpiNetworkGetOpDependency(network_, dep, src, dst), + "Failed to get op dependency"); + compilationInfo_.opDependencies.push_back( + {std::string(src), std::string(dst)}); + } + + return true; +} + +std::string NNPICompiledTensor::dump() const { + std::stringstream stream; + stream << "name: " << name << ", type: " << type << " ("; + for (const auto &d : shape) { + stream << d << ","; + } + if (shape.size() > 0) { + stream.seekp(-1, stream.cur); + } + stream << "), allocation: "; + switch (allocType) { + case NNPI_ALLOCATION_DEFAULT: + stream << "Default"; + break; + case NNPI_ALLOCATION_DRAM: + stream << "DRAM"; + break; + case NNPI_ALLOCATION_ECC_DRAM: + stream << "ECC DRAM"; + break; + case NNPI_ALLOCATION_LLC: + case NNPI_ALLOCATION_LLC_CLOS0: + case NNPI_ALLOCATION_LLC_CLOS1: + case NNPI_ALLOCATION_LLC_CLOS2: + case NNPI_ALLOCATION_LLC_CLOS3: + stream << "LLC"; + break; + case NNPI_ALLOCATION_SRAM: + stream << "SRAM"; + break; + case NNPI_ALLOCATION_INTERNAL: + stream << "Internal"; + break; + default: + stream << "Unknown"; + break; + } + return stream.str(); +} + +std::string NNPICompiledOp::dump() const { + std::stringstream stream; + stream << " [Op] name: " << name << ", type: " << type << ", exec: "; + switch (execType) { + case NNPI_EXECUTION_IA: + stream << "IA"; + break; + case NNPI_EXECUTION_DSP: + stream << "DSP"; + break; + case NNPI_EXECUTION_DELPHI: + stream << "Delphi"; + break; + case NNPI_EXECUTION_DSE: + stream << "DSE"; + break; + case NNPI_EXECUTION_COMBINED: + stream << "Combined"; + break; + case NNPI_EXECUTION_NOT_SET: + stream << "NotSet"; + break; + default: + stream << "Unknown"; + break; + } + stream << ", core: " << coreIndex << ", iceBo: " << iceBo << 
"\n"; + for (const auto &in : inputs) { + stream << " [Input] " << in.dump() << "\n"; + } + for (const auto &out : outputs) { + stream << " [Output] " << out.dump() << "\n"; + } + + return stream.str(); +} + +std::string NNPICompilationInfo::dump(const std::string &functionName) const { + std::stringstream stream; + stream << "[Start] NNPI Compilation Info for function: \"" << functionName + << "\":\n"; + for (const auto &op : ops) { + stream << op.second.dump(); + } + for (const auto &dep : opDependencies) { + stream << " [Dep] " << dep.first << " -> " << dep.second << "\n"; + } + stream << "[End] NNPI Compilation Info for function: \"" << functionName + << "\":\n"; + + return stream.str(); +} diff --git a/lib/Backends/NNPI/NNPICompiledFunction.h b/lib/Backends/NNPI/NNPICompiledFunction.h index d4838981c2..a9ec1de6b2 100644 --- a/lib/Backends/NNPI/NNPICompiledFunction.h +++ b/lib/Backends/NNPI/NNPICompiledFunction.h @@ -21,6 +21,7 @@ #include "glow/Backend/CompiledFunction.h" #include "glow/Backends/BackendOptions.h" #include "glow/ExecutionContext/ExecutionContext.h" +#include "nnpi_inference_types.h" #include "nnpi_transformer.h" #include #include @@ -28,6 +29,38 @@ namespace glow { +/// Struct containing details exported for a compiled tensor. +struct NNPICompiledTensor { + std::string name; + std::string type; + std::vector shape; + NNPI_ALLOCATION_TYPE allocType; + std::string dump() const; +}; + +/// Struct containing details exported for a compiled operator. +struct NNPICompiledOp { + std::string name; + std::string type; + NNPI_EXECUTION_TYPE execType; + int32_t coreIndex; + int32_t iceBo; + std::vector inputs; + std::vector outputs; + std::string dump() const; +}; + +/// Collection of exported details for compiled functions. 
+struct NNPICompilationInfo { + std::map ops; + std::vector> opDependencies; + std::string dump(const std::string &functionName) const; + void clear() { + ops.clear(); + opDependencies.clear(); + } +}; + /// Function "compiled" for execution by the NNPI backend. class NNPICompiledFunction final : public CompiledFunction { public: @@ -85,6 +118,18 @@ class NNPICompiledFunction final : public CompiledFunction { return outputNames_; } + NNPIDeviceNetworkConfig getDeviceNetworkConfig() const { + return devNetConfig_; + } + + const std::vector &getIAExtensionPaths() const { + return iaExtensionPaths_; + } + + const NNPICompilationInfo &getCompilationInfo() const { + return compilationInfo_; + } + private: NNPINetwork network_; NNPICompilationConfig config_; @@ -96,6 +141,9 @@ class NNPICompiledFunction final : public CompiledFunction { std::string compilationFileName_; std::vector inputNames_; std::vector outputNames_; + NNPIDeviceNetworkConfig devNetConfig_; + std::vector iaExtensionPaths_; + NNPICompilationInfo compilationInfo_; Error updateCompilationConfigFromOptions( NNPICompilationOptions &compilationOptions); @@ -105,6 +153,9 @@ class NNPICompiledFunction final : public CompiledFunction { Error setupCompilationHints(const Function *F, const BackendSpecificNodeInfo &backendSpecificNodeInfo); + + /// Update the internal compilation info object. Return true iff successful. 
+ bool updateCompilationInfo(); ///@} }; } // end namespace glow diff --git a/lib/Backends/NNPI/NNPIDeviceManager.cpp b/lib/Backends/NNPI/NNPIDeviceManager.cpp index a3915184f9..995fec18fd 100644 --- a/lib/Backends/NNPI/NNPIDeviceManager.cpp +++ b/lib/Backends/NNPI/NNPIDeviceManager.cpp @@ -119,9 +119,6 @@ Error NNPIDeviceManager::init() { LOG_NNPI_INF_IF_ERROR_RETURN_LLVMERROR( nnpiDeviceContextCreate(adapter_, deviceId_, &device_), "Failed to create NNPI Device"); - if (deviceOptions_->enabledDeviceTracing) { - deviceTracing_ = NNPIDeviceTracing::getForDevice(deviceId_); - } NNPIDeviceInfo deviceInfo; LOG_NNPI_INF_IF_ERROR_RETURN_LLVMERROR( nnpiDeviceGetInfo(deviceId_, &deviceInfo), @@ -187,8 +184,8 @@ void NNPIDeviceManager::addNetwork(const Module *module, functions_.emplace(func.first, func.second); usedMemoryBytes_ += functionCost_; // TODO:: static moduleSize. auto err = inferenceEnvs_[func.first].init( - adapter_, device_, deviceTracing_, func.second, &staticPlaceholders_, - deviceOptions_, func.first, deviceId_); + adapter_, device_, func.second, &staticPlaceholders_, deviceOptions_, + func.first, deviceId_); if (err) { functions_.erase(func.first); lock.unlock(); @@ -278,7 +275,13 @@ uint64_t NNPIDeviceManager::getAvailableMemory() const { LOG_NNPI_INF_IF_ERROR(res, "Failed to read available memory from device.") return 0; } - return static_cast(devStatus.availableUnprotectedMemory) * KB; + const auto availableMem = + static_cast(devStatus.availableUnprotectedMemory) * KB; + if (availableMem == 0) { + LOG(WARNING) << "NNPI Device " << deviceId_ + << " available memory: " << availableMem; + } + return availableMem; } auto freeMemory = getMaximumMemory(); for (const auto &p : functions_) { @@ -309,8 +312,9 @@ void NNPIDeviceManager::transferStaticPlaceholderToDevice( }; Error NNPIDeviceManager::startDeviceTrace(TraceContext *traceContext) { - if (!NNPIDeviceTracing::getForDevice(deviceId_)->start(traceContext, - device_)) { + if 
(!NNPIDeviceTracing::getForDevice(deviceId_)->start( + traceContext, device_, true /* Software traces are always enabled. */, + deviceOptions_->hardwareTraces)) { return MAKE_ERR("Failed to start NNPI device trace."); } return Error::success(); diff --git a/lib/Backends/NNPI/NNPIDeviceManager.h b/lib/Backends/NNPI/NNPIDeviceManager.h index 2ad2a717eb..26fbef6b4a 100644 --- a/lib/Backends/NNPI/NNPIDeviceManager.h +++ b/lib/Backends/NNPI/NNPIDeviceManager.h @@ -68,8 +68,6 @@ class NNPIDeviceManager : public DeviceManager { NNPIDeviceContext device_; /// Lock to synchronize function adding/removing to/from the device manager. std::mutex functionMapMutex_; - /// Device Tracing control. - std::shared_ptr deviceTracing_; /// Static placeholders known by the device manager (the device manager /// doesn't own a ref on static resources, only networks added to the device /// manager). diff --git a/lib/Backends/NNPI/NNPIMLTraceWrapper.cpp b/lib/Backends/NNPI/NNPIMLTraceWrapper.cpp index 6a57d36dee..c3398d533d 100644 --- a/lib/Backends/NNPI/NNPIMLTraceWrapper.cpp +++ b/lib/Backends/NNPI/NNPIMLTraceWrapper.cpp @@ -15,6 +15,8 @@ #include "NNPIMLTraceWrapper.h" #include "DebugMacros.h" +#include "nnpi_ice_caps_hwtrace.h" +#include "nnpi_ice_caps_swtrace.h" #include "nnpi_inference.h" #include #include @@ -26,8 +28,7 @@ #include #include -#define MAX_TRACE_BUFFER_SIZE (1024 * 1024 * 5) -#define TRACE_READ_BUFFER_SIZE (1024 * 10) +#define MAX_TRACE_BUFFER_SIZE (1024 * 1024 * 100) static inline uint64_t secondsToMicroseconds(double seconds) { return (uint64_t)(seconds * 1e6f); @@ -43,166 +44,30 @@ static uint64_t inline getNow() { .count(); } -enum NNPITraceColumnIndex { - NNPI_TRACE_PID_IDX = 0, - NNPI_TRACE_CPU_IDX = 1, - NNPI_TRACE_FLAG_IDX = 2, - NNPI_TRACE_TIMESTAMP_IDX = 3, - NNPI_TRACE_FUNCTION_IDX = 4, - NNPI_TRACE_DETAILS_IDX = 5 -}; - -class NNPITraceParser { -public: - void parseLine(std::string line, NNPITraceEntry &entry) { - size_t idx = 0; - std::istringstream 
linestream(line); - do { - std::string part; - linestream >> part; - - switch (idx) { - case NNPI_TRACE_PID_IDX: { - entry.processID = getPID(part); - break; - } - case NNPI_TRACE_CPU_IDX: { - entry.cpuID = getCPUID(part); - break; - } - case NNPI_TRACE_FLAG_IDX: { - getFlags(part, entry.flags_); - break; - } - case NNPI_TRACE_TIMESTAMP_IDX: { - entry.deviceUpTime = getOriginTime(part); - entry.hostTime = entry.deviceUpTime; - break; - } - case NNPI_TRACE_FUNCTION_IDX: { - entry.traceType = getType(part); - break; - } - case NNPI_TRACE_DETAILS_IDX: { - // NNPI_TRACE_MARK lines (identified at NNPI_TRACE_FUNCTION_IDX column) - // has a sub level function type. - if (entry.traceType == NNPI_TRACE_MARK && - part[part.size() - 1] == ':') { - entry.traceType = getType(part); - break; - } - // Not NNPI_TRACE_MARK: consider as params. - } - default: // Params. - { - addParam(part, entry); - } - } - idx++; - } while (linestream); - } - -protected: - uint32_t getPID(std::string part) { - std::istringstream partSplitStream(part); - std::string pid; - while (std::getline(partSplitStream, pid, '-')) - ; - return std::stoi(pid); - } - - uint32_t getCPUID(std::string part) { - std::string cpuStr = part.substr(1, part.size() - 2); - return std::stoi(cpuStr); - } - - uint64_t getOriginTime(std::string part) { - double dNumber = std::stod(part.substr(0, part.size() - 1)); - return secondsToMicroseconds(dNumber); - } - - void getFlags(std::string part, char *flags) { - if (part.size() != 4) { - return; - } - part.copy(flags, 4); - } - - NNPITraceType getType(std::string part) { - if (part == "dma:") { - return NNPI_TRACE_DMA; - } else if (part == "copy:") { - return NNPI_TRACE_COPY; - } else if (part == "cmdlist:") { - return NNPI_TRACE_CMDLIST; - } else if (part == "icedrvExecuteNetwork:") { - return NNPI_TRACE_NETEXEC; - } else if (part == "runtime-subgraph:") { - return NNPI_TRACE_SUBGRAPH; - } else if (part == "infreq:") { - return NNPI_TRACE_INFER; - } else if (part == 
"clock_sync:") { - return NNPI_TRACE_CLOCK_SYNC; - } else if (part == "tracing_mark_write:") { - return NNPI_TRACE_MARK; - } else if (part == "vtune_time_sync:") { - return NNPI_TARCE_TIME_SYNC; - } else if (part == "runtime-infer-request:") { - return NNPI_TRACE_RUNTIME_INFER; - } else if (part == "icedrvScheduleJob:") { - return NNPI_TRACE_ICED_SCHED_JOB; - } else if (part == "icedrvCreateNetwork:") { - return NNPI_TARCE_ICED_CREAT_NET; - } else if (part == "icedrvNetworkResource:") { - return NNPI_TARCE_ICED_NET_RES; - } else if (part == "icedrvEventGeneration:") { - return NNPI_TARCE_ICED_NET_GEN; - } else if (part == "user_data:") { - return NNPI_TARCE_USER_DATA; - } - return NNPI_TRACE_OTHER; - } - - bool addParam(std::string part, NNPITraceEntry &entry) { - std::string name; - std::string value; - std::istringstream partSplitStream(part); - std::getline(partSplitStream, name, '='); - std::getline(partSplitStream, value, '='); - - while (value[value.size() - 1] == ',') { - value = value.substr(0, value.size() - 2); - } - entry.params[name] = value; - return true; - } -}; - -#define NNPI_SOFTWARE_EVENTS \ - "cmdlist,copy,cpylist_create,icedrvCreateContext,icedrvCreateNetwork," \ - "icedrvDestroyContext,icedrvDestroyNetwork,icedrvEventGeneration," \ - "icedrvExecuteNetwork,icedrvNetworkResource,icedrvScheduleJob,inf_net_" \ - "subres,infreq,runtime_sw_events.runtime.infer,runtime_sw_events.runtime." 
\ - "subgraph,user_data" +static eIceCapsSwTraceEvent swEventTypes[] = { + eIceCapsSwTraceEvent::ICE_CAPS_SW_EVENT_CMDLIST, + eIceCapsSwTraceEvent::ICE_CAPS_SW_EVENT_COPY, + eIceCapsSwTraceEvent::ICE_CAPS_SW_EVENT_CPYLIST_CREATE, + eIceCapsSwTraceEvent::ICE_CAPS_SW_EVENT_ICE_DRV, + eIceCapsSwTraceEvent::ICE_CAPS_SW_EVENT_INFR_SUBRES, + eIceCapsSwTraceEvent::ICE_CAPS_SW_EVENT_INFR_CREATE, + eIceCapsSwTraceEvent::ICE_CAPS_SW_EVENT_INFR_REQ, + eIceCapsSwTraceEvent::ICE_CAPS_SW_EVENT_RUNTIME, + eIceCapsSwTraceEvent::ICE_CAPS_SW_EVENT_USER_DATA}; NNPITraceContext::NNPITraceContext(unsigned devID) - : traceCtx_(0), devID_(devID), devIDSet_(false), - events_(NNPI_SOFTWARE_EVENTS) {} + : capsSession_(0), devID_(devID), devIDSet_(false) {} NNPITraceContext::~NNPITraceContext() { destroyInternalContext(); } -bool NNPITraceContext::startCapture(NNPIDeviceContext deviceContext) { - if (!createInternalContext()) { +bool NNPITraceContext::startCapture(NNPIDeviceContext deviceContext, + bool swTracess, bool hwTraces) { + if (!createInternalContext(swTracess, hwTraces)) { LOG(WARNING) << "nnpi_trace: Failed to create trace device context."; return false; } - nnpimlTraceOptions traceOptions; - std::memset(&traceOptions, 0, sizeof(nnpimlTraceOptions)); - traceOptions.max_bytes = MAX_TRACE_BUFFER_SIZE; - traceOptions.max_bytes_valid = true; - nnpimlStatus mlStatus = - nnpimlTraceStart(traceCtx_, devID_, &traceOptions, events_.c_str()); + nnpimlStatus mlStatus = nnpiIceCapsStart(capsSession_); if (mlStatus != NNPIML_SUCCESS) { LOG(WARNING) << "nnpi_trace: Failed to start trace, err=" << mlStatus; return false; @@ -215,88 +80,141 @@ bool NNPITraceContext::startCapture(NNPIDeviceContext deviceContext) { } bool NNPITraceContext::stopCapture(NNPIDeviceContext deviceContext) const { - uint32_t outBytes, discardEvents; LOG_NNPI_INF_IF_ERROR( nnpiDeviceContextTraceUserData(deviceContext, "EN", getNow()), "Failed to inject trace timestamp - device trace may not be " "synchronized"); - 
nnpimlStatus mlStatus = - nnpimlTraceStop(traceCtx_, devID_, &outBytes, &discardEvents); + nnpimlStatus mlStatus = nnpiIceCapsStop(capsSession_); if (mlStatus != NNPIML_SUCCESS) { return false; } return true; } -bool NNPITraceContext::readTraceOutput(std::stringstream &inputStream) { - char readData[TRACE_READ_BUFFER_SIZE + 1]; - uint32_t size = TRACE_READ_BUFFER_SIZE; - uint32_t actualSize = size; - // Read trace bytes into stream. - uint32_t offset = 0; - while (actualSize >= size) { - nnpimlStatus mlStatus = - nnpimlTraceRead(traceCtx_, devID_, offset, size, readData, &actualSize); - inputStream.write(readData, actualSize); - offset += actualSize; - if (mlStatus != NNPIML_SUCCESS) { - // Failed to read trace. - return false; - } +bool NNPITraceContext::readTraceOutput() { + nnpimlStatus mlStatus = nnpiIceCapsRead(capsSession_); + if (mlStatus != NNPIML_SUCCESS) { + // Failed to read trace. + LOG(WARNING) << "nnpi_trace: Failed to read traces from device, err=" + << mlStatus; + return false; + } + mlStatus = nnpiIceCapsParse(capsSession_); + if (mlStatus != NNPIML_SUCCESS) { + // Failed to read trace. + LOG(WARNING) << "nnpi_trace: Failed to parse traces on device, err=" + << mlStatus; + return false; } - return true; -} - -bool NNPITraceContext::load() { - entries_.clear(); - std::stringstream inputStream; - if (!readTraceOutput(inputStream)) { - destroyInternalContext(); + mlStatus = nnpiIceCapsProcess(capsSession_); + if (mlStatus != NNPIML_SUCCESS) { + // Failed to read trace. + LOG(WARNING) << "nnpi_trace: Failed to process traces on device, err=" + << mlStatus; + return false; + } + size_t entryCount = 0; + mlStatus = nnpiIceCapsGetEntriesCount(capsSession_, &entryCount); + if (mlStatus != NNPIML_SUCCESS) { + // Failed to read trace. + LOG(WARNING) << "nnpi_trace: Failed to read traces count, err=" << mlStatus; return false; } - destroyInternalContext(); - // Handle stream. 
- std::string line; - NNPITraceParser parser; bool started = false; uint64_t glowStart = 0; uint64_t glowEnd = 0; - uint64_t nnpiStart = 0; - uint64_t nnpiEnd = 0; + uint64_t deviceStart = 0; + uint64_t deviceEnd = 0; + uint64_t hostStart = 0; + uint64_t hostEnd = 0; + for (size_t i = 0; i < entryCount; i++) { + IceCapsEntry entry; + NNPITraceEntry traceEntry; + std::stringstream entryStrRep; + mlStatus = nnpiIceCapsGetEntry(capsSession_, i, &entry); + if (mlStatus != NNPIML_SUCCESS) { + // Failed to read trace. + LOG(WARNING) << "nnpi_trace: Failed to read trace entries, err=" + << mlStatus; + return false; + } - while (std::getline(inputStream, line)) { - if (line.find("#", 0) == 0) { - // Skip comment. - continue; + // Set parameters. + traceEntry.params["name"] = entry.event_name; + traceEntry.params["state"] = entry.state; + traceEntry.hostTime = entry.timestamp; + traceEntry.engineTime = entry.engine_timestamp; + traceEntry.params["engine"] = + ((entry.engine == eIceCapsEngine::ICE_CAPS_SW_TRACE) + ? std::string("SW") + : std::string("HW")); + traceEntry.params["event_key"] = std::to_string(entry.event_key); + traceEntry.params["device_id"] = std::to_string(entry.device_id); + traceEntry.params["context_id"] = std::to_string(entry.context_id); + traceEntry.params["network_id"] = std::to_string(entry.network_id); + traceEntry.params["infer_id"] = std::to_string(entry.infer_id); + traceEntry.params["ice_id"] = std::to_string(entry.ice_id); + traceEntry.params["core_id"] = std::to_string(entry.core_id); + traceEntry.params["network_name"] = entry.network_name; + traceEntry.params["kernel_name"] = entry.kernel_name; + traceEntry.params["opcode"] = entry.opcode; + + std::stringstream params; + for (size_t p = 0; p < entry.params_count; p++) { + IceCapsParam param; + mlStatus = nnpiIceCapsGetEntryParam(capsSession_, i, p, ¶m); + if (mlStatus != NNPIML_SUCCESS) { + // Failed to read params. 
+ LOG(WARNING) << "nnpi_trace: Failed to read trace entry params, err=" + << mlStatus; + break; + } + traceEntry.params[param.name] = param.value; + params << param.name << ":" << param.value << ", "; } - NNPITraceEntry entry; - parser.parseLine(line, entry); - if (entry.traceType == NNPI_TARCE_USER_DATA) { - if (!started && entry.params["key"] == "BG") { - auto p = entry.params["user_data"]; - glowStart = std::stol(entry.params["user_data"]); - nnpiStart = entry.deviceUpTime; + + if (entry.state == "created" || entry.state == "queued" || + entry.state == "req" || entry.state == "add") { + entry.state = "q"; + } else if (entry.state == "executed" || entry.state == "cbs" || + entry.state == "start") { + entry.state = "s"; + } else if (entry.state == "completed" || entry.state == "cbc") { + entry.state = "c"; + } + traceEntry.params["state"] = entry.state; + entries_.push_back(traceEntry); + if (entry.event_name == "user_data" && + traceEntry.params.count("user_data") > 0 && + traceEntry.params.count("key") > 0) { + if (!started && traceEntry.params["key"] == "BG") { + glowStart = std::stol(traceEntry.params["user_data"]); + deviceStart = entry.engine_timestamp; + hostStart = entry.timestamp; started = true; - } else if (entry.params["key"] == "EN") { - auto p = entry.params["user_data"]; - glowEnd = std::stol(entry.params["user_data"]); - nnpiEnd = entry.deviceUpTime; - started = false; + } else if (traceEntry.params["key"] == "EN") { + glowEnd = std::stol(traceEntry.params["user_data"]); + deviceEnd = entry.engine_timestamp; + hostEnd = entry.timestamp; } } - if (started) { - entries_.push_back(entry); - } } - if (glowStart > 0 && glowEnd > 0 && nnpiStart > 0 && nnpiEnd > 0) { - // Calculate host time function. 
- double m = (double)(glowEnd - glowStart) / (double)(nnpiEnd - nnpiStart); - int64_t C = glowStart - m * nnpiStart; + // Sync clocks: + if (glowStart > 0 && glowEnd > 0 && hostStart > 0 && hostEnd > 0 && + deviceStart > 0 && deviceEnd > 0) { + // Calculate host time function for host time. + double hostM = + (double)(glowEnd - glowStart) / (double)(hostEnd - hostStart); + double deviceM = + (double)(glowEnd - glowStart) / (double)(deviceEnd - deviceStart); + int64_t hostC = glowStart - hostM * hostStart; + int64_t deviceC = glowStart - deviceM * deviceStart; // Update host time. for (NNPITraceEntry &entry : entries_) { - entry.hostTime = entry.deviceUpTime * m + C; + entry.hostTime = entry.hostTime * hostM + hostC; + entry.engineTime = entry.engineTime * deviceM + deviceC; } } else { LOG(WARNING) << "Failed to synchronize glow and nnpi device traces."; @@ -304,38 +222,100 @@ bool NNPITraceContext::load() { return true; } +bool NNPITraceContext::load() { + entries_.clear(); + std::stringstream inputStream; + + if (!readTraceOutput()) { + destroyInternalContext(); + return false; + } + destroyInternalContext(); + return true; +} + bool NNPITraceContext::destroyInternalContext() { - if (traceCtx_ == 0) { + if (capsSession_ == 0) { return false; } - nnpimlStatus mlStatus = nnpimlDestroyTraceContext(traceCtx_); - traceCtx_ = 0; + nnpimlStatus mlStatus = nnpiIceCapsCloseSession(capsSession_); + capsSession_ = 0; if (mlStatus != NNPIML_SUCCESS) { - LOG(WARNING) << "nnpi_trace: Failed to stop device trace, err=" << mlStatus; - traceCtx_ = 0; + LOG(WARNING) << "nnpi_trace: Failed to stop device trace session, err=" + << mlStatus; + capsSession_ = 0; return false; } return true; } -bool NNPITraceContext::createInternalContext() { - if (traceCtx_ != 0) { +bool NNPITraceContext::createInternalContext(bool swTraces, bool hwTraces) { + if (capsSession_ != 0) { return false; } - devMask_ = 1UL << devID_; - nnpimlStatus mlStatus = - nnpimlCreateTraceContext(devMask_, &traceCtx_, 
&devMask_); + nnpimlStatus mlStatus = nnpiIceCapsOpenSession(&capsSession_); if (mlStatus != NNPIML_SUCCESS) { - LOG(WARNING) << "nnpi_trace: Failed to start device trace, err=" - << mlStatus; - traceCtx_ = 0; + LOG(WARNING) << "nnpi_trace: Failed to trace session, err=" << mlStatus; + capsSession_ = 0; return false; } - if (!(1UL << devID_ & devMask_)) { - destroyInternalContext(); - LOG(WARNING) << "nnpi_trace: Cloud not open trace for device " << devID_; - return false; + devMask_ = 1UL << devID_; + if (swTraces) { + size_t swEventsCount = sizeof(swEventTypes) / sizeof(swEventTypes[0]); + size_t idx = 0; + IceCapsSwTraceConfig traceConfigs[1 + swEventsCount]; + traceConfigs[idx].traceOptions.config_type = + eIceCapsSwTraceConfigType::ICE_CAPS_SWTRACE_OPTIONS; + traceConfigs[idx].traceOptions.device_mask = devMask_; + traceConfigs[idx].traceOptions.max_bytes = MAX_TRACE_BUFFER_SIZE; + idx++; + for (size_t i = 0; i < swEventsCount; i++) { + traceConfigs[idx].traceEvent.config_type = + eIceCapsSwTraceConfigType::ICE_CAPS_SWTRACE_EVENT; + traceConfigs[idx].traceEvent.event = swEventTypes[i]; + idx++; + } + + IceCapsConfig iceSWCapsConfig; + iceSWCapsConfig.engine = eIceCapsEngine::ICE_CAPS_SW_TRACE; + iceSWCapsConfig.size = sizeof(traceConfigs); + iceSWCapsConfig.buffer = traceConfigs; + mlStatus = nnpiIceCapsPrepare(capsSession_, &iceSWCapsConfig); + if (mlStatus != NNPIML_SUCCESS) { + LOG(WARNING) + << "nnpi_trace: Failed to set device Software trace options, err=" + << mlStatus; + destroyInternalContext(); + return false; + } + } + if (hwTraces) { + IceCapsHwTraceConfig traceConfigs[2]; + traceConfigs[0].traceOptions.config_type = + eIceCapsHwTraceConfigType::ICE_CAPS_HWTRACE_OPTIONS; + traceConfigs[0].traceOptions.device_mask = devMask_; + traceConfigs[0].traceOptions.max_trace_size = MAX_TRACE_BUFFER_SIZE; + traceConfigs[1].iceFilter.config_type = + eIceCapsHwTraceConfigType::ICE_CAPS_HWTRACE_FILTER; + traceConfigs[1].iceFilter.ice_mask = 0xFFF; // All ICEs. 
+ traceConfigs[1].iceFilter.filter_type = + eIceCapsHwTraceFilter::ICE_CAPS_HWTRACE_CAPTURE_ALL; + + IceCapsConfig iceHWCapsConfig; + iceHWCapsConfig.engine = eIceCapsEngine::ICE_CAPS_HW_TRACE; + iceHWCapsConfig.size = sizeof(traceConfigs); + iceHWCapsConfig.buffer = traceConfigs; + + mlStatus = nnpiIceCapsPrepare(capsSession_, &iceHWCapsConfig); + if (mlStatus != NNPIML_SUCCESS) { + LOG(WARNING) + << "nnpi_trace: Failed to set device Hardware trace options, err=" + << mlStatus; + destroyInternalContext(); + return false; + } } + return true; } \ No newline at end of file diff --git a/lib/Backends/NNPI/NNPIMLTraceWrapper.h b/lib/Backends/NNPI/NNPIMLTraceWrapper.h index 3d113c5304..42509163b2 100644 --- a/lib/Backends/NNPI/NNPIMLTraceWrapper.h +++ b/lib/Backends/NNPI/NNPIMLTraceWrapper.h @@ -17,37 +17,13 @@ #define NNPI_NNPITRACING_ML_WRAPPER_H #include +#include #include -#include #include -enum NNPITraceType { - NNPI_TRACE_UNKNOWN = 0x0000, - NNPI_TRACE_DMA = 0x0001, - NNPI_TRACE_INFER = 0x0002, - NNPI_TRACE_COPY = 0x0004, - NNPI_TRACE_MARK = 0x0008, - NNPI_TRACE_CLOCK_SYNC = 0x0010, - NNPI_TRACE_CMDLIST = 0x0020, - NNPI_TRACE_NETEXEC = 0x0040, - NNPI_TRACE_SUBGRAPH = 0x0080, - NNPI_TARCE_TIME_SYNC = 0x0100, - NNPI_TRACE_RUNTIME_INFER = 0x0200, - NNPI_TRACE_ICED_SCHED_JOB = 0x0400, - NNPI_TARCE_ICED_CREAT_NET = 0x0800, - NNPI_TARCE_ICED_NET_RES = 0x1000, - NNPI_TARCE_ICED_NET_GEN = 0x1001, - NNPI_TARCE_USER_DATA = 0x4000, - NNPI_TRACE_OTHER = 0x8000 -}; - struct NNPITraceEntry { - uint64_t deviceUpTime{0}; + uint64_t engineTime{0}; uint64_t hostTime{0}; - NNPITraceType traceType{NNPI_TRACE_UNKNOWN}; - uint32_t processID{0}; - uint32_t cpuID{0}; - char flags_[4]; std::map params; }; @@ -57,7 +33,8 @@ class NNPITraceContext { NNPITraceContext(unsigned devID); virtual ~NNPITraceContext(); /// Start capturing traces from the HW device. 
- bool startCapture(NNPIDeviceContext deviceContext); + bool startCapture(NNPIDeviceContext deviceContext, bool swTraces, + bool hwTraces); /// Start capturing. bool stopCapture(NNPIDeviceContext deviceContext) const; /// Load traces (valid only after stopCapture()). @@ -76,14 +53,13 @@ class NNPITraceContext { private: bool destroyInternalContext(); - bool createInternalContext(); - bool readTraceOutput(std::stringstream &inputStream); + bool createInternalContext(bool swTraces, bool hwTraces); + bool readTraceOutput(); - nnpimlTraceContext traceCtx_{0}; + IceCaps_t capsSession_{0}; uint64_t devMask_{0}; unsigned devID_{0}; bool devIDSet_{false}; - std::string events_; std::vector entries_; }; diff --git a/lib/Backends/NNPI/NNPIOptions.cpp b/lib/Backends/NNPI/NNPIOptions.cpp index ebf4fb4da9..0d7e690d0f 100644 --- a/lib/Backends/NNPI/NNPIOptions.cpp +++ b/lib/Backends/NNPI/NNPIOptions.cpp @@ -79,6 +79,10 @@ template <> unsigned NNPIOptions::getStringAsType(std::string sVal) { return 0; } +template <> float NNPIOptions::getStringAsType(std::string sVal) { + return std::strtof(sVal.c_str(), nullptr); +} + std::string NNPIOptions::dumpStatus() { std::stringstream desc; desc << "\nNNPI " << getOptionsName().data() << " variables\n"; diff --git a/lib/Backends/NNPI/NNPIOptions.h b/lib/Backends/NNPI/NNPIOptions.h index d9c30ddaba..a1c7206e53 100644 --- a/lib/Backends/NNPI/NNPIOptions.h +++ b/lib/Backends/NNPI/NNPIOptions.h @@ -80,15 +80,17 @@ class NNPIOptions { llvm::StringMap supportedOptions_; }; -/// Explicit forward decleration of template type. +/// Explicit forward declaration of template type. template <> bool NNPIOptions::getStringAsType(std::string sVal); -/// Explicit forward decleration of template type. +/// Explicit forward declaration of template type. template <> std::string NNPIOptions::getStringAsType(std::string sVal); -/// Explicit forward decleration of template type. +/// Explicit forward declaration of template type. 
template <> int NNPIOptions::getStringAsType(std::string sVal); -/// Explicit forward decleration of template type. +/// Explicit forward declaration of template type. template <> unsigned NNPIOptions::getStringAsType(std::string sVal); +/// Explicit forward declaration of template type. +template <> float NNPIOptions::getStringAsType(std::string sVal); #define DECLARE_NNPI_OPTION(VAR_NAME, VAR_TYPE, OPT_NAME, OPT_DESC, OPT_ENV, \ OPT_DEFAULT) \ @@ -257,6 +259,52 @@ class NNPICompilationOptions : public NNPIOptions { "Override the amount of worker threads allocated for the " "network on the device.", "NNPI_NUM_WORKERS", "2"); + /// Power & Performance hints. See more details at: + /// https://github.com/IntelAI/nnpi-sw/blob/master/include/nnpi_inference_types.h + DECLARE_NNPI_OPTION(ringPrio, float, "RingPrio", + "Set the ring frequency priority.", "NNPI_RING_PRIO", + "0.f"); + DECLARE_NNPI_OPTION(iceBOPrio0, float, "IceBOPrio0", + "Set ICE-BO 0 frequency priority.", "NNPI_ICEBO_PRIO0", + "0.f"); + DECLARE_NNPI_OPTION(iceBOPrio1, float, "IceBOPrio1", + "Set ICE-BO 1 frequency priority.", "NNPI_ICEBO_PRIO1", + "0.f"); + DECLARE_NNPI_OPTION(iceBOPrio2, float, "IceBOPrio2", + "Set ICE-BO 2 frequency priority.", "NNPI_ICEBO_PRIO2", + "0.f"); + DECLARE_NNPI_OPTION(iceBOPrio3, float, "IceBOPrio3", + "Set ICE-BO 3 frequency priority.", "NNPI_ICEBO_PRIO3", + "0.f"); + DECLARE_NNPI_OPTION(iceBOPrio4, float, "IceBOPrio4", + "Set ICE-BO 4 frequency priority.", "NNPI_ICEBO_PRIO4", + "0.f"); + DECLARE_NNPI_OPTION(iceBOPrio5, float, "IceBOPrio5", + "Set ICE-BO 5 frequency priority.", "NNPI_ICEBO_PRIO5", + "0.f"); + DECLARE_NNPI_OPTION(iaPrio0, float, "IAPrio0", "Set IA 0 frequency priority.", + "NNPI_IA_PRIO0", "0.f"); + DECLARE_NNPI_OPTION(iaPrio1, float, "IAPrio1", "Set IA 1 frequency priority.", + "NNPI_IA_PRIO1", "0.f"); + DECLARE_NNPI_OPTION(ddrBandwidth, float, "DDRBandwidth", + "Set an estimated DDR bandwidth in GB/s.", "NNPI_DDR_BW", + "0.f"); + /// Disable SLS on IA. 
+ DECLARE_NNPI_OPTION(disableSLSOnIA, bool, "DisableSLSOnIA", + "Disable SLS exectuion on IA (SLS will execute on ICE).", + "NNPI_DISABLE_SLS_ON_IA", "1"); + /// Enable lightweight compilation. + DECLARE_NNPI_OPTION(lightCompilation, bool, "LightCompilation", + "Enable light compilation (only for gathering metadata).", + "NNPI_LIGHT_COMPILATION", "0"); + /// Dump compiler DOT files. + DECLARE_NNPI_OPTION(dumpDotFiles, bool, "DumpDotFiles", + "Dump Dot files of the network during compilation.", + "NNPI_DUMP_DOT", "0"); + /// Dump compilation info. + DECLARE_NNPI_OPTION(dumpCompilationInfo, bool, "dumpCompilationInfo", + "Dump the compilation info in text form.", + "NNPI_DUMP_COMP_INFO", "0"); NNPICompilationOptions(const BackendSpecificOptions ¶meters) { INIT_NNPI_OPTIONS(useIceT, parameters); @@ -274,6 +322,20 @@ class NNPICompilationOptions : public NNPIOptions { INIT_NNPI_OPTIONS(disableConstFolding, parameters); INIT_NNPI_OPTIONS(numWorkers, parameters); setLogLevel(this->compilationLogLevel); + INIT_NNPI_OPTIONS(ringPrio, parameters); + INIT_NNPI_OPTIONS(iceBOPrio0, parameters); + INIT_NNPI_OPTIONS(iceBOPrio1, parameters); + INIT_NNPI_OPTIONS(iceBOPrio2, parameters); + INIT_NNPI_OPTIONS(iceBOPrio3, parameters); + INIT_NNPI_OPTIONS(iceBOPrio4, parameters); + INIT_NNPI_OPTIONS(iceBOPrio5, parameters); + INIT_NNPI_OPTIONS(iaPrio0, parameters); + INIT_NNPI_OPTIONS(iaPrio1, parameters); + INIT_NNPI_OPTIONS(ddrBandwidth, parameters); + INIT_NNPI_OPTIONS(disableSLSOnIA, parameters); + INIT_NNPI_OPTIONS(lightCompilation, parameters); + INIT_NNPI_OPTIONS(dumpDotFiles, parameters); + INIT_NNPI_OPTIONS(dumpCompilationInfo, parameters); } virtual llvm::StringRef getOptionsName() const override { @@ -315,12 +377,11 @@ class NNPIDeviceOptions : public NNPIOptions { DECLARE_NNPI_OPTION(deviceId, int, "DeviceID", "Override the target device ID used to run (0,1,...).", "NNPI_DEVICE_ID", "-1"); - /// Setting this variable will enabled device tracing (host2device, - /// 
device2host copy infer etc.). - DECLARE_NNPI_OPTION( - enabledDeviceTracing, bool, "DeviceTracing", - "Enabled device tracing (host2device, device2host copy infer etc.).", - "NNPI_DEVICE_TRACING", "0"); + /// Enable Hardware Trace. + DECLARE_NNPI_OPTION(hardwareTraces, bool, "hardwareTraces", + "Enable hardware traces when device traces are started " + "(default is disabled).", + "NNPI_HW_TRACES", "0"); /// Override the max NNPI device memory. DECLARE_NNPI_OPTION( deviceMemory, unsigned, "DeviceMemory", @@ -362,7 +423,7 @@ class NNPIDeviceOptions : public NNPIOptions { INIT_NNPI_OPTIONS(inferOnDevice, parameters); INIT_NNPI_OPTIONS(showVars, parameters); INIT_NNPI_OPTIONS(deviceId, parameters); - INIT_NNPI_OPTIONS(enabledDeviceTracing, parameters); + INIT_NNPI_OPTIONS(hardwareTraces, parameters); INIT_NNPI_OPTIONS(deviceMemory, parameters); INIT_NNPI_OPTIONS(enabledCommandLists, parameters); INIT_NNPI_OPTIONS(dumpIOtoFiles, parameters); diff --git a/lib/Backends/NNPI/NNPITracing.cpp b/lib/Backends/NNPI/NNPITracing.cpp index 40281fd2f0..9d07e8e25c 100644 --- a/lib/Backends/NNPI/NNPITracing.cpp +++ b/lib/Backends/NNPI/NNPITracing.cpp @@ -21,148 +21,139 @@ using namespace glow; -NNPIDeviceTracing::NNPIDeviceTracing(unsigned deviceID) { - traceCtx_ = glow::make_unique(deviceID); +std::map NNPIDeviceTracing::activeAffinities_ = {}; + +NNPIDeviceTracing::NNPIDeviceTracing(unsigned deviceId) : deviceId_(deviceId) { + traceCtx_ = glow::make_unique(deviceId_); deviceInfo_ = - std::string("[Device #") + std::to_string(deviceID) + std::string("] "); + std::string("[Device #") + std::to_string(deviceId_) + std::string("] "); } bool NNPIDeviceTracing::start(TraceContext *traceContext, - NNPIDeviceContext deviceContext) { + NNPIDeviceContext deviceContext, bool swTraces, + bool hwTraces) { if (!traceContext || !traceContext->shouldLog(TraceEvent::TraceLevel::OPERATOR)) { return false; } if (started_.test_and_set()) { - ASSERT_WITH_MSG(glowTraceCtx_ != traceContext, - "Trying to 
start tracing for an already started context."); // Trace already started. return false; } - glowTraceCtx_ = traceContext; - if (!traceCtx_->startCapture(deviceContext)) { - LOG(WARNING) << "Failed to start trace capture"; + bool isFirstToStart = NNPIDeviceTracing::isFirstToChangeCaptureStart(true); + if (!traceCtx_->startCapture(deviceContext, swTraces, hwTraces)) { + LOG(WARNING) << "Failed to start trace capture for device " << deviceId_ + << " is first = " << (isFirstToStart); return false; } return true; } std::string NNPIDeviceTracing::getEntryName(NNPITraceEntry &entry) { - std::stringstream name; - name << deviceInfo_; - switch (entry.traceType) { - case NNPI_TRACE_UNKNOWN: - name << "UnknownTrace"; - break; - case NNPI_TRACE_DMA: - name << "DMA"; - break; - case NNPI_TRACE_INFER: - name << "Infer"; - break; - case NNPI_TRACE_COPY: - name << "Copy"; - break; - case NNPI_TRACE_MARK: - name << "MarkTrace"; - break; - case NNPI_TRACE_CLOCK_SYNC: - name << "ClockSync"; - break; - case NNPI_TRACE_CMDLIST: - name << "CommandList"; - break; - case NNPI_TRACE_NETEXEC: - name << "NetExecute"; - break; - case NNPI_TRACE_SUBGRAPH: - name << "SubGraph"; - break; - case NNPI_TRACE_RUNTIME_INFER: - name << "RunTimeInf"; - break; - case NNPI_TRACE_ICED_SCHED_JOB: - name << "DSchedJob"; - break; - case NNPI_TARCE_ICED_CREAT_NET: - name << "DCreateNet"; - break; - case NNPI_TARCE_ICED_NET_RES: - name << "DNetRes"; - break; - case NNPI_TARCE_ICED_NET_GEN: - name << "DNetGen"; - break; - default: - name << "Othertrace"; + std::string entryName = entry.params["name"]; + if (entryName.rfind("icedrv", 0) == 0) { + entryName = entryName.substr(strlen("icedrv")); + } else if (entryName.rfind("runtime-", 0) == 0) { + entryName = entryName.substr(strlen("runtime-")); + } + if (entry.params.count("command") > 0) { + entryName = entry.params["command"]; } + + std::stringstream name; + + name << entryName; if (entry.params.count("isC2H") > 0) { if (entry.params["isC2H"] == "1") { - 
name << "-Card2Host"; + name << " Card2Host"; } else { - name << "-Host2Card"; + name << " Host2Card"; } } auto params = entry.params; - if (entry.params.count("iceId") > 0) { - name << "-ICE_" << entry.params["iceId"]; + if (entry.params.count("ice_id") > 0) { + name << " ICE_" << entry.params["ice_id"]; } - if (entry.params.count("netID") > 0) { - name << "-NET_" << entry.params["netID"]; + if (entry.params.count("network_id") > 0) { + name << " Net " << entry.params["network_id"]; } - if (entry.params.count("reqID") > 0) { - name << "REQ_" << entry.params["reqID"]; + if (entry.params.count("network_name") > 0 && + entry.params["network_name"] != "NA") { + name << " NetName " << entry.params["network_name"]; } - if (entry.params.count("ctxID") > 0) { - name << "-CTX_" << entry.params["ctxID"]; + if (entry.params.count("context_id") > 0) { + name << " CTX 0x" << std::hex << std::stol(entry.params["context_id"]); } if (entry.params.count("subNetId") > 0) { - name << "-SUBNET_" << entry.params["subNetId"]; + name << " Subnet " << entry.params["subNetId"]; } - if (entry.params.count("inferID") > 0) { - name << "-INFR_" << entry.params["inferID"]; + if (entry.params.count("infer_id") > 0) { + name << " InfID " << entry.params["infer_id"]; } if (entry.params.count("subGraphID") > 0) { - name << "-SUBGRAPH_" << entry.params["subGraphID"]; + name << " Subgraph " << entry.params["subGraphID"]; } if (entry.params.count("agent") > 0) { - name << "-AGENT_" << entry.params["agent"]; + name << " Agent " << entry.params["agent"]; } - if (entry.params.count("copyID") > 0) { - name << "-CPID_" << entry.params["copyID"]; + if (entry.params.count("kernel_name") > 0 && + entry.params["kernel_name"] != "NA") { + name << " Krnl " << entry.params["kernel_name"]; } - if (entry.params.count("size") > 0) { - name << "-SIZE_" << entry.params["size"]; + if (entry.params.count("userHandle") > 0) { + name << " 0x" << std::hex << std::stol(entry.params["userHandle"]); } + return name.str(); } 
-bool NNPIDeviceTracing::addTrace(NNPITraceEntry &entry) { +int NNPIDeviceTracing::getAffinityID(NNPITraceEntry &entry, std::string name, + unsigned deviceId, + TraceContext *traceContext) { + // Need to be guarded when mutiple devices are active. + static std::mutex affinityMutext; + std::lock_guard lk(affinityMutext); + + // Start affinity at some high number to avoid collisions. + int affinId = 10000; + std::string iceId = entry.params["ice_id"]; + std::string contextId = entry.params["context_id"]; + std::stringstream affinityNameStuct; + + affinityNameStuct << "Device #" << deviceId << " ICE #" << iceId; + + // Add additional info to title. + if (entry.params["opcode"] != "NA") { + affinityNameStuct << " opcode " << entry.params["opcode"]; + } + // Use the op name. + affinityNameStuct << " " << name.substr(0, name.find(' ')); + if (entry.params["state"] == "q") { + affinityNameStuct << " Queue"; + } + + if (activeAffinities_.count(affinityNameStuct.str()) <= 0) { + affinId += activeAffinities_.size(); + activeAffinities_[affinityNameStuct.str()] = affinId; + traceContext->setThreadName(affinId, affinityNameStuct.str()); + } else { + affinId = activeAffinities_[affinityNameStuct.str()]; + } + + return affinId; +} + +bool NNPIDeviceTracing::addTrace( + NNPITraceEntry &entry, std::map &inflight, + TraceContext *traceContext) { + std::stringstream entryLog; + for (auto const ¶mEntry : entry.params) { + entryLog << paramEntry.first << ":" << paramEntry.second << " ,"; + } // Filter traces. 
- switch (entry.traceType) { - case NNPI_TRACE_INFER: - case NNPI_TRACE_COPY: - case NNPI_TRACE_CMDLIST: - case NNPI_TRACE_NETEXEC: - case NNPI_TRACE_SUBGRAPH: - case NNPI_TRACE_RUNTIME_INFER: - case NNPI_TRACE_ICED_SCHED_JOB: - case NNPI_TARCE_ICED_CREAT_NET: - case NNPI_TARCE_ICED_NET_RES: - case NNPI_TARCE_ICED_NET_GEN: - break; - case NNPI_TRACE_UNKNOWN: - case NNPI_TRACE_DMA: - case NNPI_TRACE_MARK: - case NNPI_TRACE_CLOCK_SYNC: - case NNPI_TARCE_TIME_SYNC: - case NNPI_TARCE_USER_DATA: - return false; - default: - LOG(WARNING) << "Trying to add unsupported trace type:" << entry.traceType; + if (entry.params["state"] == "NA") { return false; } - std::string name = getEntryName(entry); if (entry.params.count("state") <= 0) { @@ -170,29 +161,39 @@ bool NNPIDeviceTracing::addTrace(NNPITraceEntry &entry) { } std::string state = entry.params["state"]; - if (state == "q" || state == "queued") { + // Calculate affinity - use the trace thread id to make sections in the + // representation. + int affinId = + NNPIDeviceTracing::getAffinityID(entry, name, deviceId_, traceContext); + if (affinId <= 0) { + LOG(WARNING) << "Found unexpected affinity ID " << affinId << " for " + << name; + } + // Add events. 
+ if (state == "q") { name += "-Queue"; - glowTraceCtx_->logTraceEvent(name, TraceLevel::OPERATOR, - TraceEvent::InstantType, entry.hostTime, {}); - } else if (state == "s" || state == "cbs" || state == "executed") { - glowTraceCtx_->logTraceEvent(name, TraceLevel::OPERATOR, - TraceEvent::BeginType, entry.hostTime, {}); - } else if (state == "c" || state == "cbc" || state == "completed") { - glowTraceCtx_->logTraceEvent(name, TraceLevel::OPERATOR, - TraceEvent::EndType, entry.hostTime, {}); - } else if (state == "cbs") { - glowTraceCtx_->logTraceEvent(name, TraceLevel::OPERATOR, - TraceEvent::BeginType, entry.hostTime, {}); - } else if (state == "cbc") { - glowTraceCtx_->logTraceEvent(name, TraceLevel::OPERATOR, - TraceEvent::EndType, entry.hostTime, {}); - } else if (state == "cbnwc") { - glowTraceCtx_->logTraceEvent(name, TraceLevel::OPERATOR, - TraceEvent::InstantType, entry.hostTime, {}); - } else if (state == "req") { - name += "-Req"; - glowTraceCtx_->logTraceEvent(name, TraceLevel::OPERATOR, - TraceEvent::InstantType, entry.hostTime, {}); + traceContext->logTraceEvent(name, TraceLevel::OPERATOR, + TraceEvent::InstantType, entry.hostTime, + entry.params, affinId); + } else if (state == "s" && inflight.count(name) <= 0) { + inflight[name] = entry; + } else if (state == "c" && inflight.count(name) > 0) { + // Add only complate events. 
+ if (entry.hostTime > inflight[name].hostTime) { + traceContext->logTraceEvent( + name, TraceLevel::OPERATOR, TraceEvent::BeginType, + inflight[name].hostTime, inflight[name].params, affinId); + traceContext->logTraceEvent(name, TraceLevel::OPERATOR, + TraceEvent::EndType, entry.hostTime, + entry.params, affinId); + } else { + LOG(WARNING) << "Fount incomplete trace event " << name; + } + inflight.erase(name); + } else if (state == "po") { + traceContext->logTraceEvent(name, TraceLevel::OPERATOR, + TraceEvent::InstantType, entry.hostTime, + entry.params, affinId); } return true; @@ -200,25 +201,25 @@ bool NNPIDeviceTracing::addTrace(NNPITraceEntry &entry) { bool NNPIDeviceTracing::stopAndUpdate(TraceContext *traceContext, NNPIDeviceContext deviceContext) { - if (glowTraceCtx_ != - nullptr && // For null glowTraceCtx assume global context (per device) - (glowTraceCtx_ != traceContext)) { - // Ignore stop from other contexts. + if (traceContext == nullptr) { + LOG(WARNING) << "Failed to stop trace capture trace context is null."; return false; } + bool isFirstToStop = NNPIDeviceTracing::isFirstToChangeCaptureStart(false); if (!traceCtx_->stopCapture(deviceContext)) { - LOG(WARNING) << "Failed to stop trace capture"; + LOG(WARNING) << "Failed to stop trace capture (first device stop =" + << isFirstToStop; return false; } if (!traceCtx_->load()) { - LOG(WARNING) << "Failed to stop trace capture"; + LOG(WARNING) << "Failed to stop trace capture =" << isFirstToStop; return false; } traceContext->setThreadName("NNPI_Trace"); + std::map inflight; for (auto entry : traceCtx_->getEntries()) { - std::map params = entry.params; - addTrace(entry); + addTrace(entry, inflight, traceContext); } started_.clear(); return true; diff --git a/lib/Backends/NNPI/NNPITracing.h b/lib/Backends/NNPI/NNPITracing.h index 7a1872b05a..2d118680a1 100644 --- a/lib/Backends/NNPI/NNPITracing.h +++ b/lib/Backends/NNPI/NNPITracing.h @@ -44,25 +44,42 @@ class NNPIDeviceTracing { return map[deviceId]; 
} + static bool isFirstToChangeCaptureStart(bool startCapture) { + static bool started = false; + static std::mutex firstDevStartMutex; + std::lock_guard lk(firstDevStartMutex); + if (started != startCapture) { + // First to change state. + started = startCapture; + return true; + } + + return false; + } + /// Dispose of tracing context. virtual ~NNPIDeviceTracing(){}; /// Start recording events. - bool start(TraceContext *traceContext, NNPIDeviceContext deviceContext); + bool start(TraceContext *traceContext, NNPIDeviceContext deviceContext, + bool swTraces, bool hwTraces); /// Stop recording, read and update trace context. bool stopAndUpdate(TraceContext *traceContext, NNPIDeviceContext deviceContext); protected: std::string getEntryName(NNPITraceEntry &entry); - bool addTrace(NNPITraceEntry &entry); + bool addTrace(NNPITraceEntry &entry, + std::map &inflight, + TraceContext *traceContext); + + /// Affinity has to be in a global for all devices. + static int getAffinityID(NNPITraceEntry &entry, std::string name, + unsigned deviceId, TraceContext *traceContext); private: /// Per device tracing control. explicit NNPIDeviceTracing(unsigned deviceId); - /// Glow trace context. Used to identify start/stop and log traces (with - /// runId_). - TraceContext *glowTraceCtx_{nullptr}; std::atomic_flag started_{false}; /// NNPI Trace context. std::unique_ptr traceCtx_; @@ -70,6 +87,9 @@ class NNPIDeviceTracing { unsigned deviceId_{0}; /// Device id string prefix for event names. std::string deviceInfo_; + + /// Trace active affinities. 
+ static std::map activeAffinities_; }; } // namespace glow diff --git a/lib/Backends/NNPI/ONNX/NNPIONNXModelWriter.cpp b/lib/Backends/NNPI/ONNX/NNPIONNXModelWriter.cpp index 0694abb7ea..f135c0ed4e 100644 --- a/lib/Backends/NNPI/ONNX/NNPIONNXModelWriter.cpp +++ b/lib/Backends/NNPI/ONNX/NNPIONNXModelWriter.cpp @@ -18,3 +18,8 @@ Error ONNXModelWriter::writeNNPICustomDSP(glow::NNPICustomDSPNode const *, GraphType &graph) { return MAKE_ERR("Unsupported Op for ONNX"); } + +Error ONNXModelWriter::writeNNPICustomIA(glow::NNPICustomIANode const *, + GraphType &graph) { + return MAKE_ERR("Unsupported Op for ONNX"); +} diff --git a/lib/Backends/NNPI/tests/NNPIGradCheckTest.cpp b/lib/Backends/NNPI/tests/NNPIGradCheckTest.cpp index b2f31b0aee..8ec7425f29 100644 --- a/lib/Backends/NNPI/tests/NNPIGradCheckTest.cpp +++ b/lib/Backends/NNPI/tests/NNPIGradCheckTest.cpp @@ -44,6 +44,7 @@ struct BlacklistInitializer { {"gradientCheckBatchedPairwiseDotProduct/0", TestBlacklist::AnyDeviceAnyEngine}, {"gradientCheckFC2/0", TestBlacklist::AnyDeviceAnyEngine}, + {"gradientCheckBatchMatMul/0", TestBlacklist::AnyDeviceHWEngine}, }; TestBlacklist::prepareBlacklist(testBlacklistedSetups, backendTestBlacklist); diff --git a/lib/Backends/NNPI/tests/NNPIOperatorTest.cpp b/lib/Backends/NNPI/tests/NNPIOperatorTest.cpp index af6591bf55..a31c6c0e87 100644 --- a/lib/Backends/NNPI/tests/NNPIOperatorTest.cpp +++ b/lib/Backends/NNPI/tests/NNPIOperatorTest.cpp @@ -103,6 +103,8 @@ struct BlacklistInitializer { {"mul_int64/0", TestBlacklist::AnyDeviceHWEngine}, {"NonCubicKernelConv3DQuantized/0", TestBlacklist::AnyDeviceAnyEngine}, + {"NonCubicPaddingConv3D/0", TestBlacklist::AnyDeviceAnyEngine}, + {"GroupConv3D/0", TestBlacklist::AnyDeviceHWEngine}, {"NonSquarePaddingAveragePool/0", TestBlacklist::AnyDeviceAnyEngine}, {"NonSquarePaddingMaxPool/0", TestBlacklist::AnyDeviceAnyEngine}, @@ -135,6 +137,7 @@ struct BlacklistInitializer { TestBlacklist::AnyDeviceAnyEngine}, 
{"rowwiseQuantizedFCTest_Int8_BiasInt8/0", TestBlacklist::AnyDeviceAnyEngine}, + {"rowwiseQuantizedFCTestSymmetric/0", TestBlacklist::A0AnyEngine}, {"ScatterAddNDimensionalDuplicatingIndices/0", TestBlacklist::AnyDeviceAnyEngine}, {"ScatterAddNDimensionalSimple/0", @@ -166,11 +169,6 @@ struct BlacklistInitializer { TestBlacklist::AnyDeviceAnyEngine}, {"EmbeddingBag4BitRowwiseOffsets_Float16_AccumFloat/0", TestBlacklist::AnyDeviceAnyEngine}, - {"EmbeddingBag4BitRowwiseOffsets_Float16_HasEndOffset/0", - TestBlacklist::AnyDeviceAnyEngine}, - {"EmbeddingBag4BitRowwiseOffsets_Float16_HasEndOffset_AccumFloat/0", - TestBlacklist::AnyDeviceAnyEngine}, - {"SparseToDense_Float/0", TestBlacklist::AnyDeviceAnyEngine}, {"SparseToDense_Int64/0", TestBlacklist::AnyDeviceAnyEngine}, {"SparseToDenseMask1/0", TestBlacklist::AnyDeviceAnyEngine}, @@ -231,9 +229,6 @@ struct BlacklistInitializer { {"FusedRowwiseQuantizedSparseLengthsWeightedSum_ConvertedFloat16_" "NoFusedConvert_FP32Accum/0", TestBlacklist::AnyDeviceHWEngine}, - {"FusedRowwiseQuantizedSparseLengthsSum_Fused4Bit_Float16_" - "AccumFloat16/0", - TestBlacklist::AnyDeviceHWEngine}, {"to_back2/0", TestBlacklist::AnyDeviceHWEngine}, {"GroupDilatedConvolution/0", TestBlacklist::AnyDeviceHWEngine}, {"less_int32Cases/0", TestBlacklist::AnyDeviceHWEngine}, diff --git a/lib/Backends/NNPI/tests/TestBlacklist.h b/lib/Backends/NNPI/tests/TestBlacklist.h index 519b967e85..dd0934eff7 100644 --- a/lib/Backends/NNPI/tests/TestBlacklist.h +++ b/lib/Backends/NNPI/tests/TestBlacklist.h @@ -39,6 +39,7 @@ const uint32_t AnyDeviceHWEngine = AnyDeviceAnyEngine ^ NNPI_EXECUTION_ENGINE_SW; const uint32_t AnyDeviceSWEngine = AnyDeviceAnyEngine ^ NNPI_EXECUTION_ENGINE_HW; +const uint32_t A0AnyEngine = NNPI_DEVICE_VERSION_1 | NNPI_EXECUTION_ENGINE_ANY; static uint32_t getCurrentDeviceVersion() { static const std::map devices = {