From 7ccf14198b2151867e9e98ba93127b1f09134fda Mon Sep 17 00:00:00 2001 From: omromano Date: Tue, 7 Apr 2020 10:57:55 +0300 Subject: [PATCH] Update NNPI backend to 0.5.1.8 --- lib/Backends/NNPI/CMakeLists.txt | 18 +- lib/Backends/NNPI/Importer.cpp | 42 +-- lib/Backends/NNPI/InferenceContext.cpp | 319 ++++++++++++------ lib/Backends/NNPI/InferenceContext.h | 30 +- lib/Backends/NNPI/InferencePool.cpp | 160 +++++++-- lib/Backends/NNPI/InferencePool.h | 13 +- lib/Backends/NNPI/NNPI.cpp | 105 +++++- lib/Backends/NNPI/NNPI.h | 4 + lib/Backends/NNPI/NNPICompiledFunction.cpp | 50 ++- lib/Backends/NNPI/NNPICompiledFunction.h | 8 + lib/Backends/NNPI/NNPIDeviceManager.cpp | 44 ++- lib/Backends/NNPI/NNPIDeviceManager.h | 26 ++ lib/Backends/NNPI/NNPIOptions.h | 23 +- lib/Backends/NNPI/NNPIResource.cpp | 196 +++++++++-- lib/Backends/NNPI/NNPIResource.h | 58 +++- lib/Backends/NNPI/NNPIUtils.cpp | 106 ++++++ lib/Backends/NNPI/NNPIUtils.h | 24 ++ .../NNPI/tests/NNPIDeviceManagerTest.cpp | 2 - .../NNPI/tests/NNPIHostManagerTest.cpp | 3 +- lib/Backends/NNPI/tests/NNPIOperatorTest.cpp | 4 +- 20 files changed, 1001 insertions(+), 234 deletions(-) create mode 100644 lib/Backends/NNPI/NNPIUtils.cpp diff --git a/lib/Backends/NNPI/CMakeLists.txt b/lib/Backends/NNPI/CMakeLists.txt index b7fc9be440..b57cace4c3 100644 --- a/lib/Backends/NNPI/CMakeLists.txt +++ b/lib/Backends/NNPI/CMakeLists.txt @@ -56,10 +56,16 @@ if (DEFINED NNPI_MG_API_DIR) else() set(NNPI_MG_SEARCH_PATH "/opt/intel_nnpi/include/") endif() - -find_library(NNPI_TRANSFORMER_LIB nnpi_transformer ${NNPI_LIB_SEARCH_PATH}) -if(NOT NNPI_TRANSFORMER_LIB) - message(FATAL_ERROR "nnpi_transformer library not found at ${NNPI_LIB_SEARCH_PATH}") +if (NNPI_USE_STATIC_TRANSFORMER) + find_library(NNPI_TRANSFORMER_LIB nnpi_transformer_static ${NNPI_LIB_SEARCH_PATH}) + if(NOT NNPI_TRANSFORMER_LIB) + message(FATAL_ERROR "nnpi_transformer_static library not found at ${NNPI_LIB_SEARCH_PATH}") + endif() +else() + find_library(NNPI_TRANSFORMER_LIB nnpi_transformer ${NNPI_LIB_SEARCH_PATH}) + if(NOT NNPI_TRANSFORMER_LIB) + message(FATAL_ERROR "nnpi_transformer library not found at ${NNPI_LIB_SEARCH_PATH}") + endif() endif() find_library(NNPI_INFERENCE_LIB nnpi_inference ${NNPI_INF_LIB_SEARCH_PATH}) @@ -96,7 +102,7 @@ message(STATUS "[NNPI] NNPI_MG_API_DIR = ${NNPI_MG_API}") message(STATUS "[NNPI] NNPI_MG_LIB_DIR = ${NNPI_MG_LIB}") message(STATUS "[NNPI] GLOW_BINARY_DIR = ${GLOW_BINARY_DIR}") message(STATUS "[NNPI] NNPI_COLLECT_MEM_USAGE = ${NNPI_MEM_PROFILING}") - +message(STATUS "[NNPI] NNPI_USE_STATIC_TRANSFORMER = ${NNPI_USE_STATIC_TRANSFORMER}") add_subdirectory(ClassGen) @@ -124,6 +130,7 @@ add_library(NNPI NNPIResource.cpp NNPIUtils_AVX512.cpp NNPIAdapterContainer.cpp + NNPIUtils.cpp ) target_link_libraries(NNPI @@ -135,6 +142,7 @@ target_link_libraries(NNPI CodeGen IR Support + gomp ${NNPI_TRANSFORMER_LIB} ${NNPI_INFERENCE_LIB} ${NNPI_MG_LIB} diff --git a/lib/Backends/NNPI/Importer.cpp b/lib/Backends/NNPI/Importer.cpp index d2a9c8a747..b8e339ffa9 100644 --- a/lib/Backends/NNPI/Importer.cpp +++ b/lib/Backends/NNPI/Importer.cpp @@ -60,10 +60,6 @@ glow::NNPIImporter::NNPIImporter(const NNPICompilationOptions &compileOptions) compileOptions_(compileOptions) { ASSERT_LOG_NNPI_ERROR(nnpiNetworkCreate(&network_), "Failed to create NNPI network"); - // Setting the network name for testing framework purposes. - ASSERT_LOG_NNPI_ERROR( - nnpiNetworkSetName(network_, compileOptions_.compiledFile.get().c_str()), - "Failed to set NNPI network name"); } /// Destructor. 
@@ -1251,7 +1247,8 @@ class SLSNodeImporter : public INNPINodeImporter { nodeValueName(glowSLS->getResult()).c_str(), NULL, nodeValueName(glowSLS->getIndices()).c_str(), nodeValueName(glowSLS->getLengths()).c_str(), false, false, - glowSLS->getAvgLength(), lengthType); + glowSLS->getAvgLength(), lengthType, + /* force to IA */ false); } }; @@ -1284,7 +1281,8 @@ class SLWSNodeImporter : public INNPINodeImporter { nodeValueName(glowSLWS->getWeights()).c_str(), nodeValueName(glowSLWS->getIndices()).c_str(), nodeValueName(glowSLWS->getLengths()).c_str(), false, false, - glowSLWS->getAvgLength(), lengthType); + glowSLWS->getAvgLength(), lengthType, + /* force to IA */ false); } }; @@ -1323,7 +1321,8 @@ class EmbeddingBagNodeImporter : public INNPINodeImporter { nodeValueName(glowEmbeddingBag->getWeights()).c_str(), nodeValueName(glowEmbeddingBag->getIndices()).c_str(), nodeValueName(glowEmbeddingBag->getOffsets()).c_str(), false, true, - glowEmbeddingBag->getAvgLength(), lengthType); + glowEmbeddingBag->getAvgLength(), lengthType, + /* force to IA */ false); } }; @@ -1366,7 +1365,8 @@ class EmbeddingBagByteRowwiseOffsetsNodeImporter : public INNPINodeImporter { nodeValueName(glowEBBRO->getWeights()).c_str(), nodeValueName(glowEBBRO->getIndices()).c_str(), nodeValueName(glowEBBRO->getOffsets()).c_str(), usFp32Accum, true, - glowEBBRO->getAvgLength(), lengthType); + glowEBBRO->getAvgLength(), lengthType, + /* force to IA */ false); } }; @@ -1561,20 +1561,6 @@ class ChannelwiseQuantizedConvolutionNodeImporter : public INNPINodeImporter { LOG_AND_RETURN_IF_NOT(ERROR, glowChannelwiseQuantizedConv, "Bad node type", NNPI_INVALID_PARAM); - LOG_AND_RETURN_IF_NOT( - ERROR, - glowChannelwiseQuantizedConv->getInput().getType()->getOffset() == 0.f, - (std::string("Bad input offset value") + - std::to_string( - glowChannelwiseQuantizedConv->getInput().getType()->getOffset())), - NNPI_INVALID_PARAM); - LOG_AND_RETURN_IF_NOT( - ERROR, - glowChannelwiseQuantizedConv->getResult().getType()->getOffset() == 0.f, - (std::string("Bad result offset value") + - std::to_string( - glowChannelwiseQuantizedConv->getResult().getType()->getOffset())), - NNPI_INVALID_PARAM); LOG_AND_RETURN_IF_NOT( ERROR, !(glowChannelwiseQuantizedConv->getOffsets()) || @@ -1798,7 +1784,8 @@ class RQSLWSNodeImporter : public INNPINodeImporter { importer.addTensor(nodeValueName(glowSLWS->getData()), /* alternativeLayout */ false, nodeValueName(glowSLWS->getScales()), - nodeValueName(glowSLWS->getOffsets())), + nodeValueName(glowSLWS->getOffsets()), + /* force to IA */ false), "Failed to add tensor to NNPI"); importer.setUsedTensors( @@ -1828,7 +1815,8 @@ class RQSLWSNodeImporter : public INNPINodeImporter { nodeValueName(glowSLWS->getWeights()).c_str(), nodeValueName(glowSLWS->getIndices()).c_str(), nodeValueName(glowSLWS->getLengths()).c_str(), usFp32Accum, false, - glowSLWS->getAvgLength(), lengthType); + glowSLWS->getAvgLength(), lengthType, + /* force to IA */ false); } }; @@ -1864,7 +1852,8 @@ class FRQSLSNodeImporter : public INNPINodeImporter { nodeValueName(glowSLWS->getResult()).c_str(), NULL, nodeValueName(glowSLWS->getIndices()).c_str(), nodeValueName(glowSLWS->getLengths()).c_str(), usFp32Accum, false, - glowSLWS->getAvgLength(), lengthType); + glowSLWS->getAvgLength(), lengthType, + /* force to IA */ false); } }; @@ -1902,7 +1891,8 @@ class FRQSLWSNodeImporter : public INNPINodeImporter { nodeValueName(glowSLWS->getWeights()).c_str(), nodeValueName(glowSLWS->getIndices()).c_str(), nodeValueName(glowSLWS->getLengths()).c_str(), 
usFp32Accum, false, - glowSLWS->getAvgLength(), lengthType); + glowSLWS->getAvgLength(), lengthType, + /* force to IA */ false); } }; diff --git a/lib/Backends/NNPI/InferenceContext.cpp b/lib/Backends/NNPI/InferenceContext.cpp index e62e76588a..c74971d66f 100755 --- a/lib/Backends/NNPI/InferenceContext.cpp +++ b/lib/Backends/NNPI/InferenceContext.cpp @@ -43,17 +43,19 @@ InferenceContext::~InferenceContext() { } bool InferenceContext::init( + const ResourceDescVec &inputs, const ResourceDescVec &outputs, // For ICE-Ref path. NNPINetwork network, NNPICompilationConfig config, // For ICE-T path. - NNPIHostNetwork hostNetwork, NNPIDeviceNetwork deviceNetwork, - NNPIAdapter adapter, NNPIDeviceContext device, + NNPIDeviceNetwork deviceNetwork, NNPIAdapter adapter, + NNPIDeviceContext device, const std::unordered_set &partialInputs, const std::unordered_set &staticInputs, std::shared_ptr deviceTracing, StaticPlaceholderMap *staticPlaceholderMap, std::shared_ptr deviceOptions, - const std::string &functionName, unsigned deviceId) { + const std::string &functionName, unsigned deviceId, + PlaceholderUsageMap *phUsage) { deviceOptions_ = deviceOptions; deviceId_ = deviceId; nnpiNetwork_ = network; @@ -83,49 +85,52 @@ bool InferenceContext::init( staticInputs_.insert(staticInput); } + const size_t numInputs = inputs.size(); + const size_t numOutputs = outputs.size(); + if (!deviceOptions_->inferOnDevice) { - size_t numInputs, numOutputs; - NNPIObjectName name; - NNPITensorDesc desc; - LOG_NNPI_IF_ERROR_RETURN_FALSE( - nnpiNetworkGetInputNum(nnpiNetwork_, &numInputs), - "Failed to query NNPI network inputs"); - for (size_t i = 0; i < numInputs; i++) { - LOG_NNPI_IF_ERROR_RETURN_FALSE( - nnpiNetworkGetInputDesc(nnpiNetwork_, i, name, &desc), - "Failed to query NNPI network inputs"); - LOG_AND_RETURN_IF( - ERROR, !deviceOptions_->useIceT && staticPlaceholders.count(name), - "ICE-Ref doesn't support static inputs", false); - inputResources_.emplace_back(std::make_shared()); - NNPIResourceDesc rDesc; - LOG_AND_RETURN_IF( - ERROR, !NNPIResource::UpdateResourceDescFromTensorDesc(&rDesc, &desc), - "Failed to update ResourceDesc", false); - LOG_AND_RETURN_IF(ERROR, - !inputResources_.back()->init( - name, deviceOptions_, adapter, device_, &rDesc, - NNPIResource::ResourceUsage::InputResource), - "Failed to init input resource", false); + // No P2P/DRT for ICE-Ref (everything is on the host). + for (auto &in : inputs) { + const auto *name = in.first.c_str(); + const auto &desc = in.second; + const bool isStaticInput = staticPlaceholders.count(name) != 0; + if (isStaticInput) { + // Treat as a static input. + auto PH = staticPlaceholders.at(name); + if (staticPlaceholderMap->count(PH) && + staticPlaceholderMap->at(PH).lock()) { + // Static placeholder already exists. + inputResources_.push_back(staticPlaceholderMap->at(PH).lock()); + } else { + // Create a new static placeholder. 
+ inputResources_.emplace_back(std::make_shared<NNPIResource>());
+ LOG_AND_RETURN_IF(
+ ERROR,
+ !inputResources_.back()->init(
+ name, deviceOptions_, adapter, device_, &desc,
+ NNPIResource::ResourceUsage::StaticInputResource),
+ "Failed to init static input resource", false);
+ staticPlaceholderMap->insert({PH, inputResources_.back()});
+ }
+ } else {
+ inputResources_.emplace_back(std::make_shared<NNPIResource>());
+ LOG_AND_RETURN_IF(ERROR,
+ !inputResources_.back()->init(
+ name, deviceOptions_, adapter, device_, &desc,
+ NNPIResource::ResourceUsage::InputResource),
+ "Failed to init input resource", false);
+ }
 }
- LOG_NNPI_IF_ERROR_RETURN_FALSE(
- nnpiNetworkGetOutputNum(nnpiNetwork_, &numOutputs),
- "Failed to query NNPI network outputs");
- for (size_t i = 0; i < numOutputs; i++) {
- LOG_NNPI_IF_ERROR_RETURN_FALSE(
- nnpiNetworkGetOutputDesc(nnpiNetwork_, i, name, &desc),
- "Failed to query NNPI network outputs");
+ for (auto &out : outputs) {
+ const auto *name = out.first.c_str();
+ const auto &desc = out.second;
 LOG_AND_RETURN_IF(
 ERROR, !deviceOptions_->useIceT && staticPlaceholders.count(name),
 "ICE-Ref doesn't support static outputs", false);
 outputResources_.emplace_back(std::make_shared<NNPIResource>());
- NNPIResourceDesc rDesc;
- LOG_AND_RETURN_IF(
- ERROR, !NNPIResource::UpdateResourceDescFromTensorDesc(&rDesc, &desc),
- "Failed to update ResourceDesc", false);
 LOG_AND_RETURN_IF(ERROR,
 !outputResources_.back()->init(
- name, deviceOptions_, adapter, device_, &rDesc,
+ name, deviceOptions_, adapter, device_, &desc,
 NNPIResource::ResourceUsage::OutputResource),
 "Failed to init input resource", false);
 }
@@ -133,24 +138,16 @@ bool InferenceContext::init(
 return true; // Nothing else to be done here for ice-ref.
 }

- // Query input/output resources.
- uint32_t numInputs, numOutputs;
- LOG_NNPI_INF_IF_ERROR_RETURN_FALSE(
- nnpiHostNetworkGetInputNum(hostNetwork, &numInputs),
- "Failed to query NNPI network inputs");
- LOG_NNPI_INF_IF_ERROR_RETURN_FALSE(
- nnpiHostNetworkGetOutputNum(hostNetwork, &numOutputs),
- "Failed to query NNPI network outputs");
-
 // Create resources for inputs.
- for (uint32_t i = 0; i < numInputs; i++) {
- NNPIObjectName name;
- NNPIResourceDesc desc;
- LOG_NNPI_INF_IF_ERROR_RETURN_FALSE(
- nnpiHostNetworkGetInputDesc(hostNetwork, i, name, &desc),
- "Failed to query NNPI host network input");
- memset(&desc.hostAttrib, 0, sizeof(desc.hostAttrib));
- memset(&desc.deviceAttrib, 0, sizeof(desc.deviceAttrib));
+ for (auto &in : inputs) {
+ const auto *name = in.first.c_str();
+ const auto &desc = in.second;
+
+ LOG_AND_RETURN_IF(ERROR,
+ phUsage &&
+ ((phUsage->count(name) == 0) ||
+ (phUsage->at(name).devices.count(device_) == 0)),
+ "Invalid placeholder usage for input resource", false);

 const auto isStaticInput = staticPlaceholders.count(name);
 if (isStaticInput) {
@@ -171,36 +168,74 @@ bool InferenceContext::init(
 staticPlaceholderMap->insert({PH, inputResources_.back()});
 }
 } else {
- // Regular input resource - create it here.
+ // Dynamic input resource - create it here.
+ NNPIResource::ResourceUsage usage = NNPIResource::ResourceUsage::None;
+ if (!phUsage || (phUsage->at(name).numWriters == 0)) {
+ usage = NNPIResource::ResourceUsage::InputResource; // Net input
+ } else { // Some other context is writing to this placeholder --> P2P/DRT
+ switch (phUsage->at(name).devices.size()) {
+ case 1: // DRT
+ usage = NNPIResource::ResourceUsage::DRTInput;
+ break;
+ case 2: // P2P
+ usage = NNPIResource::ResourceUsage::P2PInput;
+ break;
+ default:
+ LOG_AND_RETURN_IF(ERROR, true,
+ "Invalid number of devices accessing a resource",
+ false);
+ }
+ }
 inputResources_.emplace_back(std::make_shared<NNPIResource>());
 LOG_AND_RETURN_IF(ERROR,
- !inputResources_.back()->init(
- name, deviceOptions_, adapter, device_, &desc,
- NNPIResource::ResourceUsage::InputResource),
+ !inputResources_.back()->init(name, deviceOptions_,
+ adapter, device_, &desc,
+ usage, phUsage),
 "Failed to init input resource", false);
- inputResources_.back()->SetCmdListIdx(
- static_cast<uint32_t>(inputResources_.size()));
+ }
+
+ // Update placeholder usage.
+ if (phUsage) {
+ phUsage->at(name).readers.push_back(inputResources_.back());
+ }
 }

 // Create resources for outputs.
- for (uint32_t i = 0; i < numOutputs; i++) {
- {
- NNPIObjectName name;
- NNPIResourceDesc desc;
- LOG_NNPI_INF_IF_ERROR_RETURN_FALSE(
- nnpiHostNetworkGetOutputDesc(hostNetwork, i, name, &desc),
- "Failed to query NNPI host network output");
- memset(&desc.hostAttrib, 0, sizeof(desc.hostAttrib));
- memset(&desc.deviceAttrib, 0, sizeof(desc.deviceAttrib));
- outputResources_.emplace_back(std::make_shared<NNPIResource>());
- LOG_AND_RETURN_IF(ERROR,
- !outputResources_.back()->init(
- name, deviceOptions_, adapter, device_, &desc,
- NNPIResource::ResourceUsage::OutputResource),
- "Failed to init output resource", false);
- outputResources_.back()->SetCmdListIdx(
- static_cast<uint32_t>(outputResources_.size()));
+ for (auto &out : outputs) {
+ const auto *name = out.first.c_str();
+ const auto &desc = out.second;
+ LOG_AND_RETURN_IF(ERROR,
+ phUsage &&
+ ((phUsage->count(name) == 0) ||
+ (phUsage->at(name).devices.count(device_) == 0)),
+ "Invalid placeholder usage for output resource", false);
+ NNPIResource::ResourceUsage usage = NNPIResource::ResourceUsage::None;
+ if (!phUsage || (phUsage->at(name).numReaders == 0)) {
+ usage = NNPIResource::ResourceUsage::OutputResource; // Net output
+ } else { // Some other context is reading from this placeholder --> P2P/DRT
+ switch (phUsage->at(name).devices.size()) {
+ case 1: // DRT
+ usage = NNPIResource::ResourceUsage::DRTOutput;
+ break;
+ case 2: // P2P
+ usage = NNPIResource::ResourceUsage::P2POutput;
+ break;
+ default:
+ LOG_AND_RETURN_IF(ERROR, true,
+ "Invalid number of devices accessing a resource",
+ false);
+ }
+ }
+ outputResources_.emplace_back(std::make_shared<NNPIResource>());
+ LOG_AND_RETURN_IF(ERROR,
+ !outputResources_.back()->init(name, deviceOptions_,
+ adapter, device_, &desc,
+ usage, phUsage),
+ "Failed to init output resource", false);
+
+ // Update placeholder usage.
+ if (phUsage) { + phUsage->at(name).writers.push_back(outputResources_.back()); } } DBG_MEM_USAGE("Created input and output host resources"); @@ -209,10 +244,10 @@ bool InferenceContext::init( NNPIDeviceResource inputHandles[numInputs]; NNPIDeviceResource outputHandles[numOutputs]; for (uint32_t i = 0; i < numInputs; i++) { - inputHandles[i] = inputResources_.at(i)->GetDeviceResource(); + inputHandles[i] = inputResources_.at(i)->getDeviceResource(); } for (uint32_t i = 0; i < numOutputs; i++) { - outputHandles[i] = outputResources_.at(i)->GetDeviceResource(); + outputHandles[i] = outputResources_.at(i)->getDeviceResource(); } LOG_NNPI_INF_IF_ERROR_RETURN_FALSE( nnpiInferCommandCreate(deviceNetwork, inputHandles, numInputs, @@ -220,13 +255,13 @@ bool InferenceContext::init( "Failed to create NNPI inference command"); if (deviceOptions_->enabledCommandLists > 0) { - // collect copy commands for the list (some resources may not need copying). + // Collect copy commands for the list (some resources may not need copying). std::vector commands; std::vector inputCopyCmds, outputCopyCmds; for (auto &res : inputResources_) { - auto copyCmd = res->GetCopyCommand(); + auto copyCmd = res->getCopyCommand(); if (copyCmd) { - res->SetCmdListIdx(static_cast(commands.size())); + res->setCmdListIdx(static_cast(commands.size())); NNPICommandHandle cmd; cmd.type = NNPI_COMMAND_TYPE_COPY; cmd.copyCommand = copyCmd; @@ -240,9 +275,9 @@ bool InferenceContext::init( commands.push_back(cmd); } for (auto &res : outputResources_) { - auto copyCmd = res->GetCopyCommand(); + auto copyCmd = res->getCopyCommand(); if (copyCmd) { - res->SetCmdListIdx(static_cast(commands.size())); + res->setCmdListIdx(static_cast(commands.size())); NNPICommandHandle cmd; cmd.type = NNPI_COMMAND_TYPE_COPY; cmd.copyCommand = copyCmd; @@ -263,6 +298,10 @@ bool InferenceContext::init( cmdListErrors_.resize(commands.size()); } + if (deviceOptions_->dumpRuntime) { + dumpRuntime(); + } + return true; } @@ -287,10 +326,10 @@ void InferenceContext::execute(RunIdentifierTy runId, // outputResources_. 
if (netInputPlaceholders_.empty()) { for (const auto &in : inputResources_) { - if (in->GetUsage() == NNPIResource::ResourceUsage::StaticInputResource) { + if (in->getUsage() == NNPIResource::ResourceUsage::StaticInputResource) { continue; } - auto *placeholder = bindings.getPlaceholderByName(in->GetName()); + auto *placeholder = bindings.getPlaceholderByName(in->getName()); if (!placeholder) { netInputPlaceholders_.clear(); LOG_AND_FAIL_EXECUTE_CALLBACK_IF_NOT(ERROR, placeholder, @@ -303,7 +342,7 @@ void InferenceContext::execute(RunIdentifierTy runId, } if (netOutputPlaceholders_.empty()) { for (const auto &out : outputResources_) { - auto *placeholder = bindings.getPlaceholderByName(out->GetName()); + auto *placeholder = bindings.getPlaceholderByName(out->getName()); if (!placeholder) { netOutputPlaceholders_.clear(); LOG_AND_FAIL_EXECUTE_CALLBACK_IF_NOT(ERROR, placeholder, @@ -327,17 +366,17 @@ void InferenceContext::execute(RunIdentifierTy runId, std::vector rawInputs, rawOutputs; unsigned idx = 0; for (const auto &in : inputResources_) { - if (in->GetUsage() != NNPIResource::ResourceUsage::StaticInputResource) { + if (in->getUsage() != NNPIResource::ResourceUsage::StaticInputResource) { auto *t = bindings.get(netInputPlaceholders_[idx++]); LOG_AND_FAIL_EXECUTE_CALLBACK_IF_NOT( ERROR, t, "Can't find tensor for input", runId, ctx, resultCB); LOG_AND_FAIL_EXECUTE_CALLBACK_IF_NOT( ERROR, - in->PreInference(t, partialTensorInputs.count(t)) == + in->preInference(t, partialTensorInputs.count(t)) == NNPI_INF_NO_ERROR, "Failed pre-inference for input", runId, ctx, resultCB); } - rawInputs.push_back(in->GetHostPtr()); + rawInputs.push_back(in->getHostPtr()); } // Inference. @@ -355,7 +394,7 @@ void InferenceContext::execute(RunIdentifierTy runId, // Queue output copies for (auto &res : outputResources_) { - auto cmd = res->GetCopyCommand(); + auto cmd = res->getCopyCommand(); if (cmd) { // todo: assert no partial output LOG_AND_CALLBACK_EXECUTE_NNPI_INF_IF_ERROR( @@ -367,9 +406,9 @@ void InferenceContext::execute(RunIdentifierTy runId, // Prepare updates for partial copies. 
uint32_t usedConfigs = 0; for (auto &res : inputResources_) { - const auto partialSize = res->GetPartialSize(); + const auto partialSize = res->getPartialSize(); if (partialSize > 0) { - cmdConfigs_[usedConfigs].index = res->GetCmdListIdx(); + cmdConfigs_[usedConfigs].index = res->getCmdListIdx(); cmdConfigs_[usedConfigs].type = NNPI_COMMAND_TYPE_COPY; cmdConfigs_[usedConfigs].copyConfig.size = partialSize; usedConfigs++; @@ -430,7 +469,7 @@ void InferenceContext::execute(RunIdentifierTy runId, for (auto &out : outputResources_) { // Collect output ptrs for ICE-Ref - rawOutputs.push_back(out->GetHostPtr()); + rawOutputs.push_back(out->getHostPtr()); } TRACE_EVENT_END(ctx->getTraceContext(), TraceLevel::COPY, @@ -458,8 +497,8 @@ void InferenceContext::execute(RunIdentifierTy runId, LOG_AND_FAIL_EXECUTE_CALLBACK_IF_NOT( ERROR, t, "Can't find tensor for output", runId, ctx, resultCB); LOG_AND_FAIL_EXECUTE_CALLBACK_IF_NOT( - ERROR, outputResources_[i]->PostInference(t) == NNPI_INF_NO_ERROR, - "Failed in output PostInference", runId, ctx, resultCB); + ERROR, outputResources_[i]->postInference(t) == NNPI_INF_NO_ERROR, + "Failed in output postInference", runId, ctx, resultCB); } TRACE_EVENT_END(ctx->getTraceContext(), TraceLevel::COPY, @@ -473,5 +512,91 @@ void InferenceContext::execute(RunIdentifierTy runId, resultCB(runId, Error::success(), std::move(ctx)); } +void InferenceContext::dumpRuntime() const { + for (auto &in : inputResources_) { + std::string resourceType; + unsigned color; + switch (in->getUsage()) { + case NNPIResource::ResourceUsage::InputResource: + resourceType = "Input"; + color = 2; + + // Add host resource node + DotWriter::addNode(std::to_string(in->getHostResource()), + std::string("Host Resource\\lHandle: ") + + DotWriter::getHexStr(in->getHostResource()), + 1, "Host"); + // Add copy command h2c + DotWriter::addEdge(std::to_string(in->getHostResource()), + std::to_string(in->getDeviceResource())); + break; + case NNPIResource::ResourceUsage::StaticInputResource: + resourceType = "Static Input"; + color = 3; + break; + case NNPIResource::ResourceUsage::P2PInput: + resourceType = "P2P Input"; + color = 4; + break; + case NNPIResource::ResourceUsage::DRTInput: + resourceType = "DRT In/Out"; + color = 5; + break; + default:; // do nothing + } + + // add device resource node + DotWriter::addNode(std::to_string(in->getDeviceResource()), + resourceType + std::string("\\lHandle: ") + + DotWriter::getHexStr(in->getDeviceResource()), + color, std::to_string(in->getDevice())); + + // connect to function + DotWriter::addEdge(std::to_string(in->getDeviceResource()), + functionName_ + ":" + in->getName()); + } + for (auto &out : outputResources_) { + std::string resourceType; + unsigned color; + switch (out->getUsage()) { + case NNPIResource::ResourceUsage::OutputResource: + resourceType = "Output"; + color = 2; + + // Add host resource node + DotWriter::addNode(std::to_string(out->getHostResource()), + std::string("Host Resource\\lHandle: ") + + DotWriter::getHexStr(out->getHostResource()), + 1, "Host"); + // Add copy command c2h + DotWriter::addEdge(std::to_string(out->getDeviceResource()), + std::to_string(out->getHostResource())); + break; + case NNPIResource::ResourceUsage::P2POutput: + resourceType = "P2P Output"; + color = 4; + // Add copy command c2c + DotWriter::addEdge(std::to_string(out->getDeviceResource()), + std::to_string(out->getP2PDeviceResource())); + break; + case NNPIResource::ResourceUsage::DRTOutput: + resourceType = "DRT In/Out"; + color = 5; + break; + default:; 
// do nothing + } + + // add device resource node + DotWriter::addNode(std::to_string(out->getDeviceResource()), + resourceType + std::string("\\lHandle: ") + + DotWriter::getHexStr(out->getDeviceResource()), + color, std::to_string(out->getDevice())); + + // connect to function + DotWriter::addEdge(functionName_ + ":" + out->getName(), + std::to_string(out->getDeviceResource())); + } +} + } // namespace runtime } // namespace glow diff --git a/lib/Backends/NNPI/InferenceContext.h b/lib/Backends/NNPI/InferenceContext.h index 28e83f5a75..eae9dee660 100644 --- a/lib/Backends/NNPI/InferenceContext.h +++ b/lib/Backends/NNPI/InferenceContext.h @@ -23,6 +23,7 @@ #include "nnpi_inference.h" #include "nnpi_transformer.h" #include +#include #include namespace glow { @@ -32,6 +33,7 @@ class NNPIDeviceManager; using StaticPlaceholderMap = std::unordered_map>; +using ResourceDescVec = std::vector>; class InferenceContext { private: NNPINetwork nnpiNetwork_; // For ice-ref path only. @@ -77,23 +79,27 @@ class InferenceContext { std::string traceInferenceContextName_; std::string tracePostProcessContextName_; + /// Dump the runtime resource graph. + void dumpRuntime() const; + public: InferenceContext(); ~InferenceContext(); void execute(RunIdentifierTy runId, std::unique_ptr ctx, runtime::ResultCBTy resultCB); - bool init( - // For ICE-Ref path. - NNPINetwork network, NNPICompilationConfig config, - // For ICE-T path. - NNPIHostNetwork hostNetwork, NNPIDeviceNetwork deviceNetwork, - NNPIAdapter adapter, NNPIDeviceContext device, - const std::unordered_set &partialInputs, - const std::unordered_set &staticInputs, - std::shared_ptr deviceTracing, - StaticPlaceholderMap *staticPlaceholderMap, - std::shared_ptr deviceOptions, - const std::string &functionName, unsigned deviceId); + bool init(const ResourceDescVec &inputs, const ResourceDescVec &outputs, + // For ICE-Ref path. + NNPINetwork network, NNPICompilationConfig config, + // For ICE-T path. 
+ NNPIDeviceNetwork deviceNetwork, NNPIAdapter adapter, + NNPIDeviceContext device, + const std::unordered_set &partialInputs, + const std::unordered_set &staticInputs, + std::shared_ptr deviceTracing, + StaticPlaceholderMap *staticPlaceholderMap, + std::shared_ptr deviceOptions, + const std::string &functionName, unsigned deviceId, + PlaceholderUsageMap *phUsage = nullptr); }; } // namespace runtime diff --git a/lib/Backends/NNPI/InferencePool.cpp b/lib/Backends/NNPI/InferencePool.cpp index 463c0479b2..bb3463db1d 100644 --- a/lib/Backends/NNPI/InferencePool.cpp +++ b/lib/Backends/NNPI/InferencePool.cpp @@ -18,6 +18,7 @@ #include "Importer.h" #include "NNPI.h" #include "NNPIDeviceManager.h" +#include "NNPIUtils.h" #include "llvm/Support/raw_ostream.h" #include #include @@ -27,16 +28,11 @@ namespace glow { namespace runtime { InferencePoolEnv::InferencePoolEnv() - : numWorkers_(0), hostNetwork_(NNPI_INVALID_NNPIHANDLE), - deviceOptions_(nullptr) {} + : numWorkers_(0), deviceOptions_(nullptr), nnpiCompiledFunction_(nullptr), + staticPlaceholderMap_(nullptr) {} InferencePoolEnv::~InferencePoolEnv() { if (deviceOptions_ && deviceOptions_->inferOnDevice) { - if (hostNetwork_ != NNPI_INVALID_NNPIHANDLE) { - LOG_NNPI_INF_IF_ERROR(nnpiHostNetworkDestroy(hostNetwork_), - "Failed to destroy NNPI host network"); - hostNetwork_ = NNPI_INVALID_NNPIHANDLE; - } if (deviceNetwork_ != NNPI_INVALID_NNPIHANDLE) { LOG_NNPI_INF_IF_ERROR(nnpiDeviceNetworkDestroy(deviceNetwork_), "Failed to destroy NNPI device network"); @@ -55,6 +51,9 @@ Error InferencePoolEnv::init(unsigned numWorkers, NNPIAdapter adapter, unsigned deviceId) { deviceOptions_ = deviceOptions; deviceId_ = deviceId; + functionName_ = functionName; + device_ = device; + adapter_ = adapter; if (workersPool_) { return MAKE_ERR("InferencePool already initialized!"); } @@ -62,6 +61,7 @@ Error InferencePoolEnv::init(unsigned numWorkers, NNPIAdapter adapter, workersPool_ = glow::make_unique( numWorkers_, std::make_shared("NNPI-worker")); deviceTracing_ = deviceTracing; + staticPlaceholderMap_ = staticPlaceholderMap; inferenceContexts_.resize(numWorkers_); freeContexts_.resize(numWorkers_); @@ -70,14 +70,15 @@ Error InferencePoolEnv::init(unsigned numWorkers, NNPIAdapter adapter, } // Create host network. - auto *nnpiFunction = static_cast(compiledFunction); + nnpiCompiledFunction_ = static_cast(compiledFunction); + NNPIHostNetwork hostNetwork(NNPI_INVALID_NNPIHANDLE); if (deviceOptions_->inferOnDevice) { // Create NNPI host network (load compiled binary). - auto filename = nnpiFunction->getCompilationFilename(); + auto filename = nnpiCompiledFunction_->getCompilationFilename(); if (filename.empty()) // Create network from memory. 
{ NNPIHostStream inputStream; - inputStream.userData = &(nnpiFunction->lockCompiledStream()); + inputStream.userData = &(nnpiCompiledFunction_->lockCompiledStream()); inputStream.readCallback = [](void *ptr, uint64_t size, uint64_t count, void *userData) -> uint64_t { BlockStream *ss = reinterpret_cast(userData); @@ -88,51 +89,113 @@ Error InferencePoolEnv::init(unsigned numWorkers, NNPIAdapter adapter, inputStream.seekCallback = NULL; DBG_MEM_USAGE("call nnpiHostNetworkCreateFromStream"); LOG_NNPI_INF_IF_ERROR_RETURN_LLVMERROR( - nnpiHostNetworkCreateFromStream(adapter, &inputStream, &hostNetwork_), + nnpiHostNetworkCreateFromStream(adapter, &inputStream, &hostNetwork), "Failed to create NNPI host network"); DBG_MEM_USAGE("done nnpiHostNetworkCreateFromStream"); - nnpiFunction->unlockCompiledStream(); + nnpiCompiledFunction_->unlockCompiledStream(); } else // Create network from file. { filename += ".zip"; LOG_NNPI_INF_IF_ERROR_RETURN_LLVMERROR( nnpiHostNetworkCreateFromFile(adapter, filename.c_str(), - &hostNetwork_), + &hostNetwork), "Failed to create NNPI host network"); } DBG_MEM_USAGE("call nnpiDeviceNetworkCreate"); // Create NNPI device network (deploy to device). LOG_NNPI_INF_IF_ERROR_RETURN_LLVMERROR( - nnpiDeviceNetworkCreate(device, hostNetwork_, nullptr, &deviceNetwork_), + nnpiDeviceNetworkCreate(device, hostNetwork, nullptr, &deviceNetwork_), "Failed to create NNPI device network"); DBG_MEM_USAGE("done nnpiDeviceNetworkCreate"); - if (nnpiFunction->getCompilationOptions().reserveResources) { + if (nnpiCompiledFunction_->getCompilationOptions().reserveResources) { LOG_NNPI_INF_IF_ERROR_RETURN_LLVMERROR( nnpiDeviceNetworkReserveExecResources(deviceNetwork_, UINT32_MAX), "Failed to reserve resources for device network"); } + + // Collect input/output descriptors from host network + uint32_t numInputs, numOutputs; + LOG_NNPI_INF_IF_ERROR_RETURN_LLVMERROR( + nnpiHostNetworkGetInputNum(hostNetwork, &numInputs), + "Failed to query NNPI network inputs"); + LOG_NNPI_INF_IF_ERROR_RETURN_LLVMERROR( + nnpiHostNetworkGetOutputNum(hostNetwork, &numOutputs), + "Failed to query NNPI network outputs"); + NNPIObjectName name; + NNPIResourceDesc desc; + for (uint32_t i = 0; i < numInputs; i++) { + NNPIObjectName name; + NNPIResourceDesc desc; + LOG_NNPI_INF_IF_ERROR_RETURN_LLVMERROR( + nnpiHostNetworkGetInputDesc(hostNetwork, i, name, &desc), + "Failed to query NNPI host network input"); + memset(&desc.hostAttrib, 0, sizeof(desc.hostAttrib)); + memset(&desc.deviceAttrib, 0, sizeof(desc.deviceAttrib)); + inputDesc_.push_back({name, desc}); + } + for (uint32_t i = 0; i < numOutputs; i++) { + + LOG_NNPI_INF_IF_ERROR_RETURN_LLVMERROR( + nnpiHostNetworkGetOutputDesc(hostNetwork, i, name, &desc), + "Failed to query NNPI host network output"); + memset(&desc.hostAttrib, 0, sizeof(desc.hostAttrib)); + memset(&desc.deviceAttrib, 0, sizeof(desc.deviceAttrib)); + outputDesc_.push_back({name, desc}); + } + } else { + // Collect input/output descriptors from nnpi network (for ICE-Ref) + size_t numInputs, numOutputs; + NNPIObjectName name; + NNPITensorDesc desc; + auto nnpiNetwork = nnpiCompiledFunction_->getCompiledNetworkHandle(); + LOG_NNPI_IF_ERROR_RETURN_LLVMERROR( + nnpiNetworkGetInputNum(nnpiNetwork, &numInputs), + "Failed to query NNPI network inputs"); + LOG_NNPI_IF_ERROR_RETURN_LLVMERROR( + nnpiNetworkGetOutputNum(nnpiNetwork, &numOutputs), + "Failed to query NNPI network outputs"); + + for (size_t i = 0; i < numInputs; i++) { + LOG_NNPI_IF_ERROR_RETURN_LLVMERROR( + 
nnpiNetworkGetInputDesc(nnpiNetwork, i, name, &desc), + "Failed to query NNPI network input"); + NNPIResourceDesc rDesc; + LOG_IF_NOT_RETURN_LLVMERROR( + NNPIResource::updateResourceDescFromTensorDesc(&rDesc, &desc), + "Failed to update ResourceDesc"); + inputDesc_.push_back({name, rDesc}); + } + for (size_t i = 0; i < numOutputs; i++) { + LOG_NNPI_IF_ERROR_RETURN_LLVMERROR( + nnpiNetworkGetOutputDesc(nnpiNetwork, i, name, &desc), + "Failed to query NNPI network output"); + NNPIResourceDesc rDesc; + LOG_IF_NOT_RETURN_LLVMERROR( + NNPIResource::updateResourceDescFromTensorDesc(&rDesc, &desc), + "Failed to update ResourceDesc"); + outputDesc_.push_back({name, rDesc}); + } } for (auto &infCtx : inferenceContexts_) { auto success = infCtx.init( - nnpiFunction->getCompiledNetworkHandle(), - nnpiFunction->getCompilationConfig(), hostNetwork_, deviceNetwork_, - adapter, device, nnpiFunction->getPartialInputs(), - nnpiFunction->getStaticInputs(), deviceTracing_, staticPlaceholderMap, - deviceOptions, functionName, deviceId_); + inputDesc_, outputDesc_, + nnpiCompiledFunction_->getCompiledNetworkHandle(), + nnpiCompiledFunction_->getCompilationConfig(), deviceNetwork_, adapter, + device, nnpiCompiledFunction_->getPartialInputs(), + nnpiCompiledFunction_->getStaticInputs(), deviceTracing_, + staticPlaceholderMap_, deviceOptions_, functionName_, deviceId_); if (!success) { return MAKE_ERR("Failed to initialize inferece context"); } freeContexts_.push_back(&infCtx); } - if (deviceOptions_->inferOnDevice && - hostNetwork_ != NNPI_INVALID_NNPIHANDLE) { + if (deviceOptions_->inferOnDevice && hostNetwork != NNPI_INVALID_NNPIHANDLE) { DBG_MEM_USAGE("call nnpiHostNetworkDestroy"); - LOG_NNPI_INF_IF_ERROR(nnpiHostNetworkDestroy(hostNetwork_), + LOG_NNPI_INF_IF_ERROR(nnpiHostNetworkDestroy(hostNetwork), "Failed to destroy NNPI host network"); - hostNetwork_ = NNPI_INVALID_NNPIHANDLE; DBG_MEM_USAGE("done nnpiHostNetworkDestroy"); } return Error::success(); @@ -150,6 +213,17 @@ void InferencePoolEnv::execute(RunIdentifierTy runId, runtime::ResultCBTy resultCB) { workersPool_->add([this, runId, ctx = std::move(ctx), resultCB = std::move(resultCB)]() mutable { + NNPIDeviceBindings *bindings = + dynamic_cast(ctx->getDeviceBindings()); + if (bindings) { + // TODO: verify with garret we don't need to lock here - i.e. host manager + // can't invoke the same context twice in parallel. + auto infCtx = bindings->getInferenceContext(); + CHECK(infCtx); + infCtx->execute(runId, std::move(ctx), resultCB); + return; + } + InferenceContext *infCtx = nullptr; { const std::lock_guard lock(freeContextsLock_); @@ -165,5 +239,43 @@ void InferencePoolEnv::execute(RunIdentifierTy runId, }); } +InferenceContext * +InferencePoolEnv::createDetachedInferenceContext(PlaceholderUsageMap &phUsage) { + if (deviceOptions_->dumpRuntime) { + // Add function node to graph dump. 
+ std::ostringstream label; + label << "{{"; + for (auto input : nnpiCompiledFunction_->getInputNames()) { + label << "<" << input << ">" << input << "|"; + } + label.seekp(-1, label.cur); // remove the trailing '|' + label << "}|{" + << "Function\\lname : " << functionName_ << "}|{"; + for (auto output : nnpiCompiledFunction_->getOutputNames()) { + label << "<" << output << ">" << output << "|"; + } + label.seekp(-1, label.cur); // remove the trailing '|' + label << "}}"; + DotWriter::addNode(functionName_, label.str(), 0, std::to_string(device_)); + } + + InferenceContext *infCtx = new InferenceContext(); + + if (!infCtx->init(inputDesc_, outputDesc_, + nnpiCompiledFunction_->getCompiledNetworkHandle(), + nnpiCompiledFunction_->getCompilationConfig(), + deviceNetwork_, adapter_, device_, + nnpiCompiledFunction_->getPartialInputs(), + nnpiCompiledFunction_->getStaticInputs(), deviceTracing_, + staticPlaceholderMap_, deviceOptions_, functionName_, + deviceId_, &phUsage)) { + delete infCtx; + ASSERT_WITH_MSG(infCtx, "Failed to initialize detached inference context"); + return nullptr; + } + + return infCtx; +} + } // namespace runtime } // namespace glow diff --git a/lib/Backends/NNPI/InferencePool.h b/lib/Backends/NNPI/InferencePool.h index 574523e3ec..c9a49671e7 100644 --- a/lib/Backends/NNPI/InferencePool.h +++ b/lib/Backends/NNPI/InferencePool.h @@ -20,6 +20,7 @@ #include "NNPICompiledFunction.h" #include "NNPITracing.h" #include "folly/executors/CPUThreadPoolExecutor.h" +#include "glow/ExecutionContext/ExecutionContext.h" #include "glow/Runtime/RuntimeTypes.h" #include "nnpi_inference.h" #include "nnpi_transformer.h" @@ -30,18 +31,24 @@ namespace glow { namespace runtime { - +class NNPIDeviceBindings; class InferencePoolEnv { unsigned numWorkers_; std::unique_ptr workersPool_; std::vector inferenceContexts_; std::vector freeContexts_; std::mutex freeContextsLock_; - NNPIHostNetwork hostNetwork_; NNPIDeviceNetwork deviceNetwork_; std::shared_ptr deviceTracing_; std::shared_ptr deviceOptions_; unsigned deviceId_; + ResourceDescVec inputDesc_; + ResourceDescVec outputDesc_; + NNPIAdapter adapter_; + NNPIDeviceContext device_; + NNPICompiledFunction *nnpiCompiledFunction_; + StaticPlaceholderMap *staticPlaceholderMap_; + std::string functionName_; public: InferencePoolEnv(); @@ -52,6 +59,8 @@ class InferencePoolEnv { StaticPlaceholderMap *staticPlaceholderMap, std::shared_ptr deviceOptions, const std::string &functionName, unsigned deviceId); + InferenceContext * + createDetachedInferenceContext(PlaceholderUsageMap &phUsage); void stop(bool block); void execute(RunIdentifierTy runId, std::unique_ptr ctx, runtime::ResultCBTy resultCB); diff --git a/lib/Backends/NNPI/NNPI.cpp b/lib/Backends/NNPI/NNPI.cpp index 8a56c38032..5a33ad1ec3 100644 --- a/lib/Backends/NNPI/NNPI.cpp +++ b/lib/Backends/NNPI/NNPI.cpp @@ -15,8 +15,10 @@ #include "NNPI.h" #include "DebugMacros.h" +#include "InferenceContext.h" #include "NNPICompiledFunction.h" #include "NNPIDeviceManager.h" +#include "NNPIUtils.h" #include "glow/Graph/Nodes.h" #include "glow/Graph/Utils.h" #include "glow/Optimizer/GraphOptimizer/GraphOptimizer.h" @@ -26,6 +28,8 @@ #include "llvm/Support/CommandLine.h" #include +#include +#include using namespace glow; @@ -84,22 +88,22 @@ NNPIBackendOptions NNPIBackend::backendOptions_; NNPIAdapterContainer NNPIBackend::adapter_; unsigned NNPIBackend::numDevices() { - if (!backendOptions_.inferOnDevice) { - // Will return 1 device (for ICE-Ref) - return 1; - } NNPIAdapter adapter = NNPI_INVALID_NNPIHANDLE; 
 NNPIAdapterInfo adapterInfo;
 memset(&adapterInfo, 0, sizeof(adapterInfo));
+ // Assuming ICE-Ref will be used if not able to create an adapter or get the
+ // adapter info (returning 1 device).
 LOG_AND_RETURN_IF_NOT(
 ERROR, nnpiAdapterCreate(nullptr, &adapter) == NNPI_INF_NO_ERROR,
- "Failed to create NNPI Adapter.", 0);
+ "Failed to create NNPI Adapter.", 1);
 LOG_AND_RETURN_IF_NOT(
 ERROR, nnpiAdapterGetInfo(adapter, &adapterInfo) == NNPI_INF_NO_ERROR,
- "Failed get device info.", 0);
+ "Failed to get device info.", 1);
+ unsigned count = adapterInfo.numDevices;
 LOG_NNPI_INF_IF_ERROR(nnpiAdapterDestroy(adapter),
 "Failed to destroy NNPI Adapter");
- return adapterInfo.numDevices;
+ // Will return 1 device (for ICE-Ref) if 0 devices are found.
+ return std::max(count, (unsigned)1);
 }

 /// \returns whether \p type is 2 dimensional and unary. Usually the data input
@@ -315,8 +319,10 @@ bool NNPIBackend::isOpSupported(const NodeInfo &NI) const {
 ElemKind::Int8QTy) &&
 (NI.getInElemTy(ChannelwiseQuantizedConvolutionNode::FilterIdx) ==
 ElemKind::Int8QTy) &&
- (NI.getInElemTy(ChannelwiseQuantizedConvolutionNode::BiasIdx) ==
- ElemKind::Int32QTy) &&
+ ((NI.getInElemTy(ChannelwiseQuantizedConvolutionNode::BiasIdx) ==
+ ElemKind::Int32QTy) ||
+ (NI.getInElemTy(ChannelwiseQuantizedConvolutionNode::BiasIdx) ==
+ ElemKind::FloatTy)) &&
 (NI.getInElemTy(ChannelwiseQuantizedConvolutionNode::ScalesIdx) ==
 ElemKind::FloatTy) &&
 (NI.getInElemTy(ChannelwiseQuantizedConvolutionNode::OffsetsIdx) ==
@@ -433,7 +439,7 @@ bool NNPIBackend::isOpSupported(const NodeInfo &NI) const {

 case Kinded::Kind::SoftMaxNodeKind:
 return NI.allInputsAndOutputsHaveSameElemKind(
- {ElemKind::FloatTy, ElemKind::Float16Ty},
+ {ElemKind::FloatTy, ElemKind::Float16Ty, ElemKind::Int8QTy},
 {SoftMaxNode::SelectedIdx}) &&
 (NI.getInElemTy(SoftMaxNode::SelectedIdx) == ElemKind::Int64ITy);

@@ -1105,3 +1111,82 @@ Expected<bool> NNPIBackend::transformPostLowering(

 return changed;
 }
+
+// Traverse the DAG and collect nodes in post order.
+static void
+traversePostOrder(const runtime::DAGNode *root,
+ std::unordered_set<const runtime::DAGNode *> &visited,
+ std::vector<const runtime::DAGNode *> &postOrder) {
+ if (root == nullptr) {
+ return;
+ }
+ visited.insert(root);
+ for (auto &c : root->children) {
+ if (visited.count(c) == 0) {
+ traversePostOrder(c, visited, postOrder);
+ }
+ }
+ postOrder.push_back(root);
+}
+
+Error NNPIBackend::bindContexts(
+ llvm::ArrayRef<runtime::ContextBinding> bindings,
+ const runtime::DAGNode *root, bool enableP2P, bool enableDRT) {
+ LOG(INFO) << "enableP2P/DRT not yet implemented. enableDRT = " << enableDRT
+ << ", enableP2P = " << enableP2P << ".\n";
+ if (backendOptions_.dumpRuntime) {
+ DotWriter::clear();
+ DotWriter::addSubGraph("Host", "Host");
+ }
+
+ // Need post order to ensure P2P destination resources are created before
+ // their source (since the source will handle the copy command).
+ std::unordered_set<const runtime::DAGNode *> visited;
+ std::vector<const runtime::DAGNode *> postOrder;
+ traversePostOrder(root, visited, postOrder);
+ runtime::PlaceholderUsageMap phUsage;
+ // Collect placeholder usage counts.
+ for (const auto &cb : bindings) {
+ runtime::NNPIDeviceManager *nnpiDM =
+ dynamic_cast<runtime::NNPIDeviceManager *>(cb.device);
+ LOG_IF_NOT_RETURN_LLVMERROR(nnpiDM, "Invalid device manager");
+ nnpiDM->addPlaceholderUsageCount(cb.networkName, phUsage);
+ }
+
+ for (const auto &usage : phUsage) {
+ LOG_IF_NOT_RETURN_LLVMERROR(
+ usage.second.numWriters < 2,
+ "Multiple writes to the same placeholder are not supported");
+ }
+
+ for (auto *dagNode : postOrder) {
+ if (dagNode->backendName != "NNPI") {
+ continue;
+ }
+
+ // Find the context binding for this node (assuming there's only one).
+ ExecutionContext *ctx = nullptr;
+ runtime::DeviceManager *devMgr = nullptr;
+ for (auto &cb : bindings) {
+ if (cb.networkName == dagNode->name) {
+ ctx = cb.context;
+ devMgr = cb.device;
+ break;
+ }
+ }
+ if (ctx && devMgr) {
+ runtime::NNPIDeviceManager *nnpiDM =
+ dynamic_cast<runtime::NNPIDeviceManager *>(devMgr);
+ LOG_IF_NOT_RETURN_LLVMERROR(nnpiDM, "Invalid device manager bound");
+ LOG_IF_NOT_RETURN_LLVMERROR(
+ !nnpiDM->bindContext(dagNode->name, ctx, phUsage),
+ "Failed to bind context");
+ }
+ }
+
+ if (backendOptions_.dumpRuntime) {
+ DotWriter::writeToFile(root->name);
+ }
+
+ return Error::success();
+}
diff --git a/lib/Backends/NNPI/NNPI.h b/lib/Backends/NNPI/NNPI.h
index 455be544db..7c12edbb4f 100644
--- a/lib/Backends/NNPI/NNPI.h
+++ b/lib/Backends/NNPI/NNPI.h
@@ -66,6 +66,10 @@ class NNPIBackend final : public Backend {
 NNPIDeviceOptions options({});
 return options.getSupportedOptions();
 };
+
+ virtual Error bindContexts(llvm::ArrayRef<runtime::ContextBinding> bindings,
+ const runtime::DAGNode *root, bool enableP2P,
+ bool enableDRT) override;
 /// @}

 private:
diff --git a/lib/Backends/NNPI/NNPICompiledFunction.cpp b/lib/Backends/NNPI/NNPICompiledFunction.cpp
index ddb3c7071c..97d996c9f0 100644
--- a/lib/Backends/NNPI/NNPICompiledFunction.cpp
+++ b/lib/Backends/NNPI/NNPICompiledFunction.cpp
@@ -210,9 +210,27 @@ Error NNPICompiledFunction::compile(Function *F, const BackendOptions &opts) {
 }

 compilationOptions_ = NNPICompilationOptions(newOpts.backendSpecificOpts);
+
+ if (compilationOptions_.compileOutputPostfix) {
+ compilationFileName_ = compilationOptions_.compiledFile.get() + "_" +
+ std::string(F->getName());
+ } else {
+ compilationFileName_ = compilationOptions_.compiledFile.get();
+ }
+ LOG_IF_NOT_RETURN_LLVMERROR(
+ compilationFileName_.length() < NNPI_MAX_STRING_LEN, "Bad filename");
+
 NNPIImporter importer(compilationOptions_);
 network_ = importer.importFunction(F, newOpts);
+ LOG_IF_INVALID_HANDLE_RETURN_LLVMERROR(network_, "Failed to import function");
+
+ // Setting the network name.
+ std::string networkName = compilationFileName_;
+ if (compilationFileName_.empty()) {
+ networkName = F->getName();
+ }
+ ASSERT_LOG_NNPI_ERROR(nnpiNetworkSetName(network_, networkName.c_str()),
+ "Failed to set NNPI network name");

 // Apply optimizations.
 NNPIOptimizationConfig optConf;
@@ -237,16 +255,32 @@ Error NNPICompiledFunction::compile(Function *F, const BackendOptions &opts) {

 RETURN_IF_ERR(setupCompilationHints(F, newOpts.backendSpecificNodeInfo));

- if (compilationOptions_.useIceT || compilationOptions_.inferOnDevice) {
- if (compilationOptions_.compileOutputPostfix) {
- compilationFileName_ = compilationOptions_.compiledFile.get() + "_" +
- std::string(F->getName());
- } else {
- compilationFileName_ = compilationOptions_.compiledFile.get();
+ // Collect input/output names.
+ { + size_t numInputs, numOutputs; + NNPIObjectName name; + NNPITensorDesc desc; + LOG_NNPI_IF_ERROR_RETURN_LLVMERROR( + nnpiNetworkGetInputNum(network_, &numInputs), + "Failed to query NNPI network inputs"); + for (size_t i = 0; i < numInputs; i++) { + LOG_NNPI_IF_ERROR_RETURN_LLVMERROR( + nnpiNetworkGetInputDesc(network_, i, name, &desc), + "Failed to query NNPI network inputs"); + inputNames_.push_back(name); + } + LOG_NNPI_IF_ERROR_RETURN_LLVMERROR( + nnpiNetworkGetOutputNum(network_, &numOutputs), + "Failed to query NNPI network outputs"); + for (size_t i = 0; i < numOutputs; i++) { + LOG_NNPI_IF_ERROR_RETURN_LLVMERROR( + nnpiNetworkGetOutputDesc(network_, i, name, &desc), + "Failed to query NNPI network outputs"); + outputNames_.push_back(name); } - LOG_IF_NOT_RETURN_LLVMERROR( - compilationFileName_.length() < NNPI_MAX_STRING_LEN, "Bad filename"); + } + if (compilationOptions_.useIceT || compilationOptions_.inferOnDevice) { if (compilationFileName_.empty()) // Compile to memory. { NNPIStream outFileStream; diff --git a/lib/Backends/NNPI/NNPICompiledFunction.h b/lib/Backends/NNPI/NNPICompiledFunction.h index 16c38ff170..ee298672be 100644 --- a/lib/Backends/NNPI/NNPICompiledFunction.h +++ b/lib/Backends/NNPI/NNPICompiledFunction.h @@ -81,6 +81,12 @@ class NNPICompiledFunction final : public CompiledFunction { return compilationFileName_; } + const std::vector &getInputNames() const { return inputNames_; } + + const std::vector &getOutputNames() const { + return outputNames_; + } + private: NNPINetwork network_; NNPICompilationConfig config_; @@ -90,6 +96,8 @@ class NNPICompiledFunction final : public CompiledFunction { std::unordered_set staticInputs_; NNPICompilationOptions compilationOptions_; std::string compilationFileName_; + std::vector inputNames_; + std::vector outputNames_; Error updateCompilationConfigFromOptions( NNPICompilationOptions &compilationOptions); diff --git a/lib/Backends/NNPI/NNPIDeviceManager.cpp b/lib/Backends/NNPI/NNPIDeviceManager.cpp index 39a332bf09..fdad1e9590 100755 --- a/lib/Backends/NNPI/NNPIDeviceManager.cpp +++ b/lib/Backends/NNPI/NNPIDeviceManager.cpp @@ -19,6 +19,7 @@ #include "NNPI.h" #include "NNPICompiledFunction.h" #include "NNPITracing.h" +#include "NNPIUtils.h" #include "glow/Support/Error.h" #include "nnpi_inference.h" #include "nnpi_transformer.h" @@ -63,8 +64,8 @@ NNPIDeviceManager::NNPIDeviceManager( std::shared_ptr deviceOptions, NNPIAdapter adapter, unsigned numInferenceWorkers) : DeviceManager(config), numWorkersPerFunction_(numInferenceWorkers), - deviceId_(config_.deviceID), adapter_(adapter), - device_(NNPI_INVALID_NNPIHANDLE), deviceOptions_(deviceOptions) { + deviceId_(config_.deviceID), device_(NNPI_INVALID_NNPIHANDLE), + deviceOptions_(deviceOptions), adapter_(adapter) { if (deviceOptions_->showVars) { LOG(INFO) << deviceOptions_->dumpStatus(); @@ -319,7 +320,7 @@ void NNPIDeviceManager::transferStaticPlaceholderToDevice( nnpiResource != nullptr, "Static placeholder no longer exists on the device", resultCB); - nnpiResource->UpdateDeviceResourceFromTensor(T, resultCB); + nnpiResource->updateDeviceResourceFromTensor(T, resultCB); }; Error NNPIDeviceManager::startDeviceTrace(TraceContext *traceContext) { @@ -338,5 +339,42 @@ Error NNPIDeviceManager::stopDeviceTrace(TraceContext *traceContext) { return Error::success(); } +Error NNPIDeviceManager::bindContext(std::string functionName, + ExecutionContext *ctx, + PlaceholderUsageMap &phUsage) { + if (deviceOptions_->dumpRuntime) { + DotWriter::addSubGraph(std::to_string(device_), + 
std::string("Device ") + std::to_string(deviceId_) + + " (" + DotWriter::getHexStr(device_) + ")"); + } + + // Create inference context. + ASSERT_WITH_MSG(inferenceEnvs_.count(functionName), "Invalid function name."); + std::shared_ptr infCtx( + inferenceEnvs_.at(functionName).createDetachedInferenceContext(phUsage)); + ASSERT_WITH_MSG(infCtx, "Failed to create detached context"); + + // Set the inference context into NNPIDeviceBinding and store in the ExCtx. + ctx->setDeviceBindings(std::make_unique(infCtx)); + return Error::success(); +} + +void NNPIDeviceManager::addPlaceholderUsageCount(std::string functionName, + PlaceholderUsageMap &phUsage) { + if (functions_.count(functionName)) { + NNPICompiledFunction *func = + dynamic_cast(functions_.at(functionName)); + ASSERT_WITH_MSG(func, "Invalid function."); + for (auto inputName : func->getInputNames()) { + phUsage[inputName].numReaders++; + phUsage[inputName].devices.insert(device_); + } + for (auto outputName : func->getOutputNames()) { + phUsage[outputName].numWriters++; + phUsage[outputName].devices.insert(device_); + } + } +} + } // namespace runtime } // namespace glow diff --git a/lib/Backends/NNPI/NNPIDeviceManager.h b/lib/Backends/NNPI/NNPIDeviceManager.h index 8d93cb5429..7aee8c6d6b 100644 --- a/lib/Backends/NNPI/NNPIDeviceManager.h +++ b/lib/Backends/NNPI/NNPIDeviceManager.h @@ -27,6 +27,8 @@ #include #include #include +#include +#include #include namespace glow { @@ -34,6 +36,7 @@ class NNPICompiledFunction; namespace runtime { class NNPIResource; +class InferenceContext; using StaticPlaceholderMap = std::unordered_map>; @@ -101,6 +104,29 @@ class NNPIDeviceManager : public DeviceManager { virtual Error startDeviceTrace(TraceContext *traceContext) override; virtual Error stopDeviceTrace(TraceContext *traceContext) override; + Error bindContext(std::string functionName, ExecutionContext *ctx, + PlaceholderUsageMap &phUsage); + void addPlaceholderUsageCount(std::string functionName, + PlaceholderUsageMap &phUsage); +}; + +class NNPIDeviceBindings : public DeviceBindings { +public: + NNPIDeviceBindings(std::shared_ptr &infCtx) + : DeviceBindings("NNPI"), infCtx_(infCtx) {} + + virtual ~NNPIDeviceBindings() {} + + std::unique_ptr clone() override { + return std::make_unique(infCtx_); + } + + std::shared_ptr getInferenceContext() const { + return infCtx_; + } + +private: + std::shared_ptr infCtx_; }; DeviceManager *createNNPIDeviceManager(const DeviceConfig &config, diff --git a/lib/Backends/NNPI/NNPIOptions.h b/lib/Backends/NNPI/NNPIOptions.h index 89582b0cd0..a9c72c6d7f 100644 --- a/lib/Backends/NNPI/NNPIOptions.h +++ b/lib/Backends/NNPI/NNPIOptions.h @@ -148,10 +148,16 @@ class NNPIBackendOptions : public NNPIOptions { "1" #endif ); + /// Dump runtime graph. + DECLARE_NNPI_OPTION(dumpRuntime, bool, "DumpRuntime", + "Dump runtime graph (bindContexts).", "NNPI_DUMP_RUNTIME", + "0"); + NNPIBackendOptions() { INIT_NNPI_OPTIONS(useIceT, llvm::StringMap()); INIT_NNPI_OPTIONS(inferOnDevice, llvm::StringMap()); INIT_NNPI_OPTIONS(showVars, llvm::StringMap()); + INIT_NNPI_OPTIONS(dumpRuntime, llvm::StringMap()); } virtual llvm::StringRef getOptionsName() const override { @@ -332,11 +338,23 @@ class NNPIDeviceOptions : public NNPIOptions { DECLARE_NNPI_OPTION(dumpIOtoFiles, bool, "DumpIOtoFiles", "Dump Inputs/Outputs to files.", "NNPI_DUMP_IO", "0"); /// Force using a specific AVX type. - DECLARE_NNPI_OPTION(avxType, int, "avxType", + DECLARE_NNPI_OPTION(avxType, int, "AvxType", "Force using a specific AVX type." "\n 0 = No AVX. 
" "\n 1 = Use AVX512. ", "NNPI_AVX_TYPE", "-1"); + /// Disable DRT support. + DECLARE_NNPI_OPTION(disableDRT, bool, "DisableDRT", + "Disable DRT support (copy to/from host instead).", + "NNPI_DISABLE_DRT", "0"); + /// Disable P2P support. + DECLARE_NNPI_OPTION(disableP2P, bool, "DisableP2P", + "Disable P2P support (copy to/from host instead).", + "NNPI_DISABLE_P2P", "0"); + /// Dump runtime graph. + DECLARE_NNPI_OPTION(dumpRuntime, bool, "DumpRuntime", + "Dump runtime graph (bindContexts).", "NNPI_DUMP_RUNTIME", + "0"); NNPIDeviceOptions(const llvm::StringMap ¶meters) { INIT_NNPI_OPTIONS(useIceT, parameters); @@ -349,6 +367,9 @@ class NNPIDeviceOptions : public NNPIOptions { INIT_NNPI_OPTIONS(enabledCommandLists, parameters); INIT_NNPI_OPTIONS(dumpIOtoFiles, parameters); INIT_NNPI_OPTIONS(avxType, parameters); + INIT_NNPI_OPTIONS(disableDRT, parameters); + INIT_NNPI_OPTIONS(disableP2P, parameters); + INIT_NNPI_OPTIONS(dumpRuntime, parameters); if (avxType == -1) { if (isStringFoundInCpuInfo("avx512f")) { diff --git a/lib/Backends/NNPI/NNPIResource.cpp b/lib/Backends/NNPI/NNPIResource.cpp index c92eeb3403..5afc5cc2eb 100644 --- a/lib/Backends/NNPI/NNPIResource.cpp +++ b/lib/Backends/NNPI/NNPIResource.cpp @@ -14,6 +14,7 @@ */ #include "NNPIResource.h" +#include "InferenceContext.h" #include "NNPIUtils.h" #include "nnpi_inference.h" #include @@ -85,6 +86,21 @@ static void DumpToFile(const std::string &filename, void *data, size_t size) { namespace glow { namespace runtime { +static std::shared_ptr +findResourceForDevice(const ResourceUsers &users, NNPIDeviceContext device) { + for (auto reader : users.readers) { + if (reader && reader->getDevice() == device) { + return reader; + } + } + for (auto writer : users.writers) { + if (writer && writer->getDevice() == device) { + return writer; + } + } + return nullptr; +} + NNPIResource::NNPIResource() { adapter_ = NNPI_INVALID_NNPIHANDLE; device_ = NNPI_INVALID_NNPIHANDLE; @@ -98,6 +114,9 @@ NNPIResource::NNPIResource() { usage_ = ResourceUsage::None; deviceOptions_ = nullptr; cmdListIdx_ = UINT32_MAX; + ownsDeviceResource_ = true; + p2pDevice_ = NNPI_INVALID_NNPIHANDLE; + p2pDeviceResource_ = NNPI_INVALID_NNPIHANDLE; } NNPIResource::~NNPIResource() { @@ -105,7 +124,7 @@ NNPIResource::~NNPIResource() { LOG_NNPI_INF_IF_ERROR(nnpiCopyCommandDestroy(copyCommand_), "Failed to destroy NNPI copy command"); } - if (deviceResource_ != NNPI_INVALID_NNPIHANDLE) { + if (ownsDeviceResource_ && (deviceResource_ != NNPI_INVALID_NNPIHANDLE)) { LOG_NNPI_INF_IF_ERROR(nnpiDeviceResourceDestroy(deviceResource_), "Failed to destroy NNPI device resource"); } @@ -121,7 +140,8 @@ bool NNPIResource::init(const NNPIObjectName name, std::shared_ptr deviceOptions, NNPIAdapter adapter, NNPIDeviceContext device, const NNPIResourceDesc *desc, - NNPIResource::ResourceUsage usage) { + NNPIResource::ResourceUsage usage, + PlaceholderUsageMap *phUsage) { if (name == nullptr || desc == nullptr || deviceOptions == nullptr) { return false; } @@ -150,6 +170,29 @@ bool NNPIResource::init(const NNPIObjectName name, return true; } + if (deviceOptions_->disableDRT) { + switch (usage_) { + case ResourceUsage::DRTInput: + usage_ = ResourceUsage::InputResource; + break; + case ResourceUsage::DRTOutput: + usage_ = ResourceUsage::OutputResource; + break; + default:; // Do nothing. 
+ }
+ }
+ if (deviceOptions_->disableP2P) {
+ switch (usage_) {
+ case ResourceUsage::P2PInput:
+ usage_ = ResourceUsage::InputResource;
+ break;
+ case ResourceUsage::P2POutput:
+ usage_ = ResourceUsage::OutputResource;
+ break;
+ default:; // Do nothing.
+ }
+ }
+
 // Create host resource (pinned and aligned allocation).
 NNPIResourceDesc hostResDesc =
 desc_; // Make a copy of the desc to overwrite attributes.
@@ -157,8 +200,9 @@ bool NNPIResource::init(const NNPIObjectName name,
 hostResDesc.hostAttrib.locklessExecution =
 1; // Set host resource to lockless.
 }
- if (usage != ResourceUsage::StaticInputResource) {
- // No host resource needed for static inputs.
+ switch (usage_) {
+ case ResourceUsage::InputResource:
+ case ResourceUsage::OutputResource:
 LOG_NNPI_INF_IF_ERROR_RETURN_FALSE(
 nnpiHostResourceCreate(adapter_, &hostResDesc, &hostResource_),
 "Failed to create NNPI host resource");
@@ -171,12 +215,83 @@ bool NNPIResource::init(const NNPIObjectName name,
 memset(hostPtr_, 0,
 CalcDescSize(&desc_)); // Clear host resource to zero for compile
 // only path (USE_ICE_T).
+ break;
+ case ResourceUsage::StaticInputResource:
+ case ResourceUsage::P2PInput:
+ case ResourceUsage::P2POutput:
+ case ResourceUsage::DRTInput:
+ case ResourceUsage::DRTOutput:
+ // No host resource is needed.
+ break;
+ default:
+ LOG_AND_RETURN_IF_NOT(ERROR, 0, "Invalid usage", false);
 }

 // Create device resource.
- LOG_NNPI_INF_IF_ERROR_RETURN_FALSE(
- nnpiDeviceResourceCreate(device_, &desc_, &deviceResource_),
- "Failed to create NNPI device resource");
+ bool allocateDeviceResource = false;
+
+ ResourceUsers *users = nullptr;
+ std::shared_ptr<NNPIResource> sharedResource = nullptr;
+ if (phUsage) {
+ users = &(phUsage->at(name_));
+ LOG_AND_RETURN_IF_NOT(ERROR, users, "Invalid resource users", false);
+ sharedResource = findResourceForDevice(*users, device_);
+ }
+
+ switch (usage_) {
+ case ResourceUsage::InputResource:
+ case ResourceUsage::OutputResource:
+ // Normal create.
+ allocateDeviceResource = true;
+ break;
+
+ case ResourceUsage::StaticInputResource:
+ case ResourceUsage::DRTInput:
+ // Potentially shared (allocate only if it doesn't exist yet).
+ if (sharedResource) {
+ // Already exists on the device - share it.
+ deviceResource_ = sharedResource->getDeviceResource();
+ ownsDeviceResource_ = false;
+ allocateDeviceResource = false;
+ } else {
+ allocateDeviceResource = true;
+ }
+ break;
+
+ case ResourceUsage::P2PInput:
+ // Create p2p input.
+ desc_.deviceAttrib.p2pUsage = NNPI_P2P_USAGE_DST;
+ desc_.deviceAttrib.p2pDepth = 1;
+ allocateDeviceResource = true;
+ break;
+
+ case ResourceUsage::P2POutput:
+ // Create p2p output.
+ desc_.deviceAttrib.p2pUsage = NNPI_P2P_USAGE_SRC;
+ desc_.deviceAttrib.p2pDepth = 1;
+ allocateDeviceResource = true;
+ break;
+
+ case ResourceUsage::DRTOutput:
+ // Must be shared (creation order ensures readers are allocated before
+ // writers).
+ LOG_AND_RETURN_IF_NOT(
+ ERROR, sharedResource,
+ "Missing DRT resource (should have been created already)", false);
+ deviceResource_ = sharedResource->getDeviceResource();
+ ownsDeviceResource_ = false;
+ allocateDeviceResource = false;
+ break;
+
+ default:
+ LOG_AND_RETURN_IF_NOT(ERROR, 0, "Invalid usage", false);
+ }
+
+ if (allocateDeviceResource) {
+ LOG_NNPI_INF_IF_ERROR_RETURN_FALSE(
+ nnpiDeviceResourceCreate(device_, &desc_, &deviceResource_),
+ "Failed to create NNPI device resource");
+ }

 // Create copy command.
   switch (usage_) {
@@ -192,8 +307,40 @@ bool NNPIResource::init(const NNPIObjectName name,
                                               deviceResource_, &copyCommand_),
         "Failed to create NNPI copy command (output)");
     break;
+  case ResourceUsage::P2POutput:
+    LOG_AND_RETURN_IF_NOT(ERROR, users, "Missing resource users", false);
+    for (auto reader : users->readers) {
+      if (reader && reader->getDevice() != device) {
+        p2pDevice_ = reader->getDevice();
+        p2pDeviceResource_ = reader->getDeviceResource();
+      }
+    }
+    LOG_AND_RETURN_IF_NOT(ERROR,
+                          (p2pDevice_ != NNPI_INVALID_NNPIHANDLE) &&
+                              (p2pDeviceResource_ != NNPI_INVALID_NNPIHANDLE),
+                          "Can't find p2p counterpart", false);
+    LOG_NNPI_INF_IF_ERROR_RETURN_FALSE(
+        nnpiCopyCommandCreateDeviceToDevice(p2pDevice_, p2pDeviceResource_,
+                                            device_, deviceResource_,
+                                            &copyCommand_),
+        "Failed to create NNPI copy command (p2p output)");
+    break;
+  case ResourceUsage::P2PInput:
+    // The device resource is copied by the writer in the
+    // preceding context.
+    break;
   case ResourceUsage::StaticInputResource:
-    // Fallthrough.
+    // The device resource doesn't need to be updated before
+    // inference.
+    break;
+  case ResourceUsage::DRTInput:
+    // The device resource doesn't need to be updated before
+    // inference.
+    break;
+  case ResourceUsage::DRTOutput:
+    // The device resource doesn't need to be updated before
+    // inference.
+    break;
   case ResourceUsage::None:
     // Do nothing - no copy command needed.
     break;
@@ -202,10 +349,11 @@ bool NNPIResource::init(const NNPIObjectName name,
     return false;
   }

+  DBG(__FUNCTION__ << dump());
   return true;
 }

-NNPIInferenceErrorCode NNPIResource::PreInference(Tensor *t,
+NNPIInferenceErrorCode NNPIResource::preInference(Tensor *t,
                                                   bool partialTensor) {
   if (usage_ != ResourceUsage::InputResource) {
     // Nothing to do here yet.
@@ -244,7 +392,7 @@ NNPIInferenceErrorCode NNPIResource::PreInference(Tensor *t,
   return NNPI_INF_NO_ERROR;
 }

-NNPIInferenceErrorCode NNPIResource::PostInference(Tensor *t) {
+NNPIInferenceErrorCode NNPIResource::postInference(Tensor *t) {
   if (usage_ != ResourceUsage::OutputResource) {
     // Nothing to do here yet.
     return NNPI_INF_NO_ERROR;
@@ -285,7 +433,7 @@ NNPIInferenceErrorCode NNPIResource::PostInference(Tensor *t) {
   return NNPI_INF_NO_ERROR;
 }

-bool NNPIResource::UpdateResourceDescFromTensorDesc(
+bool NNPIResource::updateResourceDescFromTensorDesc(
     NNPIResourceDesc *rDesc, const NNPITensorDesc *tDesc) {
   if (tDesc == nullptr || rDesc == nullptr) {
     return false;
@@ -325,7 +473,7 @@ bool NNPIResource::UpdateResourceDescFromTensorDesc(
   return true;
 }

-void NNPIResource::UpdateDeviceResourceFromTensor(
+void NNPIResource::updateDeviceResourceFromTensor(
     Tensor *t, std::function<void(Error)> resultCB) {
   LOG_AND_FAIL_CALLBACK_IF_NOT(
       t != nullptr, "Invalid tensor used to update static input", resultCB);
@@ -333,10 +481,12 @@ void NNPIResource::UpdateDeviceResourceFromTensor(
   LOG_AND_FAIL_CALLBACK_IF_NOT(updateHostResourceFromTensor(t, false),
                                "Invalid Static placeholder", resultCB);

-  LOG_NNPI_INF_IF_ERROR(nnpiDeviceResourceSubLoad(deviceResource_, 0,
-                                                  t->getUnsafePtr(),
-                                                  t->getSizeInBytes()),
-                        "Failed to execute device resource sub load");
+  if (deviceOptions_->inferOnDevice) {
+    LOG_AND_CALLBACK_NNPI_INF_IF_ERROR(
+        nnpiDeviceResourceSubLoad(deviceResource_, 0, t->getUnsafePtr(),
+                                  t->getSizeInBytes()),
+        "Failed to execute device resource sub load", resultCB);
+  }

   resultCB(Error::success());
 }
@@ -356,9 +506,10 @@ bool NNPIResource::updateHostResourceFromTensor(Tensor *t, bool partialTensor) {
     LOG_AND_RETURN_IF(ERROR, partialData,
                       "Static placeholders are not allowed to do partial copy",
                       false);
-
-    // nothing else to do for static placeholders.
-    return true;
+    if (deviceOptions_->inferOnDevice) {
+      // nothing else to do for static placeholders when running on device
+      return true;
+    }
   }

   if (downcastInt64) {
@@ -398,9 +549,9 @@ bool NNPIResource::updateHostResourceFromTensor(Tensor *t, bool partialTensor) {
   return true;
 }

-std::string NNPIResource::Dump() const {
+std::string NNPIResource::dump() const {
   std::stringstream stream;
-  stream << "NNPIResource: " << name_;
+  stream << "NNPIResource: \"" << name_ << '"';
   stream << ", DescSize: " << CalcDescSize(&desc_);
   stream << ", Usage: " << static_cast<int>(usage_);
   stream << ", Adapter: " << adapter_;
@@ -411,6 +562,9 @@ std::string NNPIResource::Dump() const {
   stream << ", CopyCommand: " << copyCommand_;
   stream << ", CommandListIndex: " << cmdListIdx_;
   stream << ", PartialSize: " << partialSize_;
+  stream << ", ownsDeviceResource: " << ownsDeviceResource_;
+  stream << ", p2pDevice: " << p2pDevice_;
+  stream << ", p2pDeviceResource: " << p2pDeviceResource_;
   return stream.str();
 }

diff --git a/lib/Backends/NNPI/NNPIResource.h b/lib/Backends/NNPI/NNPIResource.h
index 905e1279de..2f1d207721 100644
--- a/lib/Backends/NNPI/NNPIResource.h
+++ b/lib/Backends/NNPI/NNPIResource.h
@@ -27,6 +27,15 @@ class Tensor;

 namespace runtime {

+class NNPIResource;
+struct ResourceUsers {
+  unsigned numReaders = 0, numWriters = 0;
+  std::vector<std::shared_ptr<NNPIResource>> writers;
+  std::vector<std::shared_ptr<NNPIResource>> readers;
+  std::unordered_set<NNPIDeviceContext> devices;
+};
+using PlaceholderUsageMap = std::unordered_map<std::string, ResourceUsers>;
+
 /// This class holds metadata for an inference resource.
 class NNPIResource {
 public:
@@ -36,6 +45,10 @@ class NNPIResource {
     InputResource,
     OutputResource,
     StaticInputResource,
+    P2PInput,
+    P2POutput,
+    DRTInput,
+    DRTOutput,
   };

   /// Constructor.
@@ -46,41 +59,45 @@ class NNPIResource {
   /// handle.
   NNPIResource(const NNPIResource &) = delete;
   /// Update a device resource contents from a provided tensor.
-  void UpdateDeviceResourceFromTensor(Tensor *t,
+  void updateDeviceResourceFromTensor(Tensor *t,
                                       std::function<void(Error)> resultCB);

   /// Initialize a resource.
   bool init(const NNPIObjectName name,
             std::shared_ptr<NNPIDeviceOptions> deviceOptions,
             NNPIAdapter adapter, NNPIDeviceContext device,
-            const NNPIResourceDesc *desc, ResourceUsage usage);
+            const NNPIResourceDesc *desc, ResourceUsage usage,
+            PlaceholderUsageMap *phUsage = nullptr);

   /// Pre-inference processing on the resource.
-  NNPIInferenceErrorCode PreInference(Tensor *t, bool partialTensor);
+  NNPIInferenceErrorCode preInference(Tensor *t, bool partialTensor);
   /// Post-inference processing on the resource.
-  NNPIInferenceErrorCode PostInference(Tensor *t);
+  NNPIInferenceErrorCode postInference(Tensor *t);

   /// Getters.
-  inline const NNPIObjectName &GetName() const { return name_; }
-  inline NNPIDeviceContext GetDevice() const { return device_; }
-  inline const NNPIResourceDesc &GetDesc() const { return desc_; }
-  inline NNPIDeviceResource GetDeviceResource() const {
+  inline const NNPIObjectName &getName() const { return name_; }
+  inline NNPIDeviceContext getDevice() const { return device_; }
+  inline const NNPIResourceDesc &getDesc() const { return desc_; }
+  inline NNPIDeviceResource getDeviceResource() const {
     return deviceResource_;
   }
-  inline NNPIHostResource GetHostResource() const { return hostResource_; }
-  inline void *GetHostPtr() const { return hostPtr_; }
-  inline NNPICopyCommand GetCopyCommand() const { return copyCommand_; }
-  inline uint32_t GetCmdListIdx() const { return cmdListIdx_; }
-  inline void SetCmdListIdx(uint32_t idx) { cmdListIdx_ = idx; }
-  inline uint64_t GetPartialSize() const { return partialSize_; }
-  inline ResourceUsage GetUsage() const { return usage_; }
+  inline NNPIHostResource getHostResource() const { return hostResource_; }
+  inline void *getHostPtr() const { return hostPtr_; }
+  inline NNPICopyCommand getCopyCommand() const { return copyCommand_; }
+  inline uint32_t getCmdListIdx() const { return cmdListIdx_; }
+  inline void setCmdListIdx(uint32_t idx) { cmdListIdx_ = idx; }
+  inline uint64_t getPartialSize() const { return partialSize_; }
+  inline ResourceUsage getUsage() const { return usage_; }
+  inline NNPIDeviceResource getP2PDeviceResource() const {
+    return p2pDeviceResource_;
+  }

   /// Update a given NNPIResourceDesc struct from the data in an NNPITensorDesc
   /// struct.
-  static bool UpdateResourceDescFromTensorDesc(NNPIResourceDesc *rDesc,
+  static bool updateResourceDescFromTensorDesc(NNPIResourceDesc *rDesc,
                                                const NNPITensorDesc *tDesc);

-  /// Dump the state of a resource object
-  std::string Dump() const;
+  /// Dump the state of a resource object.
+  std::string dump() const;

 private:
   NNPIAdapter adapter_; // This handle isn't owned by the object.
@@ -96,6 +113,11 @@ class NNPIResource {
   std::shared_ptr<NNPIDeviceOptions> deviceOptions_;
   std::vector refStorage_;
   uint32_t cmdListIdx_;
+  bool ownsDeviceResource_; // Used for DRT (only one NNPIResource will own the
+                            // device resource).
+  NNPIDeviceContext p2pDevice_;     // The other device used in p2p.
+  NNPIDeviceResource
+      p2pDeviceResource_; // The resource on the other device used in p2p.

   /// Update the owned host resource with data taken from the given tensor.
   // return true when successful, false otherwise.
diff --git a/lib/Backends/NNPI/NNPIUtils.cpp b/lib/Backends/NNPI/NNPIUtils.cpp
new file mode 100644
index 0000000000..134aff99aa
--- /dev/null
+++ b/lib/Backends/NNPI/NNPIUtils.cpp
@@ -0,0 +1,106 @@
+/*
+ * Copyright (c) Glow Contributors. See CONTRIBUTORS file.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "NNPIUtils.h"
+#include "DebugMacros.h"
+#include <fstream>
+#include <sstream>
+
+unsigned DotWriter::graphId_(0);
+std::map<std::string, std::set<std::string>> DotWriter::subGraphNodes_;
+std::map<std::string, std::string> DotWriter::subGraphLabels_;
+std::set<std::string> DotWriter::edges_;
+
+static const std::string &getColorString(unsigned i) {
+  // Taking colors from the SVG scheme
+  static const std::vector<std::string> nodeColors = {
+      "mistyrose",  // function
+      "lightgreen", // host resource
+      "lightblue",  // normal device resource
+      "plum",       // static device resource
+      "lightcoral", // p2p device resource
+      "wheat",      // drt device resource
+      "lightgray",  // reserved
+      "sandybrown", // reserved
+      "turquoise",  // reserved
+      "seagreen",   // reserved
+  };
+  return nodeColors.at(i % nodeColors.size());
+}
+
+void DotWriter::clear() {
+  DotWriter::subGraphNodes_.clear();
+  DotWriter::subGraphLabels_.clear();
+  DotWriter::edges_.clear();
+}
+
+void DotWriter::addNode(std::string name, std::string label, unsigned color,
+                        std::string subGraph) {
+  ostringstream os;
+  os << name << " [\n";
+  os << "\tlabel = \"" << label << "\"\n";
+  os << "\tstyle=\"filled,rounded\"\n";
+  os << "\tfillcolor=" << getColorString(color) << "\n";
+  os << "];\n";
+  if (!subGraph.empty()) {
+    subGraphNodes_[subGraph].insert(os.str() /*name*/);
+  }
+}
+
+void DotWriter::addEdge(std::string src, std::string dst) {
+  edges_.insert(src + " -> " + dst + ";\n");
+}
+
+void DotWriter::writeToFile(std::string filename) {
+  if (filename.empty()) {
+    filename = "dot_graph";
+  }
+  filename = filename + std::to_string(graphId_++) + ".dot";
+  std::ofstream outFile(filename);
+  if (!outFile.is_open()) {
+    LOG(INFO) << "Failed to write dot file: " << filename;
+    return;
+  }
+  outFile << "digraph {\n";
+  outFile << "\tedge[color = black];\n";
+  outFile << "\trank = TB;\n";
+  outFile << "\tnode[shape = Mrecord, penwidth=2];\n";
+  outFile << "\n";
+
+  for (const auto &sg : subGraphNodes_) {
+    outFile << "subgraph "
+            << "cluster_" << sg.first << " {\n";
+    outFile << "\tlabel = \"" << subGraphLabels_.at(sg.first) << "\";\n";
+    for (const auto &n : sg.second) {
+      outFile << n; //<< ";\n";
+    }
+    outFile << "}\n";
+  }
+  for (const auto &e : edges_) {
+    outFile << e;
+  }
+
+  outFile << "\n";
+  outFile << "\t}\n";
+}
+
+void DotWriter::addSubGraph(std::string name, std::string label) {
+  subGraphLabels_[name] = label;
+}
+
+std::string DotWriter::getHexStr(uint64_t h) {
+  std::ostringstream os;
+  os << std::hex << h;
+  return os.str();
+}
\ No newline at end of file
diff --git a/lib/Backends/NNPI/NNPIUtils.h b/lib/Backends/NNPI/NNPIUtils.h
index afd1dd9950..4922237c71 100644
--- a/lib/Backends/NNPI/NNPIUtils.h
+++ b/lib/Backends/NNPI/NNPIUtils.h
@@ -17,6 +17,10 @@
 #define GLOW_BACKENDS_NNPI_NNPIUTILS_H

 #include 
+#include 
+#include 
+#include 
+#include 

 using namespace std;

@@ -31,4 +35,24 @@ inline void convertI64toI32(int64_t const *i64Data, int32_t *i32Data,
 void convertI64toI32_AVX512(int64_t const *i64Data, int32_t *i32Data,
                             uint32_t elements);

+// Static Dot writer (not thread safe).
+class DotWriter {
+public:
+  static void clear();
+  static void addNode(std::string name, std::string label, unsigned color = 0,
+                      std::string subGraph = {});
+  static void addEdge(std::string src, std::string dst);
+  static void writeToFile(std::string filename = {});
+  static void addSubGraph(std::string name, std::string label);
+  static std::string getHexStr(uint64_t h);
+
+private:
+  DotWriter() {} // Should only be used in a static fashion.
+
+  static unsigned graphId_;
+  static std::map<std::string, std::set<std::string>> subGraphNodes_;
+  static std::map<std::string, std::string> subGraphLabels_;
+  static std::set<std::string> edges_;
+};
+
 #endif // GLOW_BACKENDS_NNPI_NNPIUTILS_H
diff --git a/lib/Backends/NNPI/tests/NNPIDeviceManagerTest.cpp b/lib/Backends/NNPI/tests/NNPIDeviceManagerTest.cpp
index df8d839dac..57d182401c 100644
--- a/lib/Backends/NNPI/tests/NNPIDeviceManagerTest.cpp
+++ b/lib/Backends/NNPI/tests/NNPIDeviceManagerTest.cpp
@@ -26,8 +26,6 @@ struct BlacklistInitializer {
           {"MultiFunction/0", TestBlacklist::AnyDeviceAnyEngine},
           {"DeviceResidentTensors/0", TestBlacklist::AnyDeviceAnyEngine},
           {"AvailableMemory/0", TestBlacklist::AnyDeviceAnyEngine},
-          {"TransferStaticPlaceholderTest/0",
-           TestBlacklist::AnyDeviceSWEngine},
           {"CanHandleDeviceResidentTensors/0",
            TestBlacklist::AnyDeviceAnyEngine},
       };
diff --git a/lib/Backends/NNPI/tests/NNPIHostManagerTest.cpp b/lib/Backends/NNPI/tests/NNPIHostManagerTest.cpp
index e77df7cdc2..747b2f75f2 100644
--- a/lib/Backends/NNPI/tests/NNPIHostManagerTest.cpp
+++ b/lib/Backends/NNPI/tests/NNPIHostManagerTest.cpp
@@ -29,8 +29,7 @@ struct BlacklistInitializer {
           {"testStaticAssignmentP2POnly/0", TestBlacklist::AnyDeviceAnyEngine},
           {"testStaticAssignmentDeviceResidentTensorOnly/0",
            TestBlacklist::AnyDeviceAnyEngine},
-          {"testStaticAssignment/0", TestBlacklist::AnyDeviceAnyEngine},
-          {"runNetworkConcurrent/0", TestBlacklist::AnyDeviceAnyEngine},
+          {"testStaticAssignment/0", TestBlacklist::AnyDeviceHWEngine},
           {"ConcurrentAddRemoveUnique/0", TestBlacklist::AnyDeviceAnyEngine},
           {"ConcurrentAddRemoveUnique/0", TestBlacklist::AnyDeviceAnyEngine},
           {"testPartitionConfigReplication/0",
diff --git a/lib/Backends/NNPI/tests/NNPIOperatorTest.cpp b/lib/Backends/NNPI/tests/NNPIOperatorTest.cpp
index a7fb445fe6..8a4a6c343d 100644
--- a/lib/Backends/NNPI/tests/NNPIOperatorTest.cpp
+++ b/lib/Backends/NNPI/tests/NNPIOperatorTest.cpp
@@ -247,12 +247,10 @@ struct BlacklistInitializer {
            TestBlacklist::AnyDeviceAnyEngine},
           {"convTransposeConvolutionCompareSimpleK5S1P2I3/0",
            TestBlacklist::AnyDeviceAnyEngine},
-          {"ChannelwiseQuantizedGroupConvolution/0",
-           TestBlacklist::AnyDeviceAnyEngine},
           {"ChannelwiseQuantizedGroupConvolution3D/0",
            TestBlacklist::AnyDeviceAnyEngine},
           {"ChannelwiseQuantizedGroupConvolutionNonZero/0",
-           TestBlacklist::AnyDeviceAnyEngine},
+           TestBlacklist::AnyDeviceSWEngine},
       };
       TestBlacklist::prepareBlacklist(testBlacklistedSetups,
                                       backendTestBlacklist);