diff --git a/lib/Backends/NNPI/CMakeLists.txt b/lib/Backends/NNPI/CMakeLists.txt index 9363801da0..b7fc9be440 100644 --- a/lib/Backends/NNPI/CMakeLists.txt +++ b/lib/Backends/NNPI/CMakeLists.txt @@ -97,14 +97,6 @@ message(STATUS "[NNPI] NNPI_MG_LIB_DIR = ${NNPI_MG_LIB}") message(STATUS "[NNPI] GLOW_BINARY_DIR = ${GLOW_BINARY_DIR}") message(STATUS "[NNPI] NNPI_COLLECT_MEM_USAGE = ${NNPI_MEM_PROFILING}") -if (UNIX) - EXEC_PROGRAM(cat ARGS "/proc/cpuinfo" OUTPUT_VARIABLE CPUINFO) - STRING(REGEX REPLACE "^.*(avx512f).*$" "\\1" AVX512F_FLAG ${CPUINFO}) - if("avx512f" STREQUAL "${AVX512F_FLAG}") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mavx512f -mavx512vl -mavx512bw") - add_definitions(-DUSE_AVX) - endif() -endif() add_subdirectory(ClassGen) @@ -115,6 +107,8 @@ include_directories( ${NNPI_MG_API} ) +SET_SOURCE_FILES_PROPERTIES(NNPIUtils_AVX512.cpp PROPERTIES COMPILE_FLAGS "-mavx512f -mavx512vl -mavx512bw") + add_library(NNPI NNPI.cpp NNPICompiledFunction.cpp @@ -128,6 +122,8 @@ add_library(NNPI NNPIMLTraceWrapper.cpp InferenceContext.cpp NNPIResource.cpp + NNPIUtils_AVX512.cpp + NNPIAdapterContainer.cpp ) target_link_libraries(NNPI diff --git a/lib/Backends/NNPI/Importer.cpp b/lib/Backends/NNPI/Importer.cpp index cf1be73442..d8cc41c483 100644 --- a/lib/Backends/NNPI/Importer.cpp +++ b/lib/Backends/NNPI/Importer.cpp @@ -39,6 +39,22 @@ static std::string nodeValueName(const glow::NodeValue &nv) { std::to_string(nv.getResNo()); } +static inline NNPIErrorCode +convertLengthsModeToLengthType(glow::LengthsMode mode, + NNPI_LENGTH_TYPE &lengthType) { + switch (mode) { + case LengthsMode::Variable: + lengthType = NNPI_LENGTH_VARIABLE; + break; + case LengthsMode::AllOne: + lengthType = NNPI_LENGTH_ALL_ONE; + break; + default: + return NNPI_INVALID_PARAM; + } + return NNPI_NO_ERROR; +} + glow::NNPIImporter::NNPIImporter(const NNPICompilationOptions &compileOptions) : internalNameCounter_(0), network_(NNPI_INVALID_NNPIHANDLE), compileOptions_(compileOptions) { @@ -318,7 +334,6 
@@ void glow::NNPIImporter::updateDescQuantFromGlow( offsetTensor.c_str(), sizeof(desc.quantParams.params.gemmlowpPCQ.offsetsTensor)); } - } else { desc.quantParams.type = NNPI_QUANTIZATION_GEMMLOWP; desc.quantParams.params.gemlowp.scale = t.getScale(); @@ -637,6 +652,35 @@ class PoolNodeImporter : public INNPINodeImporter { } }; +template +class AdaptivePoolNodeImporter : public INNPINodeImporter { +public: + NNPIErrorCode importNode(Node *n, NNPIImporter &importer) override { + auto *glowPool = llvm::dyn_cast(n); + LOG_AND_RETURN_IF_NOT(ERROR, glowPool, "Bad node type", NNPI_INVALID_PARAM); + + // Overwrite input/output values for layout. + LOG_NNPI_IF_ERROR_RETURN_VALUE( + importer.addValue(nodeValueName(glowPool->getInput()), + glowPool->getInput().getType(), + /* alternativeLayout */ true), + "Failed to add tensor to NNPI"); + LOG_NNPI_IF_ERROR_RETURN_VALUE( + importer.addValue(nodeValueName(glowPool->getResult()), + glowPool->getResult().getType(), + /* alternativeLayout */ true), + "Failed to add tensor to NNPI"); + + importer.setUsedTensors({nodeValueName(glowPool->getInput())}, + {nodeValueName(glowPool->getResult())}); + + return nnpiNetworkAddAdaptivePoolingOp( + importer.getNetwork(), glowPool->getName().begin(), + nodeValueName(glowPool->getInput()).c_str(), + nodeValueName(glowPool->getResult()).c_str(), poolType); + } +}; + class FullyConnectedNodeImporter : public INNPINodeImporter { public: NNPIErrorCode importNode(Node *n, NNPIImporter &importer) override { @@ -1101,67 +1145,82 @@ class SplatNodeImporter : public INNPINodeImporter { importer.setUsedTensors({}, {nodeValueName(glowSplat->getResult())}); auto *destType = glowSplat->getResult().getType(); - // Create a constant tensor instead of a Splat (Tile) node. 
- NNPITensorDesc desc; - desc.attributes.value = 0; - desc.attributes.constant = 1; - - importer.updateDescQuantFromGlow(*destType, desc); - importer.updateDescDimsFromGlow(destType->dims(), desc); - - uint8_t *pData = new uint8_t[destType->getSizeInBytes()]; - uint8_t elem[8]; // Assuming no element larger than 8 bytes. - LOG_AND_RETURN_IF_NOT(ERROR, destType->getElementSize() <= 8, - "Bad dimansion", NNPI_INVALID_DIMS); - - switch (destType->getElementType()) { - case glow::ElemKind::FloatTy: { - float val = glowSplat->getValue(); - std::memcpy(elem, &val, sizeof(float)); - } break; - case glow::ElemKind::Float16Ty: { - float16_t val = glowSplat->getValue(); - std::memcpy(elem, &val, sizeof(float16_t)); - } break; - case glow::ElemKind::Int8QTy: { - float qfVal = round((glowSplat->getValue() / destType->getScale()) + - destType->getOffset()); - int8_t qVal = qfVal < static_cast(INT8_MIN) - ? INT8_MIN - : qfVal > static_cast(INT8_MAX) - ? INT8_MAX - : static_cast(qfVal); - std::memcpy(elem, &qVal, sizeof(int8_t)); - } break; - case glow::ElemKind::BoolTy: - elem[0] = (glowSplat->getValue() != 0); - break; - case glow::ElemKind::Int32ITy: { - int32_t val = static_cast(glowSplat->getValue()); - std::memcpy(elem, &val, sizeof(int32_t)); - } break; - case glow::ElemKind::Int64ITy: { - int64_t val = static_cast(glowSplat->getValue()); - std::memcpy(elem, &val, sizeof(int64_t)); - } break; - default: - LOG_AND_RETURN_IF_NOT(ERROR, 0, "Unhandled ElemKind for Splat output.", - NNPI_INVALID_PARAM); - return NNPI_NOT_IMPLEMENTED; + int32_t numDims = static_cast(destType->dims().size()); + float glowSplatValue = glowSplat->getValue(); + + std::vector finalShapeFilledWithOnes(numDims, 1); + + auto tileInputTensorName = NNPIImporter::internalName_ + + glowSplat->getName().str() + "_Tile_input"; + + if (destType->getElementType() != ElemKind::FloatTy) { + NNPITensorDesc convertInputDesc; + convertInputDesc.attributes.value = 0; + convertInputDesc.attributes.constant = 1; + 
convertInputDesc.quantParams.precision = NNPI_PRECISION_FLOAT32; + convertInputDesc.quantParams.type = NNPI_QUANTIZATION_NONE; + importer.updateDescDimsFromGlow(finalShapeFilledWithOnes, + convertInputDesc); + + auto convertInputTensorName = NNPIImporter::internalName_ + + glowSplat->getName().str() + + "_Tile_Convert_input"; + LOG_NNPI_IF_ERROR_RETURN_VALUE(importer.addTensor(convertInputTensorName, + convertInputDesc, + &glowSplatValue), + "Failed to add tensor"); + + auto convertName = NNPIImporter::internalName_ + + glowSplat->getName().str() + "_Tile_Convert"; + Type convertOutputType = + Type::newShape(*destType, finalShapeFilledWithOnes); + LOG_NNPI_IF_ERROR_RETURN_VALUE( + importer.addValue(tileInputTensorName, &convertOutputType), + "Failed to add value"); + + LOG_NNPI_IF_ERROR_RETURN_VALUE( + nnpiNetworkAddConvertOp(importer.getNetwork(), convertName.c_str(), + convertInputTensorName.c_str(), + tileInputTensorName.c_str()), + "Failed to add layer"); + } else { + NNPITensorDesc tileInputDesc; + tileInputDesc.attributes.value = 0; + tileInputDesc.attributes.constant = 1; + tileInputDesc.quantParams.precision = NNPI_PRECISION_FLOAT32; + tileInputDesc.quantParams.type = NNPI_QUANTIZATION_NONE; + importer.updateDescDimsFromGlow(finalShapeFilledWithOnes, tileInputDesc); + + LOG_NNPI_IF_ERROR_RETURN_VALUE(importer.addTensor(tileInputTensorName, + tileInputDesc, + &glowSplatValue), + "Failed to add tensor"); } - auto destSize(destType->size()); - auto elemSize(destType->getElementSize()); - for (size_t i = 0; i < destSize; i++) { - for (size_t j = 0; j < elemSize; j++) { - pData[i * elemSize + j] = elem[j]; - } + NNPITensorDesc repeatsDesc; + repeatsDesc.attributes.value = 0; + repeatsDesc.attributes.constant = 1; + repeatsDesc.quantParams.precision = NNPI_PRECISION_INT32; + repeatsDesc.quantParams.type = NNPI_QUANTIZATION_NONE; + importer.updateDescDimsFromGlow({destType->dims().size()}, repeatsDesc); + auto repeatsTensorName = NNPIImporter::internalName_ + + 
glowSplat->getName().str() + "_Tile_repeats"; + std::vector dims; + for (int i = 0; i < numDims; i++) { + dims.push_back(destType->dims()[i]); } - auto res = - importer.addTensor(nodeValueName(glowSplat->getResult()), desc, pData); - delete[] pData; - return res; + LOG_NNPI_IF_ERROR_RETURN_VALUE( + importer.addTensor(repeatsTensorName, repeatsDesc, dims.data()), + "Failed to add tensor"); + + auto tileNodeName = + NNPIImporter::internalName_ + glowSplat->getName().str() + "_Tile"; + + return nnpiNetworkAddTileOp(importer.getNetwork(), tileNodeName.c_str(), + tileInputTensorName.c_str(), + repeatsTensorName.c_str(), + nodeValueName(glowSplat->getResult()).c_str()); } }; @@ -1179,12 +1238,20 @@ class SLSNodeImporter : public INNPINodeImporter { }, {nodeValueName(glowSLS->getResult())}); + NNPI_LENGTH_TYPE lengthType; + LOG_AND_RETURN_IF_NOT( + ERROR, + convertLengthsModeToLengthType(glowSLS->getLengthsMode(), lengthType) == + NNPI_NO_ERROR, + "Unhandled SLS length type", NNPI_INVALID_PARAM); + return nnpiNetworkAddSparseLengthsWeightedSumOp( importer.getNetwork(), glowSLS->getName().begin(), nodeValueName(glowSLS->getData()).c_str(), nodeValueName(glowSLS->getResult()).c_str(), NULL, nodeValueName(glowSLS->getIndices()).c_str(), - nodeValueName(glowSLS->getLengths()).c_str(), false); + nodeValueName(glowSLS->getLengths()).c_str(), false, false, + glowSLS->getAvgLength(), lengthType); } }; @@ -1203,13 +1270,103 @@ class SLWSNodeImporter : public INNPINodeImporter { }, {nodeValueName(glowSLWS->getResult())}); + NNPI_LENGTH_TYPE lengthType; + LOG_AND_RETURN_IF_NOT( + ERROR, + convertLengthsModeToLengthType(glowSLWS->getLengthsMode(), + lengthType) == NNPI_NO_ERROR, + "Unhandled SLS length type", NNPI_INVALID_PARAM); + return nnpiNetworkAddSparseLengthsWeightedSumOp( importer.getNetwork(), glowSLWS->getName().begin(), nodeValueName(glowSLWS->getData()).c_str(), nodeValueName(glowSLWS->getResult()).c_str(), nodeValueName(glowSLWS->getWeights()).c_str(), 
nodeValueName(glowSLWS->getIndices()).c_str(), - nodeValueName(glowSLWS->getLengths()).c_str(), false); + nodeValueName(glowSLWS->getLengths()).c_str(), false, false, + glowSLWS->getAvgLength(), lengthType); + } +}; + +class EmbeddingBagNodeImporter : public INNPINodeImporter { +public: + NNPIErrorCode importNode(Node *n, NNPIImporter &importer) override { + auto *glowEmbeddingBag = llvm::dyn_cast(n); + LOG_AND_RETURN_IF_NOT(ERROR, glowEmbeddingBag, "Bad node type", + NNPI_INVALID_PARAM); + + bool hasEndOffset = glowEmbeddingBag->getHasEndOffset(); + LOG_AND_RETURN_IF_NOT(ERROR, hasEndOffset, + "[EmbeddingBag] hasEndOffset must be true", + NNPI_INVALID_PARAM); + + importer.setUsedTensors( + { + nodeValueName(glowEmbeddingBag->getData()), + nodeValueName(glowEmbeddingBag->getWeights()), + nodeValueName(glowEmbeddingBag->getIndices()), + nodeValueName(glowEmbeddingBag->getOffsets()), + }, + {nodeValueName(glowEmbeddingBag->getResult())}); + + NNPI_LENGTH_TYPE lengthType; + LOG_AND_RETURN_IF_NOT( + ERROR, + convertLengthsModeToLengthType(glowEmbeddingBag->getLengthsMode(), + lengthType) == NNPI_NO_ERROR, + "Unhandled SLS length type", NNPI_INVALID_PARAM); + + return nnpiNetworkAddSparseLengthsWeightedSumOp( + importer.getNetwork(), glowEmbeddingBag->getName().begin(), + nodeValueName(glowEmbeddingBag->getData()).c_str(), + nodeValueName(glowEmbeddingBag->getResult()).c_str(), + nodeValueName(glowEmbeddingBag->getWeights()).c_str(), + nodeValueName(glowEmbeddingBag->getIndices()).c_str(), + nodeValueName(glowEmbeddingBag->getOffsets()).c_str(), false, true, + glowEmbeddingBag->getAvgLength(), lengthType); + } +}; + +class EmbeddingBagByteRowwiseOffsetsNodeImporter : public INNPINodeImporter { +public: + NNPIErrorCode importNode(Node *n, NNPIImporter &importer) override { + auto *glowEBBRO = llvm::dyn_cast(n); + LOG_AND_RETURN_IF_NOT(ERROR, glowEBBRO, "Bad node type", + NNPI_INVALID_PARAM); + + bool hasEndOffset = glowEBBRO->getHasEndOffset(); + 
LOG_AND_RETURN_IF_NOT(ERROR, hasEndOffset, + "[EmbeddingBag] hasEndOffset must be true", + NNPI_INVALID_PARAM); + + importer.setUsedTensors( + { + nodeValueName(glowEBBRO->getData()), + nodeValueName(glowEBBRO->getWeights()), + nodeValueName(glowEBBRO->getIndices()), + nodeValueName(glowEBBRO->getOffsets()), + }, + {nodeValueName(glowEBBRO->getResult())}); + + bool usFp32Accum = !(glowEBBRO->getUseFP16Accumulation() && + (glowEBBRO->getResult().getType()->getElementType() == + glow::ElemKind::Float16Ty)); + + NNPI_LENGTH_TYPE lengthType; + LOG_AND_RETURN_IF_NOT( + ERROR, + convertLengthsModeToLengthType(glowEBBRO->getLengthsMode(), + lengthType) == NNPI_NO_ERROR, + "Unhandled SLS length type", NNPI_INVALID_PARAM); + + return nnpiNetworkAddSparseLengthsWeightedSumOp( + importer.getNetwork(), glowEBBRO->getName().begin(), + nodeValueName(glowEBBRO->getData()).c_str(), + nodeValueName(glowEBBRO->getResult()).c_str(), + nodeValueName(glowEBBRO->getWeights()).c_str(), + nodeValueName(glowEBBRO->getIndices()).c_str(), + nodeValueName(glowEBBRO->getOffsets()).c_str(), usFp32Accum, true, + glowEBBRO->getAvgLength(), lengthType); } }; @@ -1645,13 +1802,21 @@ class RQSLWSNodeImporter : public INNPINodeImporter { (glowSLWS->getResult().getType()->getElementType() == glow::ElemKind::Float16Ty)); + NNPI_LENGTH_TYPE lengthType; + LOG_AND_RETURN_IF_NOT( + ERROR, + convertLengthsModeToLengthType(glowSLWS->getLengthsMode(), + lengthType) == NNPI_NO_ERROR, + "Unhandled SLS length type", NNPI_INVALID_PARAM); + return nnpiNetworkAddSparseLengthsWeightedSumOp( importer.getNetwork(), glowSLWS->getName().begin(), nodeValueName(glowSLWS->getData()).c_str(), nodeValueName(glowSLWS->getResult()).c_str(), nodeValueName(glowSLWS->getWeights()).c_str(), nodeValueName(glowSLWS->getIndices()).c_str(), - nodeValueName(glowSLWS->getLengths()).c_str(), usFp32Accum); + nodeValueName(glowSLWS->getLengths()).c_str(), usFp32Accum, false, + glowSLWS->getAvgLength(), lengthType); } }; @@ -1674,12 
+1839,20 @@ class FRQSLSNodeImporter : public INNPINodeImporter { (glowSLWS->getResult().getType()->getElementType() == glow::ElemKind::Float16Ty)); + NNPI_LENGTH_TYPE lengthType; + LOG_AND_RETURN_IF_NOT( + ERROR, + convertLengthsModeToLengthType(glowSLWS->getLengthsMode(), + lengthType) == NNPI_NO_ERROR, + "Unhandled SLS length type", NNPI_INVALID_PARAM); + return nnpiNetworkAddSparseLengthsWeightedSumOp( importer.getNetwork(), glowSLWS->getName().begin(), nodeValueName(glowSLWS->getData()).c_str(), nodeValueName(glowSLWS->getResult()).c_str(), NULL, nodeValueName(glowSLWS->getIndices()).c_str(), - nodeValueName(glowSLWS->getLengths()).c_str(), usFp32Accum); + nodeValueName(glowSLWS->getLengths()).c_str(), usFp32Accum, false, + glowSLWS->getAvgLength(), lengthType); } }; @@ -1703,13 +1876,21 @@ class FRQSLWSNodeImporter : public INNPINodeImporter { (glowSLWS->getResult().getType()->getElementType() == glow::ElemKind::Float16Ty)); + NNPI_LENGTH_TYPE lengthType; + LOG_AND_RETURN_IF_NOT( + ERROR, + convertLengthsModeToLengthType(glowSLWS->getLengthsMode(), + lengthType) == NNPI_NO_ERROR, + "Unhandled SLS length type", NNPI_INVALID_PARAM); + return nnpiNetworkAddSparseLengthsWeightedSumOp( importer.getNetwork(), glowSLWS->getName().begin(), nodeValueName(glowSLWS->getData()).c_str(), nodeValueName(glowSLWS->getResult()).c_str(), nodeValueName(glowSLWS->getWeights()).c_str(), nodeValueName(glowSLWS->getIndices()).c_str(), - nodeValueName(glowSLWS->getLengths()).c_str(), usFp32Accum); + nodeValueName(glowSLWS->getLengths()).c_str(), usFp32Accum, false, + glowSLWS->getAvgLength(), lengthType); } }; @@ -1879,6 +2060,9 @@ std::unordered_map< glow::make_unique>()}, {"AvgPool", glow::make_unique>()}, + {"AdaptiveAvgPool", + glow::make_unique< + AdaptivePoolNodeImporter>()}, {"FullyConnected", glow::make_unique()}, {"SoftMax", glow::make_unique()}, {"Save", glow::make_unique()}, @@ -1956,6 +2140,9 @@ std::unordered_map< {"BatchNormalization", glow::make_unique()}, 
{"ChannelwiseQuantizedConvolution", glow::make_unique()}, + {"EmbeddingBag", glow::make_unique()}, + {"EmbeddingBagByteRowwiseOffsets", + glow::make_unique()}, }; } diff --git a/lib/Backends/NNPI/InferenceContext.cpp b/lib/Backends/NNPI/InferenceContext.cpp old mode 100644 new mode 100755 index 75aed1b50d..e62e76588a --- a/lib/Backends/NNPI/InferenceContext.cpp +++ b/lib/Backends/NNPI/InferenceContext.cpp @@ -52,16 +52,24 @@ bool InferenceContext::init( const std::unordered_set &staticInputs, std::shared_ptr deviceTracing, StaticPlaceholderMap *staticPlaceholderMap, - const NNPIDeviceOptions *deviceOptions, const std::string &functionName, - unsigned deviceId) { + std::shared_ptr deviceOptions, + const std::string &functionName, unsigned deviceId) { deviceOptions_ = deviceOptions; + deviceId_ = deviceId; nnpiNetwork_ = network; device_ = device; compilationConfig_ = config; partialInputs_ = &partialInputs; deviceTracing_ = deviceTracing; functionName_ = functionName; - deviceId_ = deviceId; + + // Initialize trace context titles with device ID. 
+ std::stringstream deviceInfo; + deviceInfo << "[Device #" << deviceId_ << "] "; + traceBackendExecuteContextName_ = deviceInfo.str() + TRACING_BACKEND_EXECUTE; + tracePreProcessContextName_ = deviceInfo.str() + TRACING_PRE_PROCESS; + traceInferenceContextName_ = deviceInfo.str() + TRACING_INFERENCE; + tracePostProcessContextName_ = deviceInfo.str() + TRACING_POST_PROCESS; LOG_AND_RETURN_IF(ERROR, staticPlaceholderMap == nullptr, "InferenceContext Init was called with an invalid " @@ -95,7 +103,7 @@ bool InferenceContext::init( ERROR, !NNPIResource::UpdateResourceDescFromTensorDesc(&rDesc, &desc), "Failed to update ResourceDesc", false); LOG_AND_RETURN_IF(ERROR, - !inputResources_.back()->Init( + !inputResources_.back()->init( name, deviceOptions_, adapter, device_, &rDesc, NNPIResource::ResourceUsage::InputResource), "Failed to init input resource", false); @@ -116,7 +124,7 @@ bool InferenceContext::init( ERROR, !NNPIResource::UpdateResourceDescFromTensorDesc(&rDesc, &desc), "Failed to update ResourceDesc", false); LOG_AND_RETURN_IF(ERROR, - !outputResources_.back()->Init( + !outputResources_.back()->init( name, deviceOptions_, adapter, device_, &rDesc, NNPIResource::ResourceUsage::OutputResource), "Failed to init input resource", false); @@ -141,6 +149,8 @@ bool InferenceContext::init( LOG_NNPI_INF_IF_ERROR_RETURN_FALSE( nnpiHostNetworkGetInputDesc(hostNetwork, i, name, &desc), "Failed to query NNPI host network input"); + memset(&desc.hostAttrib, 0, sizeof(desc.hostAttrib)); + memset(&desc.deviceAttrib, 0, sizeof(desc.deviceAttrib)); const auto isStaticInput = staticPlaceholders.count(name); if (isStaticInput) { @@ -154,7 +164,7 @@ bool InferenceContext::init( // Create a new static placeholder. 
inputResources_.emplace_back(std::make_shared()); LOG_AND_RETURN_IF(ERROR, - !inputResources_.back()->Init( + !inputResources_.back()->init( name, deviceOptions_, adapter, device_, &desc, NNPIResource::ResourceUsage::StaticInputResource), "Failed to init static input resource", false); @@ -164,7 +174,7 @@ bool InferenceContext::init( // Regular input resource - create it here. inputResources_.emplace_back(std::make_shared()); LOG_AND_RETURN_IF(ERROR, - !inputResources_.back()->Init( + !inputResources_.back()->init( name, deviceOptions_, adapter, device_, &desc, NNPIResource::ResourceUsage::InputResource), "Failed to init input resource", false); @@ -181,10 +191,11 @@ bool InferenceContext::init( LOG_NNPI_INF_IF_ERROR_RETURN_FALSE( nnpiHostNetworkGetOutputDesc(hostNetwork, i, name, &desc), "Failed to query NNPI host network output"); - + memset(&desc.hostAttrib, 0, sizeof(desc.hostAttrib)); + memset(&desc.deviceAttrib, 0, sizeof(desc.deviceAttrib)); outputResources_.emplace_back(std::make_shared()); LOG_AND_RETURN_IF(ERROR, - !outputResources_.back()->Init( + !outputResources_.back()->init( name, deviceOptions_, adapter, device_, &desc, NNPIResource::ResourceUsage::OutputResource), "Failed to init output resource", false); @@ -259,12 +270,15 @@ void InferenceContext::execute(RunIdentifierTy runId, std::unique_ptr ctx, runtime::ResultCBTy resultCB) { TRACE_EVENT_SCOPE(ctx->getTraceContext(), TraceLevel::REQUEST, - TRACING_BACKEND_EXECUTE); + traceBackendExecuteContextName_); if (ctx->getTraceContext()) { ctx->getTraceContext()->setThreadName( llvm::formatv("Inf ctx - device: {0}: {1}", deviceId_, functionName_) .str()); } + if (deviceTracing_) { + deviceTracing_->start(ctx->getTraceContext(), device_); + } // Pre inference input preparation. 
PlaceholderBindings &bindings = *ctx->getPlaceholderBindings(); @@ -307,7 +321,7 @@ void InferenceContext::execute(RunIdentifierTy runId, } } TRACE_EVENT_BEGIN(ctx->getTraceContext(), TraceLevel::COPY, - TRACING_PRE_PROCESS); + tracePreProcessContextName_); // Pre-inference std::vector rawInputs, rawOutputs; @@ -331,9 +345,9 @@ void InferenceContext::execute(RunIdentifierTy runId, if (deviceOptions_->enabledCommandLists < 1) { // No command lists (schedule individual commands). TRACE_EVENT_END(ctx->getTraceContext(), TraceLevel::COPY, - TRACING_PRE_PROCESS); + tracePreProcessContextName_); TRACE_EVENT_BEGIN(ctx->getTraceContext(), TraceLevel::OPERATOR, - TRACING_INFERENCE); + traceInferenceContextName_); // Queue inference. LOG_AND_CALLBACK_EXECUTE_NNPI_INF_IF_ERROR( nnpiInferCommandQueue(inferCmd_, 0), "Failed to queue infer command.", @@ -363,10 +377,9 @@ void InferenceContext::execute(RunIdentifierTy runId, } TRACE_EVENT_END(ctx->getTraceContext(), TraceLevel::COPY, - TRACING_PRE_PROCESS); + tracePreProcessContextName_); TRACE_EVENT_BEGIN(ctx->getTraceContext(), TraceLevel::OPERATOR, - TRACING_INFERENCE); - + traceInferenceContextName_); // Queue Command list LOG_AND_CALLBACK_EXECUTE_NNPI_INF_IF_ERROR( nnpiCommandListQueue(commandList_, &(cmdConfigs_.at(0)), usedConfigs), @@ -435,9 +448,9 @@ void InferenceContext::execute(RunIdentifierTy runId, } TRACE_EVENT_END(ctx->getTraceContext(), TraceLevel::OPERATOR, - TRACING_INFERENCE); + traceInferenceContextName_); TRACE_EVENT_BEGIN(ctx->getTraceContext(), TraceLevel::COPY, - TRACING_POST_PROCESS); + tracePostProcessContextName_); // Post inference output handling. 
for (unsigned i = 0, e = outputResources_.size(); i < e; ++i) { @@ -448,9 +461,12 @@ void InferenceContext::execute(RunIdentifierTy runId, ERROR, outputResources_[i]->PostInference(t) == NNPI_INF_NO_ERROR, "Failed in output PostInference", runId, ctx, resultCB); } - TRACE_EVENT_END(ctx->getTraceContext(), TraceLevel::COPY, - TRACING_POST_PROCESS); + TRACE_EVENT_END(ctx->getTraceContext(), TraceLevel::COPY, + tracePostProcessContextName_); + if (deviceTracing_) { + deviceTracing_->stopAndUpdate(ctx->getTraceContext(), device_); + } TRACE_EVENT_SCOPE_END(); // we move context in the line below // Invoke CB. diff --git a/lib/Backends/NNPI/InferenceContext.h b/lib/Backends/NNPI/InferenceContext.h index 6ed7a33e6f..28e83f5a75 100644 --- a/lib/Backends/NNPI/InferenceContext.h +++ b/lib/Backends/NNPI/InferenceContext.h @@ -27,6 +27,7 @@ namespace glow { namespace runtime { +class NNPIDeviceManager; using StaticPlaceholderMap = std::unordered_map>; @@ -53,9 +54,12 @@ class InferenceContext { std::shared_ptr deviceTracing_; /// NNPI Device configuration. - const NNPIDeviceOptions *deviceOptions_; + std::shared_ptr deviceOptions_; - // NNPI Resources. + /// NNPI Device id. + unsigned deviceId_; + + /// NNPI Resources. std::vector> inputResources_; std::vector> outputResources_; @@ -67,8 +71,11 @@ class InferenceContext { // Name for the function that we are executing. std::string functionName_; - // Logical device ID this context maps to. - unsigned deviceId_; + /// Trace context names. 
+ std::string traceBackendExecuteContextName_; + std::string tracePreProcessContextName_; + std::string traceInferenceContextName_; + std::string tracePostProcessContextName_; public: InferenceContext(); @@ -85,8 +92,8 @@ class InferenceContext { const std::unordered_set &staticInputs, std::shared_ptr deviceTracing, StaticPlaceholderMap *staticPlaceholderMap, - const NNPIDeviceOptions *deviceOptions, const std::string &functionName, - unsigned deviceId); + std::shared_ptr deviceOptions, + const std::string &functionName, unsigned deviceId); }; } // namespace runtime diff --git a/lib/Backends/NNPI/InferencePool.cpp b/lib/Backends/NNPI/InferencePool.cpp index 2d98b7b21a..245809bd85 100644 --- a/lib/Backends/NNPI/InferencePool.cpp +++ b/lib/Backends/NNPI/InferencePool.cpp @@ -50,11 +50,11 @@ Error InferencePoolEnv::init(unsigned numWorkers, NNPIAdapter adapter, std::shared_ptr deviceTracing, CompiledFunction *compiledFunction, StaticPlaceholderMap *staticPlaceholderMap, - const NNPIDeviceOptions *deviceOptions, + std::shared_ptr deviceOptions, const std::string &functionName, unsigned deviceId) { - deviceOptions_ = deviceOptions; + deviceId_ = deviceId; if (workersPool_) { return MAKE_ERR("InferencePool already initialized!"); } @@ -119,7 +119,7 @@ Error InferencePoolEnv::init(unsigned numWorkers, NNPIAdapter adapter, nnpiFunction->getCompilationConfig(), hostNetwork_, deviceNetwork_, adapter, device, nnpiFunction->getPartialInputs(), nnpiFunction->getStaticInputs(), deviceTracing_, staticPlaceholderMap, - deviceOptions, functionName, deviceId); + deviceOptions, functionName, deviceId_); if (!success) { return MAKE_ERR("Failed to initialize inferece context"); } diff --git a/lib/Backends/NNPI/InferencePool.h b/lib/Backends/NNPI/InferencePool.h index 716d68ad49..512da09b80 100644 --- a/lib/Backends/NNPI/InferencePool.h +++ b/lib/Backends/NNPI/InferencePool.h @@ -40,7 +40,8 @@ class InferencePoolEnv { NNPIHostNetwork hostNetwork_; NNPIDeviceNetwork deviceNetwork_; 
std::shared_ptr deviceTracing_; - const NNPIDeviceOptions *deviceOptions_; // why not shared ptr? + std::shared_ptr deviceOptions_; + unsigned deviceId_; public: InferencePoolEnv(); @@ -49,7 +50,7 @@ class InferencePoolEnv { std::shared_ptr deviceTracing, CompiledFunction *compiledFunction, StaticPlaceholderMap *staticPlaceholderMap, - const NNPIDeviceOptions *deviceOptions, + std::shared_ptr deviceOptions, const std::string &functionName, unsigned deviceId); void stop(bool block); void execute(RunIdentifierTy runId, std::unique_ptr ctx, diff --git a/lib/Backends/NNPI/NNPI.cpp b/lib/Backends/NNPI/NNPI.cpp index e2dd649bb8..ac86826bc4 100644 --- a/lib/Backends/NNPI/NNPI.cpp +++ b/lib/Backends/NNPI/NNPI.cpp @@ -14,6 +14,7 @@ */ #include "NNPI.h" +#include "DebugMacros.h" #include "NNPICompiledFunction.h" #include "NNPIDeviceManager.h" #include "glow/Graph/Nodes.h" @@ -78,23 +79,25 @@ int32_t GlowNNPINumParallelChunks = 1; } // namespace glow NNPIBackendOptions NNPIBackend::backendOptions_; +NNPIAdapterContainer NNPIBackend::adapter_; unsigned NNPIBackend::numDevices() { - // TODO: unify with numHabanaDevices. copy-paste with a different device - // name. - std::ifstream devices("/proc/bus/pci/devices"); - std::string device; - unsigned count = 0; - while (std::getline(devices, device)) { - if (device.find("sph_pcie") != std::string::npos) { - count++; - } - } - if (count > 0) { - return count; - } - // TODO: Fall back to emulator since GLOW_NNPI is set. This feels hacky. - return 1; + NNPIAdapter adapter = NNPI_INVALID_NNPIHANDLE; + NNPIAdapterInfo adapterInfo; + memset(&adapterInfo, 0, sizeof(adapterInfo)); + // Assuming ICE-Ref will be used if not able to create adapter or get adapter + // info (returning 1 device). 
+ LOG_AND_RETURN_IF_NOT( + ERROR, nnpiAdapterCreate(nullptr, &adapter) == NNPI_INF_NO_ERROR, + "Failed to create NNPI Adapter.", 1); + LOG_AND_RETURN_IF_NOT( + ERROR, nnpiAdapterGetInfo(adapter, &adapterInfo) == NNPI_INF_NO_ERROR, + "Failed get device info.", 1); + unsigned count = adapterInfo.numDevices; + LOG_NNPI_INF_IF_ERROR(nnpiAdapterDestroy(adapter), + "Failed to destroy NNPI Adapter"); + // Will return 1 device (for ICE-Ref) if 0 devices are found. + return std::max(count, (unsigned)1); } /// \returns whether \p type is 2 dimensional and unary. Usually the data input @@ -146,7 +149,6 @@ bool NNPIBackend::isOpSupported(const NodeInfo &NI) const { case Kinded::Kind::BatchedReduceMeanNodeKind: case Kinded::Kind::BatchedReduceMinNodeKind: case Kinded::Kind::LocalResponseNormalizationNodeKind: - case Kinded::Kind::AvgPoolNodeKind: case Kinded::Kind::BatchedAddNodeKind: case Kinded::Kind::TanhNodeKind: case Kinded::Kind::LogNodeKind: @@ -158,8 +160,10 @@ bool NNPIBackend::isOpSupported(const NodeInfo &NI) const { ElemKind::Int32ITy, ElemKind::Int64ITy}); case Kinded::Kind::BatchNormalizationNodeKind: + case Kinded::Kind::AvgPoolNodeKind: + case Kinded::Kind::AdaptiveAvgPoolNodeKind: return NI.allInputsAndOutputsHaveSameElemKind( - {ElemKind::FloatTy, ElemKind::Float16Ty}); + {ElemKind::FloatTy, ElemKind::Float16Ty, ElemKind::Int8QTy}); case Kinded::Kind::BatchMatMulNodeKind: case Kinded::Kind::PReluNodeKind: @@ -328,6 +332,28 @@ bool NNPIBackend::isOpSupported(const NodeInfo &NI) const { (NI.getInElemTy(SparseLengthsWeightedSumNode::LengthsIdx) == ElemKind::Int32ITy); + case Kinded::Kind::EmbeddingBagNodeKind: + return NI.allInputsAndOutputsHaveSameElemKind( + {ElemKind::FloatTy, ElemKind::Float16Ty, ElemKind::Int8QTy}, + {EmbeddingBagNode::IndicesIdx, EmbeddingBagNode::OffsetsIdx}) && + (NI.getInElemTy(EmbeddingBagNode::IndicesIdx) == + ElemKind::Int64ITy) && + (NI.getInElemTy(EmbeddingBagNode::OffsetsIdx) == ElemKind::Int64ITy); + + case 
Kinded::Kind::EmbeddingBagByteRowwiseOffsetsNodeKind: { + auto dataK = NI.getInElemTy(EmbeddingBagByteRowwiseOffsetsNode::DataIdx); + auto offsetsK = + NI.getInElemTy(EmbeddingBagByteRowwiseOffsetsNode::OffsetsIdx); + auto indicesK = + NI.getInElemTy(EmbeddingBagByteRowwiseOffsetsNode::IndicesIdx); + auto resultK = + NI.getOutElemTy(EmbeddingBagByteRowwiseOffsetsNode::ResultIdx); + return (dataK == ElemKind::UInt8FusedQTy || + dataK == ElemKind::UInt8FusedFP16QTy) && + (resultK == ElemKind::FloatTy || resultK == ElemKind::Float16Ty) && + (indicesK == ElemKind::Int64ITy) && (offsetsK == ElemKind::Int64ITy); + } + case Kinded::Kind::FusedRowwiseQuantizedSparseLengthsSumNodeKind: { auto dataK = NI.getInElemTy(FusedRowwiseQuantizedSparseLengthsSumNode::DataIdx); @@ -445,13 +471,14 @@ bool NNPIBackend::shouldLower(const Node *N) const { case Kinded::Kind::BatchMatMulNodeKind: case Kinded::Kind::BatchNormalizationNodeKind: case Kinded::Kind::ChannelwiseQuantizedConvolutionNodeKind: + case Kinded::Kind::AdaptiveAvgPoolNodeKind: + case Kinded::Kind::EmbeddingBagNodeKind: + case Kinded::Kind::EmbeddingBagByteRowwiseOffsetsNodeKind: return false; case Kinded::Kind::FusedRowwiseQuantizedSparseLengthsSumNodeKind: { const FusedRowwiseQuantizedSparseLengthsSumNode *SLSN = llvm::cast(N); - if ((backendOptions_.useIceT || backendOptions_.inferOnDevice) && - (SLSN->getData().getElementType() != ElemKind::UInt4FusedFP16QTy) && - (SLSN->getResult().getElementType() == ElemKind::Float16Ty)) { + if (SLSN->getResult().getElementType() == ElemKind::Float16Ty) { return false; // Don't lower == keep without weights } else { return true; @@ -477,7 +504,7 @@ bool NNPIBackend::shouldLower(const Node *N) const { runtime::DeviceManager * NNPIBackend::createDeviceManager(const runtime::DeviceConfig &deviceConfig) { - return createNNPIDeviceManager(deviceConfig); + return createNNPIDeviceManager(deviceConfig, &adapter_); } Expected> diff --git a/lib/Backends/NNPI/NNPI.h b/lib/Backends/NNPI/NNPI.h 
index 6a91e416d5..58a3b7eaae 100644 --- a/lib/Backends/NNPI/NNPI.h +++ b/lib/Backends/NNPI/NNPI.h @@ -16,6 +16,7 @@ #ifndef GLOW_NNPI_BACKEND_H #define GLOW_NNPI_BACKEND_H +#include "NNPIAdapterContainer.h" #include "NNPIOptions.h" #include "glow/Backend/Backend.h" @@ -75,6 +76,7 @@ class NNPIBackend final : public Backend { #endif /* FACEBOOK_INTERNAL */ static NNPIBackendOptions backendOptions_; + static NNPIAdapterContainer adapter_; }; } // namespace glow diff --git a/lib/Backends/NNPI/NNPIAdapterContainer.cpp b/lib/Backends/NNPI/NNPIAdapterContainer.cpp new file mode 100644 index 0000000000..7f9a103d34 --- /dev/null +++ b/lib/Backends/NNPI/NNPIAdapterContainer.cpp @@ -0,0 +1,40 @@ +/* + * Copyright (c) Glow Contributors. See CONTRIBUTORS file. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "NNPIAdapterContainer.h" + +using namespace glow; + +NNPIAdapterContainer::~NNPIAdapterContainer() { + std::unique_lock lock(adapterMutex_); + if (nnpiAdapter_ != NNPI_INVALID_NNPIHANDLE) { + LOG_NNPI_INF_IF_ERROR(nnpiAdapterDestroy(nnpiAdapter_), + "Failed to destroy NNPI adapter"); + nnpiAdapter_ = NNPI_INVALID_NNPIHANDLE; + } +} + +NNPIAdapter NNPIAdapterContainer::get(bool inferOnDevice) { + std::unique_lock lock(adapterMutex_); + if (inferOnDevice) { + if (nnpiAdapter_ == NNPI_INVALID_NNPIHANDLE) { + LOG_NNPI_INF_IF_ERROR(nnpiAdapterCreate(nullptr, &nnpiAdapter_), + "Failed to create NNPI Adapter"); + } + } else { + return NNPI_INVALID_NNPIHANDLE; + } + return nnpiAdapter_; +} diff --git a/lib/Backends/NNPI/NNPIAdapterContainer.h b/lib/Backends/NNPI/NNPIAdapterContainer.h new file mode 100644 index 0000000000..6b90fc19c3 --- /dev/null +++ b/lib/Backends/NNPI/NNPIAdapterContainer.h @@ -0,0 +1,41 @@ +/* + * Copyright (c) Glow Contributors. See CONTRIBUTORS file. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef GLOW_NNPI_ADAPTER_H +#define GLOW_NNPI_ADAPTER_H + +#include "DebugMacros.h" +#include "NNPIOptions.h" +#include "glow/Backend/Backend.h" +#include "nnpi_inference.h" +#include + +namespace glow { +class NNPIAdapterContainer { +public: + NNPIAdapterContainer() : nnpiAdapter_(NNPI_INVALID_NNPIHANDLE) {} + + ~NNPIAdapterContainer(); + + NNPIAdapter get(bool inferOnDevice); + +private: + /// NNPI Adapter handle. 
+ NNPIAdapter nnpiAdapter_; + /// Lock to synchronize function adding/removing to/from the device manager. + std::mutex adapterMutex_; +}; +} // namespace glow +#endif // GLOW_NNPI_ADAPTER_H diff --git a/lib/Backends/NNPI/NNPICompiledFunction.cpp b/lib/Backends/NNPI/NNPICompiledFunction.cpp index ca1c0b7404..63eea8d3f3 100644 --- a/lib/Backends/NNPI/NNPICompiledFunction.cpp +++ b/lib/Backends/NNPI/NNPICompiledFunction.cpp @@ -126,7 +126,12 @@ Error NNPICompiledFunction::compile(Function *F, const BackendOptions &opts) { } if (compilationOptions_.useIceT || compilationOptions_.inferOnDevice) { - compilationFileName_ = compilationOptions_.compiledFile.get(); + if (compilationOptions_.compileOutputPostfix) { + compilationFileName_ = compilationOptions_.compiledFile.get() + "_" + + std::string(F->getName()); + } else { + compilationFileName_ = compilationOptions_.compiledFile.get(); + } LOG_IF_NOT_RETURN_LLVMERROR( compilationFileName_.length() < NNPI_MAX_STRING_LEN, "Bad filename"); diff --git a/lib/Backends/NNPI/NNPIDeviceManager.cpp b/lib/Backends/NNPI/NNPIDeviceManager.cpp old mode 100644 new mode 100755 index f217d790b7..5945feb163 --- a/lib/Backends/NNPI/NNPIDeviceManager.cpp +++ b/lib/Backends/NNPI/NNPIDeviceManager.cpp @@ -18,6 +18,7 @@ #include "InferencePool.h" #include "NNPI.h" #include "NNPICompiledFunction.h" +#include "NNPITracing.h" #include "glow/Support/Error.h" #include "nnpi_inference.h" #include "nnpi_transformer.h" @@ -39,8 +40,16 @@ static llvm::cl::opt "per NNPI device, in kilobytes"), llvm::cl::location(GlowNNPIMemory)); -DeviceManager *createNNPIDeviceManager(const DeviceConfig &config) { - return new NNPIDeviceManager(config); +DeviceManager *createNNPIDeviceManager(const DeviceConfig &config, + NNPIAdapterContainer *adapter) { + std::shared_ptr deviceOptions = + std::make_shared(config.parameters); + NNPIAdapter nnpiAdapter = adapter->get(deviceOptions->inferOnDevice); + if (deviceOptions->inferOnDevice && nnpiAdapter == 
NNPI_INVALID_NNPIHANDLE) { + LOG(ERROR) << "Adapter allocation failed"; + return nullptr; + } + return new NNPIDeviceManager(config, deviceOptions, nnpiAdapter); } // 1K bytes. @@ -49,29 +58,32 @@ static constexpr uint64_t KB = 1 << 10; ////////////////////////////////////////////////////////////////////////// std::atomic NNPIDeviceManager::runIdentifier_; -NNPIDeviceManager::NNPIDeviceManager(const DeviceConfig &config, - unsigned numInferenceWorkers) +NNPIDeviceManager::NNPIDeviceManager( + const DeviceConfig &config, + std::shared_ptr deviceOptions, NNPIAdapter adapter, + unsigned numInferenceWorkers) : DeviceManager(config), numWorkersPerFunction_(numInferenceWorkers), - deviceId_(config_.deviceID), adapter_(NNPI_INVALID_NNPIHANDLE), - device_(NNPI_INVALID_NNPIHANDLE), deviceOptions_(config_.parameters) { - if (deviceOptions_.showVars) { - LOG(INFO) << deviceOptions_.dumpStatus(); + deviceId_(config_.deviceID), device_(NNPI_INVALID_NNPIHANDLE), + deviceOptions_(deviceOptions), adapter_(adapter) { + + if (deviceOptions_->showVars) { + LOG(INFO) << deviceOptions_->dumpStatus(); } - if (deviceOptions_.deviceID >= 0) { - deviceId_ = static_cast(deviceOptions_.deviceID); + if (deviceOptions_->deviceId >= 0) { + deviceId_ = static_cast(deviceOptions_->deviceId); } if (!numWorkersPerFunction_) { numWorkersPerFunction_ = 2; } - if (deviceOptions_.numWorkers > 0) { - numWorkersPerFunction_ = deviceOptions_.numWorkers; + if (deviceOptions_->numWorkers > 0) { + numWorkersPerFunction_ = deviceOptions_->numWorkers; } // Ice-ref not re-entrant for the same nnpiNetwork. numWorkersPerFunction_ = - deviceOptions_.inferOnDevice ? numWorkersPerFunction_ : 1; + deviceOptions_->inferOnDevice ? numWorkersPerFunction_ : 1; } NNPIDeviceManager::~NNPIDeviceManager() { @@ -93,22 +105,20 @@ NNPIDeviceManager::~NNPIDeviceManager() { << "Static placeholder has pending refs"; } + // Verify all static placeholders have no external refs. 
+ for (auto &res : staticPlaceholders_) { + LOG_ERROR_IF_NOT(res.second.use_count() == 0) + << "Static placeholder has pending refs"; + } + if (device_ != NNPI_INVALID_NNPIHANDLE) { LOG_NNPI_INF_IF_ERROR(nnpiDeviceContextDestroy(device_), "Failed to destroy NNPI device context"); device_ = NNPI_INVALID_NNPIHANDLE; } - - if (adapter_ != NNPI_INVALID_NNPIHANDLE) { - LOG_NNPI_INF_IF_ERROR(nnpiAdapterDestroy(adapter_), - "Failed to destroy NNPI adapter"); - adapter_ = NNPI_INVALID_NNPIHANDLE; - } } Error NNPIDeviceManager::init() { - LOG_IF_NOT_RETURN_LLVMERROR(adapter_ == NNPI_INVALID_NNPIHANDLE, - "Invalid NNPI adapter"); LOG_IF_NOT_RETURN_LLVMERROR(device_ == NNPI_INVALID_NNPIHANDLE, "Invalid NNPI device"); @@ -118,16 +128,12 @@ Error NNPIDeviceManager::init() { << info.minorVersion << "." << info.patchVersion << "." << info.minorPatchVersion; - if (deviceOptions_.inferOnDevice) { - // Create NNPI adapter. - LOG_NNPI_INF_IF_ERROR_RETURN_LLVMERROR( - nnpiAdapterCreate(nullptr, &adapter_), "Failed to create NNPI Adapter"); - + if (deviceOptions_->inferOnDevice) { // Create NNPI device. LOG_NNPI_INF_IF_ERROR_RETURN_LLVMERROR( nnpiDeviceContextCreate(adapter_, deviceId_, &device_), "Failed to create NNPI Device"); - if (deviceOptions_.enabledDeviceTracing) { + if (deviceOptions_->enabledDeviceTracing) { deviceTracing_ = NNPIDeviceTracing::getForDevice(deviceId_); } NNPIDeviceInfo deviceInfo; @@ -147,8 +153,8 @@ Error NNPIDeviceManager::init() { } if (GlowNNPIMemory > 0) { maxMemoryBytes_ = static_cast(GlowNNPIMemory) * KB; - } else if (deviceOptions_.deviceMemory > 0) { - maxMemoryBytes_ = static_cast(deviceOptions_.deviceMemory) * KB; + } else if (deviceOptions_->deviceMemory > 0) { + maxMemoryBytes_ = static_cast(deviceOptions_->deviceMemory) * KB; } runIdentifier_ = 0; @@ -196,7 +202,7 @@ void NNPIDeviceManager::addNetwork(const Module *module, usedMemoryBytes_ += functionCost_; // TODO:: static moduleSize. 
auto err = inferenceEnvs_[func.first].init( numWorkersPerFunction_, adapter_, device_, deviceTracing_, func.second, - &staticPlaceholders_, &deviceOptions_, func.first, deviceId_); + &staticPlaceholders_, deviceOptions_, func.first, deviceId_); if (err) { functions_.erase(func.first); lock.unlock(); @@ -278,8 +284,8 @@ Error NNPIDeviceManager::stop(bool block) { } uint64_t NNPIDeviceManager::getMaximumMemory() const { return maxMemoryBytes_; } uint64_t NNPIDeviceManager::getAvailableMemory() const { - if (GlowNNPIMemory == 0 && deviceOptions_.deviceMemory == 0 && - deviceOptions_.inferOnDevice) { + if (GlowNNPIMemory == 0 && deviceOptions_->deviceMemory == 0 && + deviceOptions_->inferOnDevice) { NNPIDeviceStatus devStatus; NNPIInferenceErrorCode res = nnpiDeviceGetStatus(deviceId_, &devStatus); if (res != NNPI_INF_NO_ERROR) { @@ -316,5 +322,21 @@ void NNPIDeviceManager::transferStaticPlaceholderToDevice( nnpiResource->UpdateDeviceResourceFromTensor(T, resultCB); }; +Error NNPIDeviceManager::startDeviceTrace(TraceContext *traceContext) { + if (!NNPIDeviceTracing::getForDevice(deviceId_)->start(traceContext, + device_)) { + return MAKE_ERR("Failed to start NNPI device trace."); + } + return Error::success(); +} + +Error NNPIDeviceManager::stopDeviceTrace(TraceContext *traceContext) { + if (!NNPIDeviceTracing::getForDevice(deviceId_)->stopAndUpdate(traceContext, + device_)) { + return MAKE_ERR("Failed to stop NNPI device trace."); + } + return Error::success(); +} + } // namespace runtime } // namespace glow diff --git a/lib/Backends/NNPI/NNPIDeviceManager.h b/lib/Backends/NNPI/NNPIDeviceManager.h index bae2e95a87..8d93cb5429 100644 --- a/lib/Backends/NNPI/NNPIDeviceManager.h +++ b/lib/Backends/NNPI/NNPIDeviceManager.h @@ -17,6 +17,7 @@ #define GLOW_BACKENDS_NNPI_NNPIDEVICEMANAGER_H #include "InferencePool.h" +#include "NNPIAdapterContainer.h" #include "NNPITracing.h" #include "glow/Backends/DeviceManager.h" #include "glow/Runtime/RuntimeTypes.h" @@ -30,7 +31,6 @@ 
namespace glow { class NNPICompiledFunction; - namespace runtime { class NNPIResource; @@ -74,10 +74,12 @@ class NNPIDeviceManager : public DeviceManager { /// manager). StaticPlaceholderMap staticPlaceholders_; /// NNPI Device options (environment variables + DeviceConfig options). - NNPIDeviceOptions deviceOptions_; + std::shared_ptr deviceOptions_; public: explicit NNPIDeviceManager(const DeviceConfig &config, + std::shared_ptr deviceOptions, + NNPIAdapter adapter, unsigned numInferenceWorkers = 0); virtual ~NNPIDeviceManager(); @@ -96,9 +98,13 @@ class NNPIDeviceManager : public DeviceManager { void transferStaticPlaceholderToDevice(Placeholder *PH, Tensor *T, std::function resultCB); + + virtual Error startDeviceTrace(TraceContext *traceContext) override; + virtual Error stopDeviceTrace(TraceContext *traceContext) override; }; -DeviceManager *createNNPIDeviceManager(const DeviceConfig &config); +DeviceManager *createNNPIDeviceManager(const DeviceConfig &config, + NNPIAdapterContainer *adapter); } // namespace runtime } // namespace glow diff --git a/lib/Backends/NNPI/NNPIMLTraceWrapper.cpp b/lib/Backends/NNPI/NNPIMLTraceWrapper.cpp old mode 100644 new mode 100755 index 9338f3c736..30e156132f --- a/lib/Backends/NNPI/NNPIMLTraceWrapper.cpp +++ b/lib/Backends/NNPI/NNPIMLTraceWrapper.cpp @@ -15,6 +15,7 @@ #include "NNPIMLTraceWrapper.h" #include "DebugMacros.h" +#include "nnpi_inference.h" #include #include #include @@ -36,6 +37,12 @@ static inline int64_t nanosecondsToMicrosecondsSigned(int64_t nanoseconds) { return nanoseconds / 1e3; } +static uint64_t inline getNow() { + return std::chrono::duration_cast( + std::chrono::steady_clock::now().time_since_epoch()) + .count(); +} + enum NNPITraceColumnIndex { NNPI_TRACE_PID_IDX = 0, NNPI_TRACE_CPU_IDX = 1, @@ -47,13 +54,6 @@ enum NNPITraceColumnIndex { class NNPITraceParser { public: - NNPITraceParser(uint64_t timeDiff, uint64_t upRefTime) - : timeDiff_(timeDiff), upRefTime_(upRefTime), refTimeDiff_(0.0){}; - - 
int64_t getTimeDiff() { - return abs(refTimeDiff_) > 0 ? refTimeDiff_ : timeDiff_; - }; - void parseLine(std::string line, NNPITraceEntry &entry) { size_t idx = 0; std::istringstream linestream(line); @@ -76,6 +76,7 @@ class NNPITraceParser { } case NNPI_TRACE_TIMESTAMP_IDX: { entry.deviceUpTime = getOriginTime(part); + entry.hostTime = entry.deviceUpTime; break; } case NNPI_TRACE_FUNCTION_IDX: { @@ -99,23 +100,6 @@ class NNPITraceParser { } idx++; } while (linestream); - setHostTime(entry); - } - - void setHostTime(NNPITraceEntry &entry) { - if (refTimeDiff_ == 0 && upRefTime_ != 0 && - entry.traceType == NNPI_TRACE_COPY) { - if (entry.params.count("isC2H") > 0 && entry.params["isC2H"] == "0" && - entry.params.count("state") > 0 && entry.params["state"] == "q") { - refTimeDiff_ = entry.deviceUpTime - upRefTime_; - timeDiff_ = refTimeDiff_; - } - } - if (refTimeDiff_ != 0) { - entry.hostTime = (entry.deviceUpTime - refTimeDiff_); - } else { - entry.hostTime = (entry.deviceUpTime - timeDiff_); - } } protected: @@ -124,7 +108,6 @@ class NNPITraceParser { std::string pid; while (std::getline(partSplitStream, pid, '-')) ; - return std::stoi(pid); } @@ -150,12 +133,32 @@ class NNPITraceParser { return NNPI_TRACE_DMA; } else if (part == "copy:") { return NNPI_TRACE_COPY; + } else if (part == "cmdlist:") { + return NNPI_TRACE_CMDLIST; + } else if (part == "icedrvExecuteNetwork:") { + return NNPI_TRACE_NETEXEC; + } else if (part == "runtime-subgraph:") { + return NNPI_TRACE_SUBGRAPH; } else if (part == "infreq:") { return NNPI_TRACE_INFER; } else if (part == "clock_sync:") { return NNPI_TRACE_CLOCK_SYNC; } else if (part == "tracing_mark_write:") { return NNPI_TRACE_MARK; + } else if (part == "vtune_time_sync:") { + return NNPI_TARCE_TIME_SYNC; + } else if (part == "runtime-infer-request:") { + return NNPI_TRACE_RUNTIME_INFER; + } else if (part == "icedrvScheduleJob:") { + return NNPI_TRACE_ICED_SCHED_JOB; + } else if (part == "icedrvCreateNetwork:") { + return 
NNPI_TARCE_ICED_CREAT_NET; + } else if (part == "icedrvNetworkResource:") { + return NNPI_TARCE_ICED_NET_RES; + } else if (part == "icedrvEventGeneration:") { + return NNPI_TARCE_ICED_NET_GEN; + } else if (part == "user_data:") { + return NNPI_TARCE_USER_DATA; } return NNPI_TRACE_OTHER; } @@ -171,61 +174,53 @@ class NNPITraceParser { value = value.substr(0, value.size() - 2); } entry.params[name] = value; - if (refTimeDiff_ == 0 && entry.traceType == NNPI_TRACE_CLOCK_SYNC && - name == "clock_diff_in_nanosec") { - // Nanoseconds to microseconds. - timeDiff_ = nanosecondsToMicrosecondsSigned(std::stol(value)); - } return true; } - int64_t timeDiff_; - uint64_t upRefTime_; - int64_t refTimeDiff_; }; -NNPITraceContext::NNPITraceContext(uint32_t eventsMask) - : devID_(0), devIDSet_(false), events_("copy,infreq") { - if (eventsMask) { - events_ = ""; - if (eventsMask & NNPI_TRACE_DMA) { - events_ += "dma,"; - } - if (eventsMask & NNPI_TRACE_COPY) { - events_ += "copy,"; - } - if (eventsMask & NNPI_TRACE_INFER) { - events_ += "infreq"; - } - } - createContext(); -} +#define NNPI_SOFTWARE_EVENTS \ + "cmdlist,copy,cpylist_create,icedrvCreateContext,icedrvCreateNetwork," \ + "icedrvDestroyContext,icedrvDestroyNetwork,icedrvEventGeneration," \ + "icedrvExecuteNetwork,icedrvNetworkResource,icedrvScheduleJob,inf_net_" \ + "subres,infreq,runtime_sw_events.runtime.infer,runtime_sw_events.runtime." \ + "subgraph,user_data" -NNPITraceContext::~NNPITraceContext() { - nnpimlDestroyTraceContext(traceCtx_); - traceCtx_ = 0; -} +NNPITraceContext::NNPITraceContext(unsigned devID) + : traceCtx_(0), devID_(devID), devIDSet_(false), + events_(NNPI_SOFTWARE_EVENTS) {} -bool NNPITraceContext::startCapture() const { - if (!(1UL << devID_ & devMask_)) { - // Can't start for this device. 
+NNPITraceContext::~NNPITraceContext() { destroyInternalContext(); } + +bool NNPITraceContext::startCapture(NNPIDeviceContext deviceContext) { + if (!createInternalContext()) { + LOG(WARNING) << "nnpi_trace: Failed to create trace device context."; return false; } nnpimlTraceOptions traceOptions; std::memset(&traceOptions, 0, sizeof(nnpimlTraceOptions)); traceOptions.max_bytes = MAX_TRACE_BUFFER_SIZE; traceOptions.max_bytes_valid = true; + nnpimlStatus mlStatus = nnpimlTraceStart(traceCtx_, devID_, &traceOptions, events_.c_str()); if (mlStatus != NNPIML_SUCCESS) { - LOG(WARNING) << "nnpi_trace: Failed to read trace file, err=" << mlStatus; + LOG(WARNING) << "nnpi_trace: Failed to start trace, err=" << mlStatus; return false; } + LOG_NNPI_INF_IF_ERROR( + nnpiDeviceContextTraceUserData(deviceContext, "BG", getNow()), + "Failed to inject trace timestamp - device trace may not be " + "synchronized"); return true; } -bool NNPITraceContext::stopCapture() const { +bool NNPITraceContext::stopCapture(NNPIDeviceContext deviceContext) const { uint32_t outBytes, discardEvents; + LOG_NNPI_INF_IF_ERROR( + nnpiDeviceContextTraceUserData(deviceContext, "EN", getNow()), + "Failed to inject trace timestamp - device trace may not be " + "synchronized"); nnpimlStatus mlStatus = nnpimlTraceStop(traceCtx_, devID_, &outBytes, &discardEvents); if (mlStatus != NNPIML_SUCCESS) { @@ -234,76 +229,114 @@ bool NNPITraceContext::stopCapture() const { return true; } -bool NNPITraceContext::load() { - entries_.clear(); - std::stringstream inputStream; +bool NNPITraceContext::readTraceOutput(std::stringstream &inputStream) { + char readData[TRACE_READ_BUFFER_SIZE + 1]; uint32_t size = TRACE_READ_BUFFER_SIZE; uint32_t actualSize = size; - char readData[TRACE_READ_BUFFER_SIZE + 1]; - // Read trace bytes into stream. 
+ uint32_t offset = 0; while (actualSize >= size) { nnpimlStatus mlStatus = - nnpimlTraceRead(traceCtx_, devID_, 0, size, readData, &actualSize); + nnpimlTraceRead(traceCtx_, devID_, offset, size, readData, &actualSize); inputStream.write(readData, actualSize); + offset += actualSize; if (mlStatus != NNPIML_SUCCESS) { // Failed to read trace. return false; } } + return true; +} + +bool NNPITraceContext::load() { + entries_.clear(); + std::stringstream inputStream; + + if (!readTraceOutput(inputStream)) { + destroyInternalContext(); + return false; + } + destroyInternalContext(); // Handle stream. std::string line; - NNPITraceParser parser(timeDiff_, upRefTime_); + NNPITraceParser parser; + bool started = false; + uint64_t glowStart = 0; + uint64_t glowEnd = 0; + uint64_t nnpiStart = 0; + uint64_t nnpiEnd = 0; + while (std::getline(inputStream, line)) { if (line.find("#", 0) == 0) { // Skip comment. continue; } NNPITraceEntry entry; - parser.parseLine(line, entry); - entries_.push_back(entry); - if (timeDiff_ != parser.getTimeDiff()) { - // On time diff updated update old entries. - timeDiff_ = parser.getTimeDiff(); - for (std::vector::iterator it = - entries_.begin() + timeUpdatedIndex_; - it != entries_.end() - 1; ++it) { - parser.setHostTime(*it); - timeUpdatedIndex_++; + if (entry.traceType == NNPI_TARCE_USER_DATA) { + if (!started && entry.params["key"] == "BG") { + auto p = entry.params["user_data"]; + glowStart = std::stol(entry.params["user_data"]); + nnpiStart = entry.deviceUpTime; + started = true; + } else if (entry.params["key"] == "EN") { + auto p = entry.params["user_data"]; + glowEnd = std::stol(entry.params["user_data"]); + nnpiEnd = entry.deviceUpTime; + started = false; } } + if (started) { + entries_.push_back(entry); + } + } + if (glowStart > 0 && glowEnd > 0 && nnpiStart > 0 && nnpiEnd > 0) { + // Calculate host time function. 
+ double m = (double)(glowEnd - glowStart) / (double)(nnpiEnd - nnpiStart); + int64_t C = glowStart - m * nnpiStart; + // Update host time. + for (NNPITraceEntry &entry : entries_) { + entry.hostTime = entry.deviceUpTime * m + C; + } + } else { + LOG(WARNING) << "Failed to synchronize glow and nnpi device traces."; } return true; } -bool NNPITraceContext::setDeviceID(uint32_t devID) { - if (devIDSet_) { +bool NNPITraceContext::destroyInternalContext() { + if (traceCtx_ == 0) { return false; } - if (!(1UL << devID & devMask_)) { - // Can't start for this device. + nnpimlStatus mlStatus = nnpimlDestroyTraceContext(traceCtx_); + traceCtx_ = 0; + if (mlStatus != NNPIML_SUCCESS) { + LOG(WARNING) << "nnpi_trace: Failed to stop device trace, err=" << mlStatus; + traceCtx_ = 0; return false; } - devIDSet_ = true; - devID_ = devID; + return true; } -bool NNPITraceContext::createContext() { +bool NNPITraceContext::createInternalContext() { + if (traceCtx_ != 0) { + return false; + } + devMask_ = 1UL << devID_; nnpimlStatus mlStatus = - nnpimlCreateTraceContext(UINT64_MAX, &traceCtx_, &devMask_); + nnpimlCreateTraceContext(devMask_, &traceCtx_, &devMask_); if (mlStatus != NNPIML_SUCCESS) { - LOG(WARNING) << "nnpi_trace: Failed to read trace file, err=" << mlStatus; + LOG(WARNING) << "nnpi_trace: Failed to start device trace, err=" + << mlStatus; traceCtx_ = 0; return false; } - return true; -} - -void NNPITraceContext::markInputCopyStart(uint64_t uptime) { - if (upRefTime_ == 0) { - upRefTime_ = uptime; + if (!(1UL << devID_ & devMask_)) { + destroyInternalContext(); + LOG(WARNING) << "nnpi_trace: Cloud not open trace for device " << devID_; + return false; } -} + return true; +} \ No newline at end of file diff --git a/lib/Backends/NNPI/NNPIMLTraceWrapper.h b/lib/Backends/NNPI/NNPIMLTraceWrapper.h old mode 100644 new mode 100755 index b67f86fbc7..3d113c5304 --- a/lib/Backends/NNPI/NNPIMLTraceWrapper.h +++ b/lib/Backends/NNPI/NNPIMLTraceWrapper.h @@ -17,6 +17,7 @@ #define 
NNPI_NNPITRACING_ML_WRAPPER_H #include +#include #include #include @@ -27,6 +28,16 @@ enum NNPITraceType { NNPI_TRACE_COPY = 0x0004, NNPI_TRACE_MARK = 0x0008, NNPI_TRACE_CLOCK_SYNC = 0x0010, + NNPI_TRACE_CMDLIST = 0x0020, + NNPI_TRACE_NETEXEC = 0x0040, + NNPI_TRACE_SUBGRAPH = 0x0080, + NNPI_TARCE_TIME_SYNC = 0x0100, + NNPI_TRACE_RUNTIME_INFER = 0x0200, + NNPI_TRACE_ICED_SCHED_JOB = 0x0400, + NNPI_TARCE_ICED_CREAT_NET = 0x0800, + NNPI_TARCE_ICED_NET_RES = 0x1000, + NNPI_TARCE_ICED_NET_GEN = 0x1001, + NNPI_TARCE_USER_DATA = 0x4000, NNPI_TRACE_OTHER = 0x8000 }; @@ -43,12 +54,12 @@ struct NNPITraceEntry { /// Device trace api wrapper. class NNPITraceContext { public: - NNPITraceContext(uint32_t eventsMask); + NNPITraceContext(unsigned devID); virtual ~NNPITraceContext(); /// Start capturing traces from the HW device. - bool startCapture() const; + bool startCapture(NNPIDeviceContext deviceContext); /// Start capturing. - bool stopCapture() const; + bool stopCapture(NNPIDeviceContext deviceContext) const; /// Load traces (valid only after stopCapture()). bool load(); /// Returns the number of traces captured and loaded (valid only after @@ -56,8 +67,6 @@ class NNPITraceContext { size_t getTraceCount() const { return entries_.size(); } /// Read a loaded entry by index. NNPITraceEntry &getEntry(int index) { return entries_[index]; } - /// Allowed only once!!!. - bool setDeviceID(uint32_t devID); /// Get the context device ID. uint32_t getDeviceID() const { return devID_; } /// Returns true if device ID was set, false otherwise. @@ -65,22 +74,17 @@ class NNPITraceContext { /// Get a vector of the loaded entries (valid only after load()). std::vector getEntries() const { return entries_; } - /// Use to sync device and host clocks by flagging the first input copy on the - /// host. 
- void markInputCopyStart(uint64_t uptime); - private: - bool createContext(); + bool destroyInternalContext(); + bool createInternalContext(); + bool readTraceOutput(std::stringstream &inputStream); nnpimlTraceContext traceCtx_{0}; uint64_t devMask_{0}; - uint32_t devID_{0}; + unsigned devID_{0}; bool devIDSet_{false}; std::string events_; std::vector entries_; - uint64_t upRefTime_{0}; - int64_t timeDiff_{0}; - size_t timeUpdatedIndex_{0}; }; #endif // NNPI_NNPITRACING_ML_WRAPPER_H diff --git a/lib/Backends/NNPI/NNPIOptions.h b/lib/Backends/NNPI/NNPIOptions.h index 62b43c79c4..863666727b 100644 --- a/lib/Backends/NNPI/NNPIOptions.h +++ b/lib/Backends/NNPI/NNPIOptions.h @@ -16,6 +16,7 @@ #ifndef GLOW_NNPI_ENV_VARIABLES_H #define GLOW_NNPI_ENV_VARIABLES_H +#include "NNPIUtils.h" #include "nnpi_transformer_types.h" #include "llvm/ADT/StringMap.h" #include "llvm/ADT/StringRef.h" @@ -27,6 +28,24 @@ namespace glow { +// Return true in case cpuinfo contains flag. +static bool isStringFoundInCpuInfo(const char *flag) { + FILE *cpuinfo = fopen("/proc/cpuinfo", "rb"); + char *arg = nullptr; + size_t size = 0; + bool found = false; + while ((found == false) && (getdelim(&arg, &size, 32, cpuinfo) != -1)) { + if (strncmp(arg, flag, strlen(flag)) == 0) { + found = true; + } + } + if (arg) { + free(arg); + } + fclose(cpuinfo); + return found; +} + /// Parent calls for all NNPI option knobs. class NNPIOptions { public: @@ -179,6 +198,13 @@ class NNPICompilationOptions : public NNPIOptions { "Sets a file name to save the compilation output to the " "filename specified.", "ICE_T_FILE", ""); + /// Use function name for compilation compilation output filename (works only + /// when CompiledFile is not empty). 
+ DECLARE_NNPI_OPTION( + compileOutputPostfix, bool, "compileOutputPostfix", + "Use function name as postfix for compilation output filename (or as the " + "name of the function when CompiledFile option is empty).", + "ICE_T_FILE_POSTFIX", "0"); /// Setting this variable will force compilation to use no more than /// the set amount of ice cores (1-12), -1 for unlimited. DECLARE_NNPI_OPTION( @@ -214,13 +240,14 @@ class NNPICompilationOptions : public NNPIOptions { /// Disable constant folding during compilation. DECLARE_NNPI_OPTION(disableConstFolding, bool, "DisableConstFolding", "Disable constant folding during compilation.", - "NNPI_DISABLE_CONSTFOLD", "1"); + "NNPI_DISABLE_CONSTFOLD", "0"); NNPICompilationOptions(const std::map ¶meters) { INIT_NNPI_OPTIONS(useIceT, parameters); INIT_NNPI_OPTIONS(inferOnDevice, parameters); INIT_NNPI_OPTIONS(showVars, parameters); INIT_NNPI_OPTIONS(compiledFile, parameters); + INIT_NNPI_OPTIONS(compileOutputPostfix, parameters); INIT_NNPI_OPTIONS(iceCores, parameters); INIT_NNPI_OPTIONS(useSymlowp, parameters); INIT_NNPI_OPTIONS(deviceVersion, parameters); @@ -268,7 +295,7 @@ class NNPIDeviceOptions : public NNPIOptions { ); /// Setting this variable will override the target device ID used to run /// (0,1,...). - DECLARE_NNPI_OPTION(deviceID, int, "DeviceID", + DECLARE_NNPI_OPTION(deviceId, int, "DeviceID", "Override the target device ID used to run (0,1,...).", "NNPI_DEVICE_ID", "-1"); /// Setting this variable will override the amount of worker threads allocated @@ -283,7 +310,7 @@ class NNPIDeviceOptions : public NNPIOptions { enabledDeviceTracing, bool, "DeviceTracing", "Enabled device tracing (host2device, device2host copy infer etc.).", "NNPI_DEVICE_TRACING", "0"); - /// Overied the max NNPI device memory. + /// Override the max NNPI device memory. 
DECLARE_NNPI_OPTION( deviceMemory, unsigned, "DeviceMemory", "Override the amount of DRAM to allocate per NNPI device, in kilobytes.", @@ -300,17 +327,32 @@ class NNPIDeviceOptions : public NNPIOptions { /// Dump IO to files. DECLARE_NNPI_OPTION(dumpIOtoFiles, bool, "DumpIOtoFiles", "Dump Inputs/Outputs to files.", "NNPI_DUMP_IO", "0"); + /// Force using a specific AVX type. + DECLARE_NNPI_OPTION(avxType, int, "avxType", + "Force using a specific AVX type." + "\n 0 = No AVX. " + "\n 1 = Use AVX512. ", + "NNPI_AVX_TYPE", "-1"); NNPIDeviceOptions(const llvm::StringMap ¶meters) { INIT_NNPI_OPTIONS(useIceT, parameters); INIT_NNPI_OPTIONS(inferOnDevice, parameters); INIT_NNPI_OPTIONS(showVars, parameters); - INIT_NNPI_OPTIONS(deviceID, parameters); + INIT_NNPI_OPTIONS(deviceId, parameters); INIT_NNPI_OPTIONS(numWorkers, parameters); INIT_NNPI_OPTIONS(enabledDeviceTracing, parameters); INIT_NNPI_OPTIONS(deviceMemory, parameters); INIT_NNPI_OPTIONS(enabledCommandLists, parameters); INIT_NNPI_OPTIONS(dumpIOtoFiles, parameters); + INIT_NNPI_OPTIONS(avxType, parameters); + + if (avxType == -1) { + if (isStringFoundInCpuInfo("avx512f")) { + avxType.setVal(NNPI_AVX_AVX512); + } else { + avxType.setVal(NNPI_AVX_NONE); + } + } } virtual llvm::StringRef getOptionsName() const override { return "Device Options"; diff --git a/lib/Backends/NNPI/NNPIResource.cpp b/lib/Backends/NNPI/NNPIResource.cpp index 0cc9be644e..26857ab930 100644 --- a/lib/Backends/NNPI/NNPIResource.cpp +++ b/lib/Backends/NNPI/NNPIResource.cpp @@ -14,41 +14,12 @@ */ #include "NNPIResource.h" +#include "NNPIUtils.h" #include "nnpi_inference.h" #include #include #include -#ifdef USE_AVX -#include -static inline void ConvertI64toI32(int64_t const *i64Data, int32_t *i32Data, - uint32_t elements) { - const __mmask8 masks[9] = { - 0b0, 0b1, 0b11, 0b111, 0b1111, 0b11111, 0b111111, 0b1111111, 0b11111111, - }; - constexpr uint32_t vecSize = (sizeof(__m512i) / sizeof(int64_t)); - const uint32_t fullIterations = (elements / 
vecSize); - const uint32_t tailElements = (elements % vecSize); - - for (uint32_t i = 0; i < fullIterations; i++) { - __m512i i64vec = _mm512_maskz_loadu_epi64(masks[vecSize], i64Data); - _mm512_mask_cvtepi64_storeu_epi32(i32Data, masks[vecSize], i64vec); - i64Data += vecSize; - i32Data += vecSize; - } - if (tailElements > 0) { - __m512i i64vec = _mm512_maskz_loadu_epi64(masks[tailElements], i64Data); - _mm512_mask_cvtepi64_storeu_epi32(i32Data, masks[tailElements], i64vec); - } -} -#else // USE_AVX -static inline void ConvertI64toI32(int64_t const *i64Data, int32_t *i32Data, - uint32_t elements) { - for (size_t i = 0; i < elements; i++) { - i32Data[i] = static_cast(i64Data[i]); - } -} -#endif // USE_AVX static size_t CalcDescSize(const NNPIResourceDesc *desc) { if (desc->numDims == 0) { @@ -147,8 +118,8 @@ NNPIResource::~NNPIResource() { // them but only keeps reference for it's usage. } -bool NNPIResource::Init(const NNPIObjectName name, - const NNPIDeviceOptions *deviceOptions, +bool NNPIResource::init(const NNPIObjectName name, + std::shared_ptr deviceOptions, NNPIAdapter adapter, NNPIDeviceContext device, const NNPIResourceDesc *desc, NNPIResource::ResourceUsage usage) { @@ -243,7 +214,7 @@ NNPIInferenceErrorCode NNPIResource::PreInference(Tensor *t, } // Update the host resource from the tensor content. - UpdateHostResourceFromTensor(t, partialTensor); + updateHostResourceFromTensor(t, partialTensor); if (deviceOptions_->dumpIOtoFiles) { size_t unpaddedSize = t->getUnpaddedSizeInBytes(); @@ -360,94 +331,36 @@ void NNPIResource::UpdateDeviceResourceFromTensor( LOG_AND_FAIL_CALLBACK_IF_NOT( t != nullptr, "Invalid tensor used to update static input", resultCB); - if (deviceOptions_->inferOnDevice) { - // Create host resource (and update hostPtr_). - LOG_AND_CALLBACK_NNPI_INF_IF_ERROR( - nnpiHostResourceCreate(adapter_, &desc_, &hostResource_), - "Failed to create NNPI host resource", resultCB); - - // Lock/Unlock host resource and keep host address. 
- LOG_AND_CALLBACK_NNPI_INF_IF_ERROR( - nnpiHostResourceLock(hostResource_, NNPI_LOCK_FOR_WRITE, UINT32_MAX, - &hostPtr_), - "Failed to lock host resource", resultCB); - LOG_AND_CALLBACK_NNPI_INF_IF_ERROR(nnpiHostResourceUnlock(hostResource_), - "Failed to unlock host resource", - resultCB); - - // Create copy command. - LOG_AND_CALLBACK_NNPI_INF_IF_ERROR( - nnpiCopyCommandCreateHostToDevice(device_, deviceResource_, - hostResource_, ©Command_), - "Failed to create NNPI copy command (input)", resultCB); - } else { - refStorage_.resize(t->getSizeInBytes()); - hostPtr_ = &(refStorage_.at(0)); - } + LOG_AND_FAIL_CALLBACK_IF_NOT(updateHostResourceFromTensor(t, false), + "Invalid Static placeholder", resultCB); + + LOG_NNPI_INF_IF_ERROR(nnpiDeviceResourceSubLoad(deviceResource_, 0, + t->getUnsafePtr(), + t->getSizeInBytes()), + "Failed to execute device resource sub load"); - LOG_AND_FAIL_CALLBACK_IF_NOT( - t->getSizeInBytes() == t->getUnpaddedSizeInBytes(), - "Static partial tensors are not supported", resultCB); - // Copy data from tensor to host resource (convert if needed). - UpdateHostResourceFromTensor(t, false); - - if (deviceOptions_->inferOnDevice) { - // TODO: move to stream once exposed in the inference. 
- if (deviceOptions_->enabledCommandLists > 0) { - NNPICommandHandle ch; - ch.type = NNPI_COMMAND_TYPE_COPY; - ch.copyCommand = copyCommand_; - NNPICommandList cl; - LOG_AND_CALLBACK_NNPI_INF_IF_ERROR( - nnpiCommandListCreate(&ch, 1, nullptr, 0, &cl), - "Failed to create NNPI command list", resultCB); - LOG_AND_CALLBACK_NNPI_INF_IF_ERROR(nnpiCommandListQueue(cl, nullptr, 0), - "Failed to queue command list", - resultCB); - uint32_t numErrors = 0; - LOG_AND_CALLBACK_NNPI_INF_IF_ERROR( - nnpiCommandListWait(cl, UINT32_MAX, nullptr, 0, &numErrors), - "Failed to wait on command list completion", resultCB); - LOG_AND_FAIL_CALLBACK_IF_NOT(numErrors == 0, - "Command list returned errors", resultCB); - LOG_AND_CALLBACK_NNPI_INF_IF_ERROR(nnpiCommandListDestroy(cl), - "Failed to destroy command list", - resultCB); - // TODO: dump errors generated in this command list - } else { - // No command lists. - LOG_AND_CALLBACK_NNPI_INF_IF_ERROR( - nnpiCopyCommandQueue(copyCommand_, nullptr), - "Failed to queue input copy command.", resultCB); - // Lock to make sure copy has ended - LOG_AND_CALLBACK_NNPI_INF_IF_ERROR( - nnpiHostResourceLock(hostResource_, NNPI_LOCK_FOR_WRITE, UINT32_MAX, - &hostPtr_), - "Failed to lock host resource", resultCB); - LOG_AND_CALLBACK_NNPI_INF_IF_ERROR(nnpiHostResourceUnlock(hostResource_), - "Failed to unlock host resource", - resultCB); - } - // Destroy host resource, copy command, command list. 
- LOG_AND_CALLBACK_NNPI_INF_IF_ERROR(nnpiCopyCommandDestroy(copyCommand_), - "Failed to destroy NNPI copy command", - resultCB); - copyCommand_ = NNPI_INVALID_NNPIHANDLE; - LOG_AND_CALLBACK_NNPI_INF_IF_ERROR(nnpiHostResourceDestroy(hostResource_), - "Failed to destroy NNPI host resource", - resultCB); - hostResource_ = NNPI_INVALID_NNPIHANDLE; - hostPtr_ = nullptr; - } resultCB(Error::success()); } -void NNPIResource::UpdateHostResourceFromTensor(Tensor *t, bool partialTensor) { +bool NNPIResource::updateHostResourceFromTensor(Tensor *t, bool partialTensor) { // Prepare data on the host resource (for ice-ref use int32sTorage). char *tensorData = t->getUnsafePtr(); const bool downcastInt64 = t->getElementType() == glow::ElemKind::Int64ITy; size_t paddedSize = t->getSizeInBytes(); size_t unpaddedSize = t->getUnpaddedSizeInBytes(); + const bool partialData = (unpaddedSize != paddedSize); + + if (usage_ == ResourceUsage::StaticInputResource) { + LOG_AND_RETURN_IF(ERROR, downcastInt64, + "Static placeholder not allowed to be of type Int64", + false); + LOG_AND_RETURN_IF(ERROR, partialData, + "Static placeholders are not allowed to do partial copy", + false); + + // nothing else to do for static placeholders. + return true; + } if (downcastInt64) { paddedSize /= 2; @@ -456,15 +369,25 @@ void NNPIResource::UpdateHostResourceFromTensor(Tensor *t, bool partialTensor) { // Copy or convert. 
if (downcastInt64) { // Convert - ConvertI64toI32(reinterpret_cast(tensorData), - reinterpret_cast(hostPtr_), - unpaddedSize / sizeof(int32_t)); + switch (deviceOptions_->avxType) { + case NNPI_AVX_NONE: + convertI64toI32(reinterpret_cast(tensorData), + reinterpret_cast(hostPtr_), + unpaddedSize / sizeof(int32_t)); + break; + case NNPI_AVX_AVX512: + convertI64toI32_AVX512(reinterpret_cast(tensorData), + reinterpret_cast(hostPtr_), + unpaddedSize / sizeof(int32_t)); + break; + default: + LOG(ERROR) << "Invalid avxType=" << deviceOptions_->avxType; + } } else { // Copy memcpy(hostPtr_, tensorData, unpaddedSize); } // Pad with zeros if needed. - const bool partialData = (unpaddedSize != paddedSize); if (partialData && !partialTensor) { memset(reinterpret_cast(hostPtr_) + unpaddedSize, 0, paddedSize - unpaddedSize); @@ -472,6 +395,8 @@ void NNPIResource::UpdateHostResourceFromTensor(Tensor *t, bool partialTensor) { // Update partial size. partialSize_ = (partialData && partialTensor) ? unpaddedSize : 0; + + return true; } std::string NNPIResource::Dump() const { diff --git a/lib/Backends/NNPI/NNPIResource.h b/lib/Backends/NNPI/NNPIResource.h index 6218a38fc3..905e1279de 100644 --- a/lib/Backends/NNPI/NNPIResource.h +++ b/lib/Backends/NNPI/NNPIResource.h @@ -50,7 +50,8 @@ class NNPIResource { std::function resultCB); /// Initialize a resource. - bool Init(const NNPIObjectName name, const NNPIDeviceOptions *deviceOptions, + bool init(const NNPIObjectName name, + std::shared_ptr deviceOptions, NNPIAdapter adapter, NNPIDeviceContext device, const NNPIResourceDesc *desc, ResourceUsage usage); @@ -92,12 +93,13 @@ class NNPIResource { NNPICopyCommand copyCommand_; uint64_t partialSize_; ResourceUsage usage_; - const NNPIDeviceOptions *deviceOptions_; + std::shared_ptr deviceOptions_; std::vector refStorage_; uint32_t cmdListIdx_; /// Update the owned host resource with data taken from the given tensor. 
- void UpdateHostResourceFromTensor(Tensor *t, bool partialTensor); + // return true when successful, false otherwise. + bool updateHostResourceFromTensor(Tensor *t, bool partialTensor); }; } // namespace runtime diff --git a/lib/Backends/NNPI/NNPITracing.cpp b/lib/Backends/NNPI/NNPITracing.cpp old mode 100644 new mode 100755 index fb2422af43..40281fd2f0 --- a/lib/Backends/NNPI/NNPITracing.cpp +++ b/lib/Backends/NNPI/NNPITracing.cpp @@ -21,31 +21,35 @@ using namespace glow; -NNPIDeviceTracing::NNPIDeviceTracing(uint32_t deviceID) : deviceID_(deviceID) { - traceCtx_ = glow::make_unique(0); +NNPIDeviceTracing::NNPIDeviceTracing(unsigned deviceID) { + traceCtx_ = glow::make_unique(deviceID); + deviceInfo_ = + std::string("[Device #") + std::to_string(deviceID) + std::string("] "); } -void NNPIDeviceTracing::start(TraceContext *traceContext, - runtime::RunIdentifierTy runId) { +bool NNPIDeviceTracing::start(TraceContext *traceContext, + NNPIDeviceContext deviceContext) { if (!traceContext || !traceContext->shouldLog(TraceEvent::TraceLevel::OPERATOR)) { - return; + return false; } if (started_.test_and_set()) { ASSERT_WITH_MSG(glowTraceCtx_ != traceContext, "Trying to start tracing for an already started context."); // Trace already started. 
- return; + return false; } glowTraceCtx_ = traceContext; - runId_ = runId; - if (!traceCtx_->startCapture()) { + if (!traceCtx_->startCapture(deviceContext)) { LOG(WARNING) << "Failed to start trace capture"; + return false; } + return true; } std::string NNPIDeviceTracing::getEntryName(NNPITraceEntry &entry) { std::stringstream name; + name << deviceInfo_; switch (entry.traceType) { case NNPI_TRACE_UNKNOWN: name << "UnknownTrace"; @@ -65,6 +69,30 @@ std::string NNPIDeviceTracing::getEntryName(NNPITraceEntry &entry) { case NNPI_TRACE_CLOCK_SYNC: name << "ClockSync"; break; + case NNPI_TRACE_CMDLIST: + name << "CommandList"; + break; + case NNPI_TRACE_NETEXEC: + name << "NetExecute"; + break; + case NNPI_TRACE_SUBGRAPH: + name << "SubGraph"; + break; + case NNPI_TRACE_RUNTIME_INFER: + name << "RunTimeInf"; + break; + case NNPI_TRACE_ICED_SCHED_JOB: + name << "DSchedJob"; + break; + case NNPI_TARCE_ICED_CREAT_NET: + name << "DCreateNet"; + break; + case NNPI_TARCE_ICED_NET_RES: + name << "DNetRes"; + break; + case NNPI_TARCE_ICED_NET_GEN: + name << "DNetGen"; + break; default: name << "Othertrace"; } @@ -76,14 +104,35 @@ std::string NNPIDeviceTracing::getEntryName(NNPITraceEntry &entry) { } } auto params = entry.params; + if (entry.params.count("iceId") > 0) { + name << "-ICE_" << entry.params["iceId"]; + } + if (entry.params.count("netID") > 0) { + name << "-NET_" << entry.params["netID"]; + } + if (entry.params.count("reqID") > 0) { + name << "REQ_" << entry.params["reqID"]; + } if (entry.params.count("ctxID") > 0) { - name << "-" << entry.params["ctxID"]; + name << "-CTX_" << entry.params["ctxID"]; + } + if (entry.params.count("subNetId") > 0) { + name << "-SUBNET_" << entry.params["subNetId"]; + } + if (entry.params.count("inferID") > 0) { + name << "-INFR_" << entry.params["inferID"]; + } + if (entry.params.count("subGraphID") > 0) { + name << "-SUBGRAPH_" << entry.params["subGraphID"]; + } + if (entry.params.count("agent") > 0) { + name << "-AGENT_" << 
entry.params["agent"]; } if (entry.params.count("copyID") > 0) { - name << "-" << entry.params["copyID"]; + name << "-CPID_" << entry.params["copyID"]; } if (entry.params.count("size") > 0) { - name << "-" << entry.params["size"]; + name << "-SIZE_" << entry.params["size"]; } return name.str(); } @@ -91,19 +140,26 @@ std::string NNPIDeviceTracing::getEntryName(NNPITraceEntry &entry) { bool NNPIDeviceTracing::addTrace(NNPITraceEntry &entry) { // Filter traces. switch (entry.traceType) { - case NNPI_TRACE_UNKNOWN: - return false; - case NNPI_TRACE_DMA: - return false; case NNPI_TRACE_INFER: - break; case NNPI_TRACE_COPY: + case NNPI_TRACE_CMDLIST: + case NNPI_TRACE_NETEXEC: + case NNPI_TRACE_SUBGRAPH: + case NNPI_TRACE_RUNTIME_INFER: + case NNPI_TRACE_ICED_SCHED_JOB: + case NNPI_TARCE_ICED_CREAT_NET: + case NNPI_TARCE_ICED_NET_RES: + case NNPI_TARCE_ICED_NET_GEN: break; + case NNPI_TRACE_UNKNOWN: + case NNPI_TRACE_DMA: case NNPI_TRACE_MARK: - return false; case NNPI_TRACE_CLOCK_SYNC: + case NNPI_TARCE_TIME_SYNC: + case NNPI_TARCE_USER_DATA: return false; default: + LOG(WARNING) << "Trying to add unsupported trace type:" << entry.traceType; return false; } @@ -117,56 +173,53 @@ bool NNPIDeviceTracing::addTrace(NNPITraceEntry &entry) { if (state == "q" || state == "queued") { name += "-Queue"; glowTraceCtx_->logTraceEvent(name, TraceLevel::OPERATOR, - TraceEvent::InstantType, entry.hostTime, - entry.params); + TraceEvent::InstantType, entry.hostTime, {}); } else if (state == "s" || state == "cbs" || state == "executed") { glowTraceCtx_->logTraceEvent(name, TraceLevel::OPERATOR, - TraceEvent::BeginType, entry.hostTime, - entry.params); + TraceEvent::BeginType, entry.hostTime, {}); } else if (state == "c" || state == "cbc" || state == "completed") { glowTraceCtx_->logTraceEvent(name, TraceLevel::OPERATOR, - TraceEvent::EndType, entry.hostTime, - entry.params); + TraceEvent::EndType, entry.hostTime, {}); } else if (state == "cbs") { glowTraceCtx_->logTraceEvent(name, 
TraceLevel::OPERATOR, - TraceEvent::BeginType, entry.hostTime, - entry.params); + TraceEvent::BeginType, entry.hostTime, {}); } else if (state == "cbc") { glowTraceCtx_->logTraceEvent(name, TraceLevel::OPERATOR, - TraceEvent::EndType, entry.hostTime, - entry.params); + TraceEvent::EndType, entry.hostTime, {}); } else if (state == "cbnwc") { glowTraceCtx_->logTraceEvent(name, TraceLevel::OPERATOR, - TraceEvent::InstantType, entry.hostTime, - entry.params); + TraceEvent::InstantType, entry.hostTime, {}); + } else if (state == "req") { + name += "-Req"; + glowTraceCtx_->logTraceEvent(name, TraceLevel::OPERATOR, + TraceEvent::InstantType, entry.hostTime, {}); } + return true; } -void NNPIDeviceTracing::stopAndUpdate(TraceContext *traceContext, - runtime::RunIdentifierTy runId) { - if (glowTraceCtx_ != traceContext || runId_ != runId) { +bool NNPIDeviceTracing::stopAndUpdate(TraceContext *traceContext, + NNPIDeviceContext deviceContext) { + if (glowTraceCtx_ != + nullptr && // For null glowTraceCtx assume global context (per device) + (glowTraceCtx_ != traceContext)) { // Ignore stop from other contexts. 
- return; + return false; } - if (!traceCtx_->stopCapture()) { + if (!traceCtx_->stopCapture(deviceContext)) { LOG(WARNING) << "Failed to stop trace capture"; - return; + return false; } if (!traceCtx_->load()) { LOG(WARNING) << "Failed to stop trace capture"; - return; + return false; } + traceContext->setThreadName("NNPI_Trace"); for (auto entry : traceCtx_->getEntries()) { std::map params = entry.params; addTrace(entry); } started_.clear(); -} - -void NNPIDeviceTracing::startCopyTime() { - if (traceCtx_) { - traceCtx_->markInputCopyStart(TraceEvent::now()); - } + return true; } diff --git a/lib/Backends/NNPI/NNPITracing.h b/lib/Backends/NNPI/NNPITracing.h old mode 100644 new mode 100755 index e905900331..7a1872b05a --- a/lib/Backends/NNPI/NNPITracing.h +++ b/lib/Backends/NNPI/NNPITracing.h @@ -30,30 +30,28 @@ namespace glow { class NNPIDeviceTracing { public: - static std::shared_ptr getForDevice(uint32_t deviceID) { + static std::shared_ptr getForDevice(unsigned deviceId) { static std::unordered_map> map; static std::mutex mapSyncMutex; std::lock_guard lk(mapSyncMutex); - if (map.count(deviceID) <= 0) { + if (map.count(deviceId) <= 0) { // Stub to allow make_shared access to private constructor. struct EnabledShare : public NNPIDeviceTracing { - EnabledShare(uint32_t deviceID) : NNPIDeviceTracing(deviceID) {} + EnabledShare(uint32_t deviceId) : NNPIDeviceTracing(deviceId) {} }; - map[deviceID] = std::make_shared(deviceID); + map[deviceId] = std::make_shared(deviceId); } - return map[deviceID]; + return map[deviceId]; } /// Dispose of tracing context. virtual ~NNPIDeviceTracing(){}; /// Start recording events. - void start(TraceContext *traceContext, runtime::RunIdentifierTy runId); + bool start(TraceContext *traceContext, NNPIDeviceContext deviceContext); /// Stop recording, read and update trace context. - void stopAndUpdate(TraceContext *traceContext, - runtime::RunIdentifierTy runId); - /// Start copy. 
- void startCopyTime(); + bool stopAndUpdate(TraceContext *traceContext, + NNPIDeviceContext deviceContext); protected: std::string getEntryName(NNPITraceEntry &entry); @@ -61,18 +59,17 @@ class NNPIDeviceTracing { private: /// Per device tracing control. - explicit NNPIDeviceTracing(uint32_t deviceID); + explicit NNPIDeviceTracing(unsigned deviceId); /// Glow trace context. Used to identify start/stop and log traces (with /// runId_). TraceContext *glowTraceCtx_{nullptr}; - /// Run identifier. Used to identify start/stop and log traces (with - /// glowTraceCtx_). - runtime::RunIdentifierTy runId_{0}; std::atomic_flag started_{false}; /// NNPI Trace context. std::unique_ptr traceCtx_; /// Device id. - uint32_t deviceID_{0}; + unsigned deviceId_{0}; + /// Device id string prefix for event names. + std::string deviceInfo_; }; } // namespace glow diff --git a/lib/Backends/NNPI/NNPIUtils.h b/lib/Backends/NNPI/NNPIUtils.h new file mode 100644 index 0000000000..afd1dd9950 --- /dev/null +++ b/lib/Backends/NNPI/NNPIUtils.h @@ -0,0 +1,34 @@ +/* + * Copyright (c) Glow Contributors. See CONTRIBUTORS file. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef GLOW_BACKENDS_NNPI_NNPIUTILS_H +#define GLOW_BACKENDS_NNPI_NNPIUTILS_H + +#include + +using namespace std; + +enum NNPIAVXType { NNPI_AVX_NONE = 0, NNPI_AVX_AVX512 }; + +inline void convertI64toI32(int64_t const *i64Data, int32_t *i32Data, + uint32_t elements) { + for (size_t i = 0; i < elements; i++) { + i32Data[i] = static_cast(i64Data[i]); + } +} +void convertI64toI32_AVX512(int64_t const *i64Data, int32_t *i32Data, + uint32_t elements); + +#endif // GLOW_BACKENDS_NNPI_NNPIUTILS_H diff --git a/lib/Backends/NNPI/NNPIUtils_AVX512.cpp b/lib/Backends/NNPI/NNPIUtils_AVX512.cpp new file mode 100644 index 0000000000..a37bcd32d1 --- /dev/null +++ b/lib/Backends/NNPI/NNPIUtils_AVX512.cpp @@ -0,0 +1,37 @@ +/* + * Copyright (c) Glow Contributors. See CONTRIBUTORS file. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include "NNPIUtils.h" +#include + +void convertI64toI32_AVX512(int64_t const *i64Data, int32_t *i32Data, + uint32_t elements) { + const __mmask8 masks[9] = { + 0b0, 0b1, 0b11, 0b111, 0b1111, 0b11111, 0b111111, 0b1111111, 0b11111111, + }; + constexpr uint32_t vecSize = (sizeof(__m512i) / sizeof(int64_t)); + const uint32_t fullIterations = (elements / vecSize); + const uint32_t tailElements = (elements % vecSize); + + for (uint32_t i = 0; i < fullIterations; i++) { + __m512i i64vec = _mm512_maskz_loadu_epi64(masks[vecSize], i64Data); + _mm512_mask_cvtepi64_storeu_epi32(i32Data, masks[vecSize], i64vec); + i64Data += vecSize; + i32Data += vecSize; + } + if (tailElements > 0) { + __m512i i64vec = _mm512_maskz_loadu_epi64(masks[tailElements], i64Data); + _mm512_mask_cvtepi64_storeu_epi32(i32Data, masks[tailElements], i64vec); + } +} \ No newline at end of file diff --git a/lib/Backends/NNPI/tests/NNPIDeviceManagerTest.cpp b/lib/Backends/NNPI/tests/NNPIDeviceManagerTest.cpp index 15e789a8d7..df8d839dac 100644 --- a/lib/Backends/NNPI/tests/NNPIDeviceManagerTest.cpp +++ b/lib/Backends/NNPI/tests/NNPIDeviceManagerTest.cpp @@ -24,7 +24,6 @@ struct BlacklistInitializer { const std::vector> testBlacklistedSetups = { {"MultiFunction/0", TestBlacklist::AnyDeviceAnyEngine}, - {"MultiRun/0", TestBlacklist::AnyDeviceAnyEngine}, {"DeviceResidentTensors/0", TestBlacklist::AnyDeviceAnyEngine}, {"AvailableMemory/0", TestBlacklist::AnyDeviceAnyEngine}, {"TransferStaticPlaceholderTest/0", diff --git a/lib/Backends/NNPI/tests/NNPIOperatorTest.cpp b/lib/Backends/NNPI/tests/NNPIOperatorTest.cpp index def8ca424b..0642d5378c 100644 --- a/lib/Backends/NNPI/tests/NNPIOperatorTest.cpp +++ b/lib/Backends/NNPI/tests/NNPIOperatorTest.cpp @@ -23,8 +23,6 @@ struct BlacklistInitializer { BlacklistInitializer() { const std::vector> testBlacklistedSetups = { - {"AdaptiveAvgPool/0", TestBlacklist::AnyDeviceAnyEngine}, - {"AdaptiveAvgPoolNonSquare/0", TestBlacklist::AnyDeviceAnyEngine}, 
{"add_int32/0", TestBlacklist::AnyDeviceHWEngine}, {"add_int64/0", TestBlacklist::AnyDeviceHWEngine}, {"batchedPairwiseDotProduct/0", TestBlacklist::AnyDeviceAnyEngine}, @@ -68,7 +66,6 @@ struct BlacklistInitializer { {"FloatMaxPoolWithArgmax/0", TestBlacklist::AnyDeviceAnyEngine}, {"FloatMaxPoolWithArgmaxTransposed/0", TestBlacklist::AnyDeviceAnyEngine}, - {"FP16AdaptiveAvgPool/0", TestBlacklist::AnyDeviceAnyEngine}, {"FullyConnected_Int16_BiasInt16/0", TestBlacklist::AnyDeviceAnyEngine}, {"FullyConnected_Int16_BiasInt32/0", @@ -87,7 +84,6 @@ struct BlacklistInitializer { TestBlacklist::AnyDeviceAnyEngine}, {"Int16ConvolutionDepth10/0", TestBlacklist::AnyDeviceAnyEngine}, {"Int16ConvolutionDepth8/0", TestBlacklist::AnyDeviceAnyEngine}, - {"Int8AdaptiveAvgPool/0", TestBlacklist::AnyDeviceAnyEngine}, {"IntLookupTable/0", TestBlacklist::AnyDeviceAnyEngine}, {"CumSum_Float/0", TestBlacklist::AnyDeviceAnyEngine}, {"CumSum_Float16/0", TestBlacklist::AnyDeviceAnyEngine}, @@ -151,20 +147,6 @@ struct BlacklistInitializer { TestBlacklist::AnyDeviceAnyEngine}, {"EmbeddingBagByteRowwiseOffsets_Float16_AccumFloat16/0", TestBlacklist::AnyDeviceAnyEngine}, - {"EmbeddingBag_1D_Float_End_Offset/0", - TestBlacklist::AnyDeviceAnyEngine}, - {"EmbeddingBag_1D_Float16_End_Offset/0", - TestBlacklist::AnyDeviceAnyEngine}, - {"EmbeddingBag_2D_Float_End_Offset/0", - TestBlacklist::AnyDeviceAnyEngine}, - {"EmbeddingBag_2D_Float16_End_Offset/0", - TestBlacklist::AnyDeviceAnyEngine}, - {"EmbeddingBagByteRowwiseOffsets_Float_End_Offset/0", - TestBlacklist::AnyDeviceAnyEngine}, - {"EmbeddingBagByteRowwiseOffsets_Float16_AccumFloat_End_Offset/0", - TestBlacklist::AnyDeviceAnyEngine}, - {"EmbeddingBagByteRowwiseOffsets_Float16_AccumFloat16_End_Offset/0", - TestBlacklist::AnyDeviceAnyEngine}, {"SparseToDense/0", TestBlacklist::AnyDeviceAnyEngine}, {"SparseToDenseMask1/0", TestBlacklist::AnyDeviceAnyEngine}, {"SparseToDenseMask2/0", TestBlacklist::AnyDeviceAnyEngine},