From 4649677cd08d3de88fb244a01ff66ecc71fe519b Mon Sep 17 00:00:00 2001
From: Shiyan Deng
Date: Fri, 14 May 2021 09:59:30 -0700
Subject: [PATCH] Add Support for importing quantized linear in FXIRImporter
 (#57483)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/57483

Pull Request resolved: https://github.com/pytorch/glow/pull/5622

Quantized linear has packed parameters. We want to unpack them so that it is
easier for graph optimizations and the importer to deal with the weight and
bias. A customized remapping function is used to unpack quantized linear and
map it to acc_ops.linear.

Reviewed By: gcatron, jfix71, khabinov

Differential Revision: D27451237

fbshipit-source-id: d8fd32321f6f4450731e32e1f56a91228484a9a4
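The importer-side core of the change is how an NNPI quantization type is
chosen from a node's quantization parameters. The decision the patch
implements can be summarized as follows (a condensed sketch; the `QuantType`
enum is a hypothetical stand-in for the real `NNPI_QUANTIZATION_*` constants
written into `NNPITensorDesc`):

```cpp
#include <string>

// Hypothetical stand-in for NNPI_QUANTIZATION_{GEMMLOWP,SYMLOWP,
// GEMMLOWP_PCQ,SYMLOWP_PCQ}; illustration only.
enum class QuantType { Gemlowp, Symlowp, GemlowpPCQ, SymlowpPCQ };

// Mirrors the QINT8 branch added to updateDescQuantFromFX below: a scales
// tensor selects the per-channel (PCQ) variants, and the symmetric
// (symlowp) variants additionally require every zero point to be zero.
QuantType selectQuantType(const std::string &scaleTensor,
                          const std::string &offsetTensor, bool forceSymlowp,
                          bool zeroOffset) {
  if (!scaleTensor.empty()) { // Per-channel quantization.
    return (offsetTensor.empty() || (forceSymlowp && zeroOffset))
               ? QuantType::SymlowpPCQ
               : QuantType::GemlowpPCQ;
  }
  // Per-tensor quantization.
  return (forceSymlowp && zeroOffset) ? QuantType::Symlowp
                                      : QuantType::Gemlowp;
}
```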
---
 lib/Backends/NNPI/FXIRImporter.cpp | 163 ++++++++++++++++++-----------
 lib/Backends/NNPI/FXIRImporter.h   |  23 ++--
 2 files changed, 116 insertions(+), 70 deletions(-)

diff --git a/lib/Backends/NNPI/FXIRImporter.cpp b/lib/Backends/NNPI/FXIRImporter.cpp
index ab6894a3ca..9bfe40744e 100644
--- a/lib/Backends/NNPI/FXIRImporter.cpp
+++ b/lib/Backends/NNPI/FXIRImporter.cpp
@@ -412,6 +412,7 @@ static std::unordered_map<
       std::make_unique>()},
      {"acc_ops.reshape", std::make_unique()},
      {"acc_ops.linear", std::make_unique()},
+     {"acc_ops.quantized_linear", std::make_unique()},
      {"acc_ops.conv2d", std::make_unique>()},
      {"acc_ops.batch_norm", std::make_unique()},
      {"acc_ops.relu", std::make_unique()},
@@ -505,14 +506,14 @@ const std::string &FXNNPIImporter::getInputNodeName(const folly::dynamic &node,
 void FXNNPIImporter::updateDescQuantFromFX(
     const DTYPE &dtype, NNPITensorDesc &desc, const float &scale,
     const int32_t &offset, const std::string &scaleTensor,
-    const std::string &offsetTensor, bool forceSymlowp) {
+    const std::string &offsetTensor, bool forceSymlowp, bool zeroOffset) {
   desc.quantParams.params.gemlowp.scale = scale;
   desc.quantParams.params.gemlowp.offset = offset;
 
   switch (dtype) {
   case DTYPE::FLOAT32:
     LOG_ERROR_IF_NOT((scaleTensor.empty() && offsetTensor.empty()))
-        << "Scales and offsets provided for Float";
+        << "Scales and offsets provided for Float32";
     desc.quantParams.precision = NNPI_PRECISION_FLOAT32;
     desc.quantParams.type = NNPI_QUANTIZATION_NONE;
     break;
@@ -522,38 +523,63 @@ void FXNNPIImporter::updateDescQuantFromFX(
     desc.quantParams.precision = NNPI_PRECISION_FLOAT16;
     desc.quantParams.type = NNPI_QUANTIZATION_NONE;
     break;
+  case DTYPE::INT32:
   case DTYPE::INT64:
     LOG_ERROR_IF_NOT((scaleTensor.empty() && offsetTensor.empty()))
-        << "Scales and offsets provided for Int64";
+        << "Scales and offsets provided for Int64 or Int32";
     desc.quantParams.precision = NNPI_PRECISION_INT32;
     desc.quantParams.type = NNPI_QUANTIZATION_NONE;
     break;
   case DTYPE::QINT8:
-    LOG_ERROR_IF_NOT((scaleTensor.empty() && offsetTensor.empty()))
-        << "Don't support PCQ yet";
     desc.quantParams.precision = NNPI_PRECISION_INT8;
-    desc.quantParams.type = NNPI_QUANTIZATION_GEMMLOWP;
-    if (forceSymlowp) {
-      LOG_ERROR_IF_NOT(offset == 0) << "Offset is not 0 when forcing symlowp";
-      desc.quantParams.type = NNPI_QUANTIZATION_SYMLOWP;
-      desc.quantParams.params.symlowp.scale = scale;
+
+    // If we have a scales tensor, this is the PCQ case.
+    if (!scaleTensor.empty()) {
+      LOG_ERROR_IF_NOT(!forceSymlowp || zeroOffset)
+          << "Offset is not 0 when forcing symlowp";
+      // If there are no offsets, or the symlowp workaround is used and all
+      // offsets are zero, the quantization type is SYMLOWP_PCQ.
+      if (offsetTensor.empty() || (forceSymlowp && zeroOffset)) {
+        desc.quantParams.type = NNPI_QUANTIZATION_SYMLOWP_PCQ;
+        std::strncpy(desc.quantParams.params.symlowpPCQ.scalesTensor,
+                     scaleTensor.c_str(), NNPI_MAX_STRING_LEN - 1);
+      } else { // Both scales and offsets are present.
+        desc.quantParams.type = NNPI_QUANTIZATION_GEMMLOWP_PCQ;
+        std::strncpy(desc.quantParams.params.gemmlowpPCQ.scalesTensor,
+                     scaleTensor.c_str(), NNPI_MAX_STRING_LEN - 1);
+        std::strncpy(desc.quantParams.params.gemmlowpPCQ.offsetsTensor,
+                     offsetTensor.c_str(), NNPI_MAX_STRING_LEN - 1);
+      }
+    } else {
+      desc.quantParams.type = NNPI_QUANTIZATION_GEMMLOWP;
+      if (forceSymlowp && zeroOffset) {
+        desc.quantParams.type = NNPI_QUANTIZATION_SYMLOWP;
+        desc.quantParams.params.symlowp.scale = scale;
+      }
     }
     break;
   case DTYPE::QUINT8:
-    LOG_ERROR_IF_NOT((scaleTensor.empty() && offsetTensor.empty()))
-        << "Don't support PCQ yet";
     desc.quantParams.precision = NNPI_PRECISION_UINT8;
-    desc.quantParams.type = NNPI_QUANTIZATION_GEMMLOWP;
-    desc.quantParams.params.gemlowp.scale = scale;
-    desc.quantParams.params.gemlowp.offset = offset;
+    if (!scaleTensor.empty()) {
+      desc.quantParams.type = NNPI_QUANTIZATION_GEMMLOWP_PCQ;
+      std::strncpy(
+          desc.quantParams.params.gemmlowpPCQ.scalesTensor, scaleTensor.c_str(),
+          sizeof(desc.quantParams.params.gemmlowpPCQ.scalesTensor) - 1);
+      std::strncpy(desc.quantParams.params.gemmlowpPCQ.offsetsTensor,
+                   offsetTensor.c_str(), NNPI_MAX_STRING_LEN - 1);
+    } else {
+      desc.quantParams.type = NNPI_QUANTIZATION_GEMMLOWP;
+      desc.quantParams.params.gemlowp.scale = scale;
+      desc.quantParams.params.gemlowp.offset = offset;
+    }
     break;
   default:
     LOG(FATAL) << "Unhandled tensor data type";
   }
 }
 
-void FXNNPIImporter::updateDescDimsFromFX(
-    const llvm::ArrayRef &dims, NNPITensorDesc &desc) {
+void FXNNPIImporter::updateDescDimsFromFX(llvm::ArrayRef dims,
+                                          NNPITensorDesc &desc) {
   desc.numDims = dims.size();
   for (size_t d = 0; d < desc.numDims; d++) {
     desc.dims[d] = dims[d];
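A note on the `std::strncpy` calls above: with a count of
`NNPI_MAX_STRING_LEN - 1`, `strncpy` does not itself guarantee NUL
termination; the code presumably relies on the last byte of the
zero-initialized descriptor buffers staying zero. A defensive variant would
terminate explicitly, e.g.:

```cpp
#include <cstddef>
#include <cstring>

// Copy src into a fixed-size char buffer of capacity cap, guaranteeing NUL
// termination regardless of the buffer's prior contents. Sketch only.
static void copyTensorName(char *dst, std::size_t cap, const char *src) {
  std::strncpy(dst, src, cap - 1);
  dst[cap - 1] = '\0';
}
```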
@@ -579,12 +605,11 @@
   }
 }
 
-NNPIErrorCode
-FXNNPIImporter::addTensor(const std::string &name, const string &dtypeStr,
-                          const llvm::ArrayRef dims, bool input,
-                          bool output, const float &scale,
-                          const int32_t &offset, const std::string &scaleTensor,
-                          const std::string &offsetTensor, bool forceSymlowp) {
+NNPIErrorCode FXNNPIImporter::addTensor(
+    const std::string &name, const string &dtypeStr,
+    llvm::ArrayRef dims, bool input, bool output,
+    const float &scale, const int32_t &offset, const std::string &scaleTensor,
+    const std::string &offsetTensor, bool forceSymlowp, bool zeroOffset) {
   const auto &dtypeElt = stringToDTYPE.find(dtypeStr);
   LOG_ERROR_IF_NOT(dtypeElt != stringToDTYPE.end())
       << dtypeStr << " is not supported!";
@@ -603,7 +628,7 @@ FXNNPIImporter::addTensor(const std::string &name, const string &dtypeStr,
   desc.attributes.input = input;
   desc.attributes.output = output;
   updateDescQuantFromFX(dtype, desc, scale, offset, scaleTensor, offsetTensor,
-                        forceSymlowp || compileOptions_.useSymlowp);
+                        forceSymlowp || compileOptions_.useSymlowp, zeroOffset);
   updateDescDimsFromFX(dims, desc);
 
   const void *pRawData = getConstant(name);
@@ -638,27 +663,56 @@ FXNNPIImporter::addTensor(const std::string &name, const string &dtypeStr,
   return nnpiNetworkAddTensor(network_, finalize(name), &desc, pRawData);
 }
 
-NNPIErrorCode FXNNPIImporter::addTensor(const folly::dynamic &node, bool input,
-                                        bool output, bool forceSymlowp) {
+bool FXNNPIImporter::isZeroes(const std::string &name, const DTYPE &dtype,
+                              const size_t &size) const {
+  const auto *t = getConstant(name);
+  CHECK(t) << "Can't find constant with name " << name;
+
+  switch (dtype) {
+  case DTYPE::INT32: {
+    const auto *pDataInt32 = static_cast<const int32_t *>(t);
+    return std::all_of(pDataInt32, pDataInt32 + size,
+                       [](int32_t x) { return x == 0; });
+  }
+  default:
+    return false;
+  }
+}
+
+NNPIErrorCode FXNNPIImporter::addTensor(const std::string &name,
+                                        const folly::dynamic &node, bool input,
+                                        bool output) {
+  const auto &dims = toIntegerArray(node["shape"].getString());
+  bool zeroOffset = false;
+  bool forceSymlowp = false;
   float scale = 1.0f;
   int32_t zero_point = 0;
+  std::string scaleTensor;
+  std::string offsetTensor;
 
   if (node["is_quantized"].getBool()) {
-    CHECK(node.count("q_scale")) << "Missing key q_scale for node " +
-                                        node["name"].getString() +
-                                        ", this probably due to node having "
-                                        "per channel quantized output.";
     forceSymlowp = node["dtype"].getString() == "torch.qint8";
-    scale = node["q_scale"].getDouble();
-    zero_point = node["q_zero_point"].getInt();
+    if (node["qscheme"].getString().find("per_tensor") != std::string::npos) {
+      scale = node["q_scale"].getDouble();
+      zero_point = node["q_zero_point"].getInt();
+      zeroOffset = zero_point == 0;
+    } else {
+      scaleTensor = node["q_per_channel_scales"].getString();
+      offsetTensor = node["q_per_channel_zero_points"].getString();
+      zeroOffset =
+          isZeroes(offsetTensor, /* dtype */ DTYPE::INT32,
+                   /* size */ dims[node["q_per_channel_axis"].getInt()]);
+    }
   }
 
-  return addTensor(node["name"].getString(), node["dtype"].getString(),
-                   toIntegerArray(node["shape"].getString()),
+  return addTensor(name, node["dtype"].getString(), /* dims */ dims,
                    /* input */ input, /* output */ output, /* scale */ scale,
-                   /* offset */ zero_point, /* scaleTensor */ {},
-                   /* offsetTensor */ {}, /* forceSymlowp */ forceSymlowp);
+                   /* offset */ zero_point, /* scaleTensor */ scaleTensor,
+                   /* offsetTensor */ offsetTensor,
+                   /* forceSymlowp */ forceSymlowp,
+                   /* zeroOffset */ zeroOffset);
 }
 
 void FXNNPIImporter::logUnsupportedNodes(const folly::dynamic &mod) {
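To make the new per-channel path concrete, a node taking the `else` branch
above would look roughly like this (field names as read by the importer; the
values and constant names are illustrative only):

```cpp
#include <folly/dynamic.h>

// A per-channel quantized weight node: no "per_tensor" in qscheme, so the
// importer reads the scales/zero-points constants by name and checks the
// zero points with isZeroes. Values here are made up for illustration.
folly::dynamic makeExampleNode() {
  return folly::dynamic::object
      ("dtype", "torch.qint8")
      ("shape", "[64, 128]")
      ("is_quantized", true)
      ("qscheme", "torch.per_channel_affine")
      ("q_per_channel_scales", "weight_scales")
      ("q_per_channel_zero_points", "weight_zero_points")
      ("q_per_channel_axis", 0);
}
```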
@@ -697,28 +751,11 @@ NNPINetwork FXNNPIImporter::importFunction(const folly::dynamic &FXIR,
   const auto &weights = mod["weights"];
   for (const auto &key : weights.keys()) {
     const auto &name = key.getString();
+    const auto &weight = weights[name];
     DBG("Importing Constant: " << name);
     CHECK(constants_.count(name)) << "Constant not found for weight " << name;
-
-    if (weights[name]["is_quantized"].getBool()) {
-      // TODO: Add support of PCQ.
-      CHECK(weights[name].count("q_scale"))
-          << "We only support PTQ now, weight " + name + " is PCQ.";
-      LOG_NNPI_IF_ERROR_RETURN_INVALID_HANDLE(
-          addTensor(
-              name, weights[name]["dtype"].getString(),
-              toIntegerArray(weights[name]["shape"].getString()),
-              /* input */ false, /* output */ false,
-              /* scale */ weights[name]["q_scale"].getDouble(),
-              /* offset */ weights[name]["q_zero_point"].getInt()),
-          "Failed to add intermediate");
-    } else {
-      LOG_NNPI_IF_ERROR_RETURN_INVALID_HANDLE(
-          addTensor(
-              name, weights[name]["dtype"].getString(),
-              toIntegerArray(weights[name]["shape"].getString())),
-          "Failed to add intermediate");
-    }
+    LOG_NNPI_IF_ERROR_RETURN_INVALID_HANDLE(addTensor(name, weight),
+                                            "Failed to add constant");
   }
 
   // Add ops node.
@@ -732,10 +769,6 @@ NNPINetwork FXNNPIImporter::importFunction(const folly::dynamic &FXIR,
     }
     DBG("Importing Node: " << nodeName);
 
-    // Add node outputs.
-    LOG_NNPI_IF_ERROR_RETURN_INVALID_HANDLE(addTensor(node),
-                                            "Failed to add intermediate");
-
     // Track what Constant each get_attr points to.
     if (opCode == "get_attr") {
       bool inserted =
@@ -744,6 +777,12 @@ NNPINetwork FXNNPIImporter::importFunction(const folly::dynamic &FXIR,
           << " to its underlying Constant";
       continue;
     }
+
+    // Add node outputs. We don't add get_attr node outputs because they have
+    // already been added when adding constants.
+    LOG_NNPI_IF_ERROR_RETURN_INVALID_HANDLE(addTensor(nodeName, node),
+                                            "Failed to add intermediate");
+
     const auto &targetName = node["target"].getString();
     const auto &functionName = opCode != "call_module"
                                    ? targetName
@@ -781,7 +820,7 @@ NNPINetwork FXNNPIImporter::importFunction(const folly::dynamic &FXIR,
       CHECK(!writeTensors_.count(name)) << "Placeholder can't be written";
 
       if (readTensors_.count(name)) {
-        LOG_NNPI_IF_ERROR_RETURN_INVALID_HANDLE(addTensor(node,
+        LOG_NNPI_IF_ERROR_RETURN_INVALID_HANDLE(addTensor(name, node,
                                                           /* input */ true,
                                                           /* output */ false),
                                                 "Failed to add placeholder");
@@ -798,7 +837,7 @@ NNPINetwork FXNNPIImporter::importFunction(const folly::dynamic &FXIR,
       CHECK(writeTensors_.count(outputName))
           << "output must be in writeTensors_";
 
-      LOG_NNPI_IF_ERROR_RETURN_INVALID_HANDLE(addTensor(arg,
+      LOG_NNPI_IF_ERROR_RETURN_INVALID_HANDLE(addTensor(outputName, arg,
                                                         /* input */ false,
                                                         /* output */ true),
                                               "Failed to add output");
diff --git a/lib/Backends/NNPI/FXIRImporter.h b/lib/Backends/NNPI/FXIRImporter.h
index 79b09e90f3..6c8f4d76c3 100644
--- a/lib/Backends/NNPI/FXIRImporter.h
+++ b/lib/Backends/NNPI/FXIRImporter.h
@@ -54,16 +54,16 @@ class FXNNPIImporter {
 
   /// Add Tensor to the network by parameters.
   NNPIErrorCode addTensor(const std::string &name, const string &dtypeStr,
-                          const llvm::ArrayRef dims,
-                          bool input = false, bool output = false,
-                          const float &scale = 1.f, const int32_t &offset = 0,
+                          llvm::ArrayRef dims, bool input = false,
+                          bool output = false, const float &scale = 1.f,
+                          const int32_t &offset = 0,
                           const std::string &scaleTensor = {},
                           const std::string &offsetTensor = {},
-                          bool forceSymlowp = false);
+                          bool forceSymlowp = false, bool zeroOffset = false);
 
   /// Add Tensor to the network by node.
-  NNPIErrorCode addTensor(const folly::dynamic &node, bool input = false,
-                          bool output = false, bool forceSymlowp = false);
+  NNPIErrorCode addTensor(const std::string &name, const folly::dynamic &node,
+                          bool input = false, bool output = false);
 
   /// Set given tensor names as inputs/outputs.
   void
@@ -80,7 +80,7 @@ class FXNNPIImporter {
   }
 
   /// Update the NNPITensorDesc \p desc by the dimensions array \p dims.
-  static void updateDescDimsFromFX(const llvm::ArrayRef &dims,
+  static void updateDescDimsFromFX(llvm::ArrayRef dims,
                                    NNPITensorDesc &desc);
 
   /// Update the NNPITensorDesc \p desc quantization params by \p dtype.
@@ -89,7 +89,8 @@ class FXNNPIImporter {
                                     const int32_t &offset = 0,
                                     const std::string &scaleTensor = {},
                                     const std::string &offsetTensor = {},
-                                    bool forceSymlowp = false);
+                                    bool forceSymlowp = false,
+                                    bool zeroOffset = false);
 
   /// \returns whether there is a Constant known by \p name. Does not look
   /// through getattr aliases.
@@ -116,6 +117,12 @@ class FXNNPIImporter {
   const std::string &getInputNodeName(const folly::dynamic &node,
                                       bool optional = false) const;
 
+  /// \returns whether the constant with the given \p name contains only
+  /// zeros. \p dtype is the type of this constant and \p size is the total
+  /// size of the constant.
+  bool isZeroes(const std::string &name, const utils::DTYPE &dtype,
+                const size_t &size) const;
+
 private:
   /// NNPI network handle.
   NNPINetwork network_;
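For context on why zero offsets matter throughout this patch: with affine
("gemlowp"-style) quantization, a real value r maps to a quantized value
q = round(r / scale) + offset and back as r = scale * (q - offset). The
symmetric (symlowp) types assume offset == 0, which is exactly what the new
zeroOffset plumbing and the isZeroes check establish. A minimal illustration,
without the saturation handling a real implementation needs:

```cpp
#include <cmath>
#include <cstdint>

// Affine quantization round trip; illustration only, no clamping.
int8_t quantize(float r, float scale, int32_t offset) {
  return static_cast<int8_t>(std::lround(r / scale) + offset);
}
float dequantize(int8_t q, float scale, int32_t offset) {
  return scale * (static_cast<int32_t>(q) - offset); // offset==0 => symmetric
}
```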