[ONNXModelLoader] Enabling operator instance based mixed precision support
Gopinath Rathinam authored and quic-grathina committed Jan 27, 2021
1 parent d015e12 commit b5a17e5
Showing 20 changed files with 1,105 additions and 15 deletions.
44 changes: 44 additions & 0 deletions docs/ModelLoaderPrecisionConfiguration.md
@@ -0,0 +1,44 @@
## GLOW Model loader precision configuration

This document describes the mixed precision feature, which enables a network
to run with a combination of operators in float, float16_t, and int8
precision.

### Overview

Glow has the following two options, along with quantization, to set the precision of operators:

`convert-to-fp16` - Allows running all floating point operations in fp16.
`keep-original-precision-for-nodes` - Keeps certain node kinds from being
quantized so that they run in their original precision.

Note that the above two options are node-kind based. To run specific instances
of operators in fp16 precision, the `-node-precision-info` option can be used to
indicate execution of specific nodes in fp16. The nodes to run in fp16 are
specified in a YAML file by the name of their first output.

The `-node-precision-info` option can be passed along with the `-load-profile`
option when building quantized models. In that case, operators not mentioned in
`-node-precision-info` will run in quantized precision (if supported by the
backend).

### Design details

#### `-node-precision-info` yaml schema

A precision profile can be created with a list of the output names of the nodes
required to run in fp16, as shown below.

```
FP16NodeInstanceNames: [109, 110, 111, 112, 237]
```

#### How to use mixed precision feature
Generate a quantization profile using the following command:
```
./bin/image-classifier tests/images/imagenet/*.png -image-mode=0to1 -m=resnet50 -model-input-name=gpu_0/data -dump-profile="profile.yaml" -node-precision-info="precision_profile.yaml"
```

Use the quantization profile generated by the above command along with `-node-precision-info` to run the network in mixed precision:
```
./bin/image-classifier tests/images/imagenet/*.png -image-mode=0to1 -m=resnet50 -model-input-name=gpu_0/data -load-profile="profile.yaml" -node-precision-info="precision_profile.yaml"
```
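
The precision file can also be set programmatically when embedding the model
loader, using the helper added in `ModelLoaderPrecisionConfiguration.h`. A
minimal sketch (the file name is illustrative):

```
#include <cassert>

#include "glow/Importer/ModelLoaderPrecisionConfiguration.h"

int main() {
  // Equivalent to passing -node-precision-info="precision_profile.yaml"
  // on the command line.
  glow::setModelLoaderPrecisionOpt("precision_profile.yaml");
  assert(glow::modelLoaderPrecisionOptEnabled());
  return 0;
}
```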
16 changes: 16 additions & 0 deletions docs/Optimizations.md
@@ -160,6 +160,22 @@ But in addition to those there are quantization specific optimizations:
possible value from the operand can be calculated based on the quantization
parameters which represent quantization range [min, max] in fp32.

* Quantize(ConvertTo(X)) -> Quantize(X)

A ConvertTo operation followed by a Quantize operation is replaced by a
single Quantize operation.

* ConvertTo(Dequantize(X)) -> Dequantize(X)

A Dequantize operation followed by a ConvertTo operation is replaced by a
single Dequantize operation.

* Dequantize(Quantize(X)) -> ConvertTo(X)

A Quantize operation followed by a Dequantize operation can be replaced with
a ConvertTo operation when there is a mismatch between the element types of
the Quantize node's input and the Dequantize node's output; see the
standalone sketch below.
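
As a rough numeric intuition for the last rewrite, here is a standalone C++
sketch (not Glow code; the scale and offset values are made up) showing that a
quantize/dequantize round trip only perturbs a value by the quantization
error, which is why replacing the pair with a plain element-type conversion is
sound:

```
#include <cmath>
#include <cstdint>
#include <cstdio>

int main() {
  const float scale = 0.1f; // made-up quantization parameters
  const int32_t offset = 0;

  float x = 0.337f; // fp32 input to Quantize
  // Quantize: fp32 -> int8.
  int8_t q = static_cast<int8_t>(std::lround(x / scale) + offset);
  // Dequantize: int8 -> float (standing in for fp16 here).
  float dq = (q - offset) * scale;

  // The pair is replaced by ConvertTo when the input/output element types
  // differ; the value changes only by the quantization error (<= scale/2).
  std::printf("x=%f round-trip=%f error=%f\n", x, dq, x - dq);
  return 0;
}
```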

#### Configuring a graph optimization pipeline

The graph optimizations listed above are each formulated as a FunctionPass,
51 changes: 46 additions & 5 deletions include/glow/Importer/CommonOperatorLoader.h
@@ -396,7 +396,18 @@ class CommonOperatorLoader : public ProtobufLoader {
"Can't register more than outputs in the operation.");
numOutputs = (numOutputs < 0) ? op.output_size() : numOutputs;
for (int i = 0; i < numOutputs; i++) {
nodeValueByName_[op.output(i)] = NodeValue(node, i);
// If the output name is requested to run in FP16 precision and the output
// type is fp16, add a ConvertTo (fp32) node and update the nodeValueByName_
// map so that the next operator in the model receives the appropriate
// NodeValue.
NodeValue output = node->getNthResult(i);
if (isOpRequestedInFP16Precision(op.output(i)) &&
output.getElementType() == ElemKind::Float16Ty) {
std::string opName = op.output(i) + "_convToFP32";
auto *CT = G_->createConvertTo(opName, output, ElemKind::FloatTy);
nodeValueByName_[op.output(i)] = CT->getResult();
} else {
nodeValueByName_[op.output(i)] = NodeValue(node, i);
}
}
return Error::success();
}
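
The guard-and-convert pattern above repeats for each loader in this file. For
readability, here is a minimal sketch of how the `updateNodeValuePrecision`
helper declared in ProtobufLoader.h could encapsulate it (an assumed
implementation; this excerpt of the commit shows only the declaration):

```
// Sketch: if \p name was requested to run in FP16 and \p NV is fp16, append
// a ConvertTo (fp32) node so downstream operators consume fp32 values.
NodeValue ProtobufLoader::updateNodeValuePrecision(llvm::StringRef name,
                                                   NodeValue NV) {
  if (isOpRequestedInFP16Precision(name) &&
      NV.getElementType() == ElemKind::Float16Ty) {
    auto *CT =
        G_->createConvertTo(name.str() + "_convToFP32", NV, ElemKind::FloatTy);
    return CT->getResult();
  }
  return NV;
}
```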
@@ -540,7 +551,14 @@ class CommonOperatorLoader : public ProtobufLoader {

// LRN in Caffe2 has a scale_ output, but I believe it's unused for
// inference. So explicitly only set output 0.
nodeValueByName_[op.output(0)] = N->getResult();
if (isOpRequestedInFP16Precision(op.output(0)) &&
N->getResult().getElementType() == ElemKind::Float16Ty) {
std::string opName = op.output(0) + "_convToFP32";
auto *CT = G_->createConvertTo(opName, N->getResult(), ElemKind::FloatTy);
nodeValueByName_[op.output(0)] = CT->getResult();
} else {
nodeValueByName_[op.output(0)] = N->getResult();
}
return Error::success();
}

@@ -832,7 +850,15 @@
for (int i = 0, e = op.output_size(); i < e; i++) {
// Each output from Split is a SliceNode which only has a single output,
// so only use 0 here as the node value result.
nodeValueByName_[op.output(i)] = outputs[i]->getResult();
if (isOpRequestedInFP16Precision(op.output(i)) &&
outputs[i]->getResult().getElementType() == ElemKind::Float16Ty) {
std::string opName = op.output(i) + "_convToFP32";
auto *CT = G_->createConvertTo(opName, outputs[i]->getResult(),
ElemKind::FloatTy);
nodeValueByName_[op.output(i)] = CT->getResult();
} else {
nodeValueByName_[op.output(i)] = outputs[i]->getResult();
}
}
return Error::success();
}
@@ -912,7 +938,15 @@

// Caffe2 sometimes outputs old_shape which goes unused. We do not currently
// support it, so explicitly only set the first output.
nodeValueByName_[op.output(0)] = node->getResult();
if (isOpRequestedInFP16Precision(op.output(0)) &&
node->getResult().getElementType() == ElemKind::Float16Ty) {
std::string opName = op.output(0) + "_convToFP32";
auto *CT =
G_->createConvertTo(opName, node->getResult(), ElemKind::FloatTy);
nodeValueByName_[op.output(0)] = CT->getResult();
} else {
nodeValueByName_[op.output(0)] = node->getResult();
}
return Error::success();
}

@@ -989,7 +1023,14 @@
}
}

nodeValueByName_[op.output(0)] = in;
if (isOpRequestedInFP16Precision(op.output(0)) &&
in.getElementType() == ElemKind::Float16Ty) {
std::string opName = op.output(0) + "_convToFP32";
auto *CT = G_->createConvertTo(opName, in, ElemKind::FloatTy);
nodeValueByName_[op.output(0)] = CT->getResult();
} else {
nodeValueByName_[op.output(0)] = in;
}
return Error::success();
}

37 changes: 37 additions & 0 deletions include/glow/Importer/ModelLoaderPrecisionConfiguration.h
@@ -0,0 +1,37 @@
#ifndef GLOW_IMPORTER_MODELLOADERPRECISIONCONFIGURATION_H
#define GLOW_IMPORTER_MODELLOADERPRECISIONCONFIGURATION_H

#include "glow/Support/Error.h"

#include "llvm/ADT/APInt.h"

#include <vector>

namespace glow {
/// Holds mixed precision info that can be used across model loaders.
struct ModelLoaderPrecisionConfiguration {
/// Used during operator loading, while constructing the Glow graph, to keep
/// the precision of the named operator instances at FP16 (i.e. quantization
/// conversion is skipped and FP16 conversion is done for any node instances
/// found here). This creates a graph where some nodes execute in quantized or
/// FP32 precision and the remaining ones in FP16 precision. If a node
/// specified via its name is unsupported by the backend in FP16 precision, an
/// exception is thrown. Node instances intended to run in FP16 are given in
/// the YAML file as a list, which maps directly to a vector of strings, so
/// parsing is fast.
std::vector<std::string> fp16OpInstanceNames;
};

/// Sets the model loader precision profile option to the YAML file \p fileName.
void setModelLoaderPrecisionOpt(llvm::StringRef fileName);

/// \returns whether a node precision info file was provided.
bool modelLoaderPrecisionOptEnabled();

/// Deserializes model loader precision info from the YAML file set via
/// setModelLoaderPrecisionOpt().
Expected<ModelLoaderPrecisionConfiguration>
deserializeModelLoaderPrecisionInfosFromYaml();
} // namespace glow

#endif // GLOW_IMPORTER_MODELLOADERPRECISIONCONFIGURATION_H
20 changes: 20 additions & 0 deletions include/glow/Importer/ONNXModelLoader.h
@@ -41,6 +41,11 @@ class TensorProto;

namespace glow {

/// ONNX precision config op types which require special attention while
/// updating precision based on opset. More op types may be added later.
enum class PrecisionConfigSpecialOpType { Resize, NonMaxSuppression };

/// Loads tensor \p T from the input \p in. \p useGlowCustomOps changes the
/// format for doc_string format for adding meta information.
Error loadTensor(const ONNX_NAMESPACE::TensorProto &in, Tensor *T,
@@ -620,6 +625,12 @@ class ONNXModelLoader
uint32_t weightsCount,
const onnxTensorDescriptorV1 *weightDescriptors);

/// Check whether the precision of input \p inputId of an operator of type
/// \p opType can be updated, using
/// \ref inputsPrecisionConfigSpecialOpTypeMap_. Operator inputs that are
/// mapped to attributes while creating the Glow node must be excluded when
/// updating precision.
bool canUpdatePrecision(llvm::StringRef opType, int inputId);

public:
/// \returns ONNX model ir_version;
size_t getIrVersion() const { return irVersion_; };
@@ -696,6 +707,15 @@
BackendSpecificNodeInfo *perNodeOpts_{nullptr};
/// Map from static PH names to the type it was originally loaded with.
std::map<std::string, Type> *staticPlaceholderTypes_;
/// Map from op type to ONNX op type enum for ops in which some inputs are
/// mapped to attributes while creating the Glow node; used while updating
/// input precision.
std::map<std::string, PrecisionConfigSpecialOpType>
inputsPrecisionConfigSpecialOpTypeMap_ = {
{"Resize", PrecisionConfigSpecialOpType::Resize},
{"NonMaxSuppressionV4",
PrecisionConfigSpecialOpType::NonMaxSuppression},
{"NonMaxSuppression",
PrecisionConfigSpecialOpType::NonMaxSuppression}};
};

} // namespace glow
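
For illustration, a sketch of how `canUpdatePrecision` might consult this map
(an assumed implementation: the actual input indices that become attributes
for Resize and NonMaxSuppression are not shown in this excerpt, so the indices
below are illustrative only):

```
bool ONNXModelLoader::canUpdatePrecision(llvm::StringRef opType, int inputId) {
  auto it = inputsPrecisionConfigSpecialOpTypeMap_.find(opType.str());
  if (it == inputsPrecisionConfigSpecialOpTypeMap_.end()) {
    return true; // No special handling; any input may be converted.
  }
  switch (it->second) {
  case PrecisionConfigSpecialOpType::Resize:
    // Assumption: only input 0 (data) stays a graph input; scales/sizes
    // are read into attributes when the Glow node is created.
    return inputId == 0;
  case PrecisionConfigSpecialOpType::NonMaxSuppression:
    // Assumption: boxes/scores (inputs 0-1) are tensors; the threshold
    // inputs are read into attributes.
    return inputId < 2;
  }
  return true;
}
```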
14 changes: 14 additions & 0 deletions include/glow/Importer/ProtobufLoader.h
@@ -20,6 +20,7 @@
#include "glow/Base/Tensor.h"
#include "glow/ExecutionEngine/ExecutionEngine.h"
#include "glow/Graph/Graph.h"
#include "glow/Importer/ModelLoaderPrecisionConfiguration.h"
#include "glow/Optimizer/GraphOptimizer/GraphOptimizer.h"
#include "glow/Support/Error.h"

@@ -234,6 +235,11 @@ class ProtobufLoader {
bool zeroScaleFP16Clip_{false};
/// Whether to clip the range of any loaded qparams to min/max of FP16.
bool clipQuantRangeToFP16_{false};
/// Model loader precision config info.
ModelLoaderPrecisionConfiguration modelLoaderPrecisionConfig_;
/// A vector of operator input and output names used when updating precision,
/// if specified via the node precision info file.
std::vector<std::string> operatorPrecisionUpdateInputOutputNames_;

// Delete all Constants that have no users. This is useful because some
// Constants may have been copied and modified during loading instead of used
@@ -256,6 +262,14 @@
bool isStatic = false, bool isTrainable = false,
const std::string &layout = ANY_LAYOUT);

/// \returns whether the input/output name \p name of an operator requested
/// to run in FP16 is present in \ref operatorPrecisionUpdateInputOutputNames_.
bool isOpRequestedInFP16Precision(llvm::StringRef name) const;

/// \returns \p NV with a ConvertTo node appended to update its precision if
/// \p name is present in \ref operatorPrecisionUpdateInputOutputNames_;
/// otherwise \p NV unchanged.
NodeValue updateNodeValuePrecision(llvm::StringRef name, NodeValue NV);

/// \returns the NodeValue that was registered with the name \p name or
/// a nullptr wrapped in a NodeValue if no node has been registered with this
/// name. Storage NodeValues are always returned if found. Otherwise, if
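
A minimal sketch of how the lookup helper above might be implemented (an
assumed implementation; the corresponding .cpp changes are not shown in this
excerpt):

```
// Sketch: requires <algorithm>.
bool ProtobufLoader::isOpRequestedInFP16Precision(llvm::StringRef name) const {
  // A linear scan is adequate: precision profiles typically list only a
  // handful of instance names.
  return std::find(operatorPrecisionUpdateInputOutputNames_.begin(),
                   operatorPrecisionUpdateInputOutputNames_.end(),
                   name.str()) !=
         operatorPrecisionUpdateInputOutputNames_.end();
}
```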
1 change: 1 addition & 0 deletions include/glow/Optimizer/GraphOptimizer/FunctionPasses.def
@@ -66,6 +66,7 @@ FUN_PASS(QuantizeSwish)
FUN_PASS(ConvertFullyConnectedToConvolution)
FUN_PASS(FoldMinMaxToClip)
FUN_PASS(ReplaceZeroScaleFP16QuantNodes)
FUN_PASS(ConvertBatchNormPrecision)


// NOTE: This pass must be last; it's used to count the total number of passes.
1 change: 1 addition & 0 deletions lib/Importer/CMakeLists.txt
@@ -31,6 +31,7 @@ add_custom_command(
DEPENDS ${CAFFE_HDRS})

add_library(Importer
ModelLoaderPrecisionConfiguration.cpp
ProtobufLoader.cpp
Caffe2ModelLoader.cpp
ONNXModelLoader.cpp
76 changes: 76 additions & 0 deletions lib/Importer/ModelLoaderPrecisionConfiguration.cpp
@@ -0,0 +1,76 @@
#include "glow/Importer/ModelLoaderPrecisionConfiguration.h"
#include "glow/Support/Support.h"

#include "llvm/Support/CommandLine.h"
#include "llvm/Support/FileSystem.h"
#include "llvm/Support/YAMLParser.h"
#include "llvm/Support/YAMLTraits.h"
#include "llvm/Support/raw_ostream.h"

namespace llvm {
namespace yaml {

/// Mapping for ModelLoaderPrecisionConfiguration yaml serializer.
template <> struct MappingTraits<glow::ModelLoaderPrecisionConfiguration> {
static void mapping(IO &io, glow::ModelLoaderPrecisionConfiguration &info) {
io.mapRequired("FP16NodeInstanceNames", info.fp16OpInstanceNames);
}
};

} // end namespace yaml
} // end namespace llvm

namespace glow {

llvm::cl::OptionCategory loaderPrecisionCat("ModelLoader Precision Options");

llvm::cl::opt<std::string> loadModelLoaderPrecisionFileOpt(
"node-precision-info",
llvm::cl::desc("Load model loader precision file which contains\n"
"instances output names to be executed in FP16\n"
"Currently supported only for ONNX models"),
llvm::cl::value_desc("precision_info.yaml"),
llvm::cl::cat(loaderPrecisionCat));

void setModelLoaderPrecisionOpt(llvm::StringRef fileName) {
loadModelLoaderPrecisionFileOpt = fileName;
}

bool modelLoaderPrecisionOptEnabled() {
return !loadModelLoaderPrecisionFileOpt.empty();
}

Expected<ModelLoaderPrecisionConfiguration>
deserializeModelLoaderPrecisionInfosFromYaml() {
ModelLoaderPrecisionConfiguration modelLoaderPrecisionConfig;

llvm::StringRef fileName = loadModelLoaderPrecisionFileOpt;

RETURN_ERR_IF_NOT(llvm::sys::fs::exists(fileName),
"Could not find file with name: " + fileName.str());

// Open YAML input stream.
llvm::ErrorOr<std::unique_ptr<llvm::MemoryBuffer>> text =
llvm::MemoryBuffer::getFileAsStream(fileName);

RETURN_ERR_IF_NOT(!text.getError(),
"Unable to open file with name: " + fileName.str());

std::unique_ptr<llvm::MemoryBuffer> buffer = std::move(*text);
llvm::yaml::Input yin(buffer->getBuffer());

// Error message in case of incorrect precision info format.
std::string ErrMsg =
strFormat("Error reading YAML file '%s'!", fileName.data());

// Read precision info.
yin >> modelLoaderPrecisionConfig;
RETURN_ERR_IF_NOT(!yin.error(), ErrMsg);
return modelLoaderPrecisionConfig;
}

} // namespace glow
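
To close the loop, a minimal usage sketch for the API implemented above
(hypothetical driver code; `EXIT_ON_ERR` is Glow's existing helper for
unwrapping Expected values):

```
#include "glow/Importer/ModelLoaderPrecisionConfiguration.h"
#include "llvm/Support/raw_ostream.h"

using namespace glow;

int main() {
  // Point the loader at a precision profile (illustrative file name).
  setModelLoaderPrecisionOpt("precision_profile.yaml");
  if (!modelLoaderPrecisionOptEnabled()) {
    return 0;
  }
  // Parse FP16NodeInstanceNames into a vector of instance names.
  ModelLoaderPrecisionConfiguration cfg =
      EXIT_ON_ERR(deserializeModelLoaderPrecisionInfosFromYaml());
  for (const auto &name : cfg.fp16OpInstanceNames) {
    llvm::outs() << "FP16 instance: " << name << "\n";
  }
  return 0;
}
```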
