131 changes: 4 additions & 127 deletions lib/Backends/NNPI/Importer.cpp
@@ -1430,74 +1430,12 @@ class RQFCNodeImporter : public INNPINodeImporter {
auto *glowRowwiseFC = llvm::dyn_cast<RowwiseQuantizedFullyConnectedNode>(n);
LOG_AND_RETURN_IF_NOT(ERROR, glowRowwiseFC, "Bad node type",
NNPI_INVALID_PARAM);
LOG_AND_RETURN_IF_NOT(
ERROR, glowRowwiseFC->getInput().getType()->getOffset() == 0.f,
(std::string("Bad input offset value") +
std::to_string(glowRowwiseFC->getInput().getType()->getOffset())),
NNPI_INVALID_PARAM);
LOG_AND_RETURN_IF_NOT(
ERROR, glowRowwiseFC->getResult().getType()->getOffset() == 0.f,
(std::string("Bad result offset value") +
std::to_string(glowRowwiseFC->getResult().getType()->getOffset())),
NNPI_INVALID_PARAM);
LOG_AND_RETURN_IF_NOT(
ERROR,
!(glowRowwiseFC->getOffsets()) ||
importer.zeroes(nodeValueName(glowRowwiseFC->getOffsets()).c_str()),
"Bad offset value", NNPI_INVALID_PARAM);

// Add internal tensor for Symlowp input.
std::string symlowpInputName =
NNPIImporter::internalName_ +
nodeValueName(glowRowwiseFC->getInput()).c_str() + "_symlowp";
auto *inType = glowRowwiseFC->getInput().getType();
LOG_NNPI_IF_ERROR_RETURN_VALUE(
importer.addValue(symlowpInputName, inType,
/* alternativeLayout */ inType->dims().size() == 4,
/* input */ false, /* output */ false, {}, {},
/* forceSymlowp */ true),
"Failed to add value");

// Add internal tensor for Symlowp output.
std::string symlowpOutputName =
NNPIImporter::internalName_ +
nodeValueName(glowRowwiseFC->getResult()).c_str() + "_symlowp";
auto *outType = glowRowwiseFC->getResult().getType();
LOG_NNPI_IF_ERROR_RETURN_VALUE(
importer.addValue(symlowpOutputName, outType,
/* alternativeLayout */ outType->dims().size() == 4,
/* input */ false, /* output */ false, {}, {},
/* forceSymlowp */ true),
"Failed to add value");

// Add convert op from Gemmlowp input to Symlowp.
std::string convertInputName = NNPIImporter::internalName_ +
glowRowwiseFC->getName().begin() +
"_convert_input";
std::string convertInputInputName =
nodeValueName(glowRowwiseFC->getInput());
if (!importer.hasChannelWiseConverter(convertInputInputName)) {
LOG_NNPI_IF_ERROR_RETURN_VALUE(
nnpiNetworkAddConvertOp(
importer.getNetwork(), convertInputName.c_str(),
convertInputInputName.c_str(), symlowpInputName.c_str()),
"Failed to add layer");
importer.addChannelWiseConverter(convertInputInputName);
}

// Add convert op from Symlowp output to Gemmlowp.
std::string convertOutputName = NNPIImporter::internalName_ +
glowRowwiseFC->getName().begin() +
"_convert_output";
std::string convertOutputOutputName =
nodeValueName(glowRowwiseFC->getResult());
LOG_NNPI_IF_ERROR_RETURN_VALUE(
nnpiNetworkAddConvertOp(
importer.getNetwork(), convertOutputName.c_str(),
symlowpOutputName.c_str(), convertOutputOutputName.c_str()),
"Failed to add layer");
importer.addChannelWiseConverter(convertOutputOutputName);

// Create the weights without an offsets tensor.
// Assert weights & biases either have no offsets or all offsets are zero.

@@ -1534,17 +1472,14 @@ class RQFCNodeImporter : public INNPINodeImporter {
nodeValueName(glowRowwiseFC->getInput()),
nodeValueName(glowRowwiseFC->getWeights()),
nodeValueName(glowRowwiseFC->getBias()),
symlowpInputName,
symlowpOutputName,
},
{
nodeValueName(glowRowwiseFC->getResult()),
symlowpInputName,
symlowpOutputName,
});
return nnpiNetworkAddFullyConnectedOp(
importer.getNetwork(), glowRowwiseFC->getName().begin(),
symlowpInputName.c_str(), symlowpOutputName.c_str(),
nodeValueName(glowRowwiseFC->getInput()).c_str(),
nodeValueName(glowRowwiseFC->getResult()).c_str(),
nodeValueName(glowRowwiseFC->getWeights()).c_str(),
glowRowwiseFC->getBias()
? nodeValueName(glowRowwiseFC->getBias()).c_str()
@@ -1560,7 +1495,6 @@ class ChannelwiseQuantizedConvolutionNodeImporter : public INNPINodeImporter {
llvm::dyn_cast<ChannelwiseQuantizedConvolutionNode>(n);
LOG_AND_RETURN_IF_NOT(ERROR, glowChannelwiseQuantizedConv, "Bad node type",
NNPI_INVALID_PARAM);

LOG_AND_RETURN_IF_NOT(
ERROR,
!(glowChannelwiseQuantizedConv->getOffsets()) ||
@@ -1597,60 +1531,6 @@ class ChannelwiseQuantizedConvolutionNodeImporter : public INNPINodeImporter {
glowChannelwiseQuantizedConv->getStrides()[1]};
uint32_t dilation[SPATIAL_DIMS2] = {1, 1}; // No dilation, default values

// Add internal tensor for Symlowp input.
std::string symlowpInputName =
NNPIImporter::internalName_ +
nodeValueName(glowChannelwiseQuantizedConv->getInput()).c_str() +
"_symlowp";
auto *inType = glowChannelwiseQuantizedConv->getInput().getType();
LOG_NNPI_IF_ERROR_RETURN_VALUE(
importer.addValue(symlowpInputName, inType,
/* alternativeLayout */ inType->dims().size() == 4,
/* input */ false, /* output */ false, {}, {},
/* forceSymlowp */ true),
"Failed to add value");

// Add internal tensor for Symlowp output.
std::string symlowpOutputName =
NNPIImporter::internalName_ +
nodeValueName(glowChannelwiseQuantizedConv->getResult()).c_str() +
"_symlowp";
auto *outType = glowChannelwiseQuantizedConv->getResult().getType();
LOG_NNPI_IF_ERROR_RETURN_VALUE(
importer.addValue(symlowpOutputName, outType,
/* alternativeLayout */ outType->dims().size() == 4,
/* input */ false, /* output */ false, {}, {},
/* forceSymlowp */ true),
"Failed to add value");

// Add convert op from Gemmlowp input to Symlowp.
std::string convertInputName =
NNPIImporter::internalName_ +
glowChannelwiseQuantizedConv->getName().begin() + "_convert_input";
std::string convertInputInputName =
nodeValueName(glowChannelwiseQuantizedConv->getInput());
if (!importer.hasChannelWiseConverter(convertInputInputName)) {
LOG_NNPI_IF_ERROR_RETURN_VALUE(
nnpiNetworkAddConvertOp(
importer.getNetwork(), convertInputName.c_str(),
convertInputInputName.c_str(), symlowpInputName.c_str()),
"Failed to add layer");
importer.addChannelWiseConverter(convertInputInputName);
}

// Add convert op from Symlowp output to Gemmlowp.
std::string convertOutputName =
NNPIImporter::internalName_ +
glowChannelwiseQuantizedConv->getName().begin() + "_convert_output";
std::string convertOutputOutputName =
nodeValueName(glowChannelwiseQuantizedConv->getResult());
LOG_NNPI_IF_ERROR_RETURN_VALUE(
nnpiNetworkAddConvertOp(
importer.getNetwork(), convertOutputName.c_str(),
symlowpOutputName.c_str(), convertOutputOutputName.c_str()),
"Failed to add layer");
importer.addChannelWiseConverter(convertOutputOutputName);

// Create the weights without an offsets tensor.
// Assert weights & biases either have no offsets or all offsets are zero.

@@ -1694,18 +1574,15 @@ class ChannelwiseQuantizedConvolutionNodeImporter : public INNPINodeImporter {
nodeValueName(glowChannelwiseQuantizedConv->getInput()),
nodeValueName(glowChannelwiseQuantizedConv->getFilter()),
nodeValueName(glowChannelwiseQuantizedConv->getBias()),
symlowpInputName,
symlowpOutputName,
},
{
nodeValueName(glowChannelwiseQuantizedConv->getResult()),
symlowpInputName,
symlowpOutputName,
});

return nnpiNetworkAddConvolutionOp(
importer.getNetwork(), glowChannelwiseQuantizedConv->getName().begin(),
symlowpInputName.c_str(), symlowpOutputName.c_str(),
nodeValueName(glowChannelwiseQuantizedConv->getInput()).c_str(),
nodeValueName(glowChannelwiseQuantizedConv->getResult()).c_str(),
nodeValueName(glowChannelwiseQuantizedConv->getFilter()).c_str(),
glowChannelwiseQuantizedConv->getBias()
? nodeValueName(glowChannelwiseQuantizedConv->getBias()).c_str()
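The net effect of the Importer.cpp changes above: the internal Symlowp staging tensors and the Gemmlowp-to-Symlowp convert ops are gone, and the quantized FC/conv ops now read and write the original tensors directly. A minimal sketch of the wiring difference, using a hypothetical Op record rather than the real NNPI importer API:

```cpp
#include <iostream>
#include <string>
#include <vector>

// A recorded graph edge: op name plus its input and output tensor names.
struct Op {
  std::string name, input, output;
};

static void print(const std::vector<Op> &graph, const std::string &title) {
  std::cout << title << "\n";
  for (const Op &op : graph)
    std::cout << "  " << op.input << " --[" << op.name << "]--> " << op.output
              << "\n";
}

int main() {
  // Old wiring: every imported quantized FC/conv paid for two convert ops
  // and two internal staging tensors.
  std::vector<Op> before = {
      {"convert_input", "fc_in(gemmlowp)", "fc_in(symlowp)"},
      {"fully_connected", "fc_in(symlowp)", "fc_out(symlowp)"},
      {"convert_output", "fc_out(symlowp)", "fc_out(gemmlowp)"},
  };
  // New wiring: the kernel consumes and produces the original tensors.
  std::vector<Op> after = {
      {"fully_connected", "fc_in(gemmlowp)", "fc_out(gemmlowp)"},
  };
  print(before, "before this change:");
  print(after, "after this change:");
  return 0;
}
```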
lib/Backends/NNPI/InferenceContext.cpp: file mode changed 100755 → 100644 (contents unchanged)
21 changes: 12 additions & 9 deletions lib/Backends/NNPI/InferencePool.cpp
@@ -28,7 +28,7 @@ namespace glow {
namespace runtime {

InferencePoolEnv::InferencePoolEnv()
: numWorkers_(0), deviceOptions_(nullptr), nnpiCompiledFunction_(nullptr),
: deviceOptions_(nullptr), nnpiCompiledFunction_(nullptr),
staticPlaceholderMap_(nullptr) {}

InferencePoolEnv::~InferencePoolEnv() {
@@ -41,8 +41,7 @@ InferencePoolEnv::~InferencePoolEnv() {
}
}

Error InferencePoolEnv::init(unsigned numWorkers, NNPIAdapter adapter,
NNPIDeviceContext device,
Error InferencePoolEnv::init(NNPIAdapter adapter, NNPIDeviceContext device,
std::shared_ptr<NNPIDeviceTracing> deviceTracing,
CompiledFunction *compiledFunction,
StaticPlaceholderMap *staticPlaceholderMap,
@@ -57,20 +56,24 @@ Error InferencePoolEnv::init(unsigned numWorkers, NNPIAdapter adapter,
if (workersPool_) {
return MAKE_ERR("InferencePool already initialized!");
}
numWorkers_ = numWorkers;

nnpiCompiledFunction_ = static_cast<NNPICompiledFunction *>(compiledFunction);
size_t optionsNumWorkers =
nnpiCompiledFunction_->getCompilationOptions().numWorkers;
// ICE-Ref is not re-entrant for the same nnpiNetwork, so use one worker.
size_t numWorkers = deviceOptions_->inferOnDevice ? optionsNumWorkers : 1;
workersPool_ = glow::make_unique<folly::CPUThreadPoolExecutor>(
numWorkers_, std::make_shared<folly::NamedThreadFactory>("NNPI-worker"));
numWorkers, std::make_shared<folly::NamedThreadFactory>("NNPI-worker"));
deviceTracing_ = deviceTracing;
staticPlaceholderMap_ = staticPlaceholderMap;

inferenceContexts_.resize(numWorkers_);
freeContexts_.resize(numWorkers_);
if (inferenceContexts_.size() != numWorkers_) {
inferenceContexts_.resize(numWorkers);
freeContexts_.resize(numWorkers);
if (inferenceContexts_.size() != numWorkers) {
return MAKE_ERR("InferencePool failed to create inference contexts");
}

// Create host network.
nnpiCompiledFunction_ = static_cast<NNPICompiledFunction *>(compiledFunction);
NNPIHostNetwork hostNetwork(NNPI_INVALID_NNPIHANDLE);
if (deviceOptions_->inferOnDevice) {
// Create NNPI host network (load compiled binary).
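The worker-count logic above replaces the old numWorkers_ member and init() parameter: the count now comes from the compiled function's own options on the device path, and collapses to one on ICE-Ref. A minimal sketch of that selection, with pickNumWorkers as an illustrative name rather than a real method:

```cpp
#include <cassert>
#include <cstddef>

// Mirrors the ternary in InferencePoolEnv::init(); the function name is
// illustrative, not part of the real class.
std::size_t pickNumWorkers(bool inferOnDevice, std::size_t optionsNumWorkers) {
  // ICE-Ref (software reference) is not re-entrant for the same nnpiNetwork,
  // so the pool must be serialized to a single worker there.
  return inferOnDevice ? optionsNumWorkers : 1;
}

int main() {
  assert(pickNumWorkers(/*inferOnDevice=*/true, 4) == 4);  // device path
  assert(pickNumWorkers(/*inferOnDevice=*/false, 4) == 1); // ICE-Ref path
  return 0;
}
```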
3 changes: 1 addition & 2 deletions lib/Backends/NNPI/InferencePool.h
@@ -33,7 +33,6 @@ namespace glow {
namespace runtime {
class NNPIDeviceBindings;
class InferencePoolEnv {
unsigned numWorkers_;
std::unique_ptr<folly::CPUThreadPoolExecutor> workersPool_;
std::vector<InferenceContext> inferenceContexts_;
std::vector<InferenceContext *> freeContexts_;
@@ -53,7 +52,7 @@ class InferencePoolEnv {
public:
InferencePoolEnv();
~InferencePoolEnv();
Error init(unsigned numWorkers, NNPIAdapter adapter, NNPIDeviceContext device,
Error init(NNPIAdapter adapter, NNPIDeviceContext device,
std::shared_ptr<NNPIDeviceTracing> deviceTracing,
CompiledFunction *compiledFunction,
StaticPlaceholderMap *staticPlaceholderMap,
30 changes: 19 additions & 11 deletions lib/Backends/NNPI/NNPI.cpp
@@ -198,7 +198,8 @@ bool NNPIBackend::isOpSupported(const NodeInfo &NI) const {
}
return NI.allInputsAndOutputsHaveSameElemKind({ElemKind::Int8QTy},
{ConvolutionNode::BiasIdx}) &&
(NI.getInElemTy(ConvolutionNode::BiasIdx) == ElemKind::Int32QTy);
((NI.getInElemTy(ConvolutionNode::BiasIdx) == ElemKind::Int32QTy) ||
(NI.getInElemTy(ConvolutionNode::BiasIdx) == ElemKind::FloatTy));

case Kinded::Kind::Convolution3DNodeKind:
if (!NI.getInTy(Convolution3DNode::InputIdx)->isQuantizedType()) {
@@ -207,7 +208,9 @@ bool NNPIBackend::isOpSupported(const NodeInfo &NI) const {
}
return NI.allInputsAndOutputsHaveSameElemKind(
{ElemKind::Int8QTy}, {Convolution3DNode::BiasIdx}) &&
(NI.getInElemTy(Convolution3DNode::BiasIdx) == ElemKind::Int32QTy);
((NI.getInElemTy(Convolution3DNode::BiasIdx) ==
ElemKind::Int32QTy) ||
(NI.getInElemTy(Convolution3DNode::BiasIdx) == ElemKind::FloatTy));
case Kinded::Kind::QuantizeNodeKind:
return (NI.getInElemTy(QuantizeNode::InputIdx) == ElemKind::FloatTy ||
NI.getInElemTy(QuantizeNode::InputIdx) == ElemKind::Float16Ty) &&
@@ -238,13 +241,15 @@ bool NNPIBackend::isOpSupported(const NodeInfo &NI) const {
}

case Kinded::Kind::FullyConnectedNodeKind:
if (!NI.getInTy(ConvolutionNode::InputIdx)->isQuantizedType()) {
if (!NI.getInTy(FullyConnectedNode::InputIdx)->isQuantizedType()) {
return NI.allInputsAndOutputsHaveSameElemKind(
{ElemKind::FloatTy, ElemKind::Float16Ty});
}
return NI.allInputsAndOutputsHaveSameElemKind(
{ElemKind::Int8QTy}, {FullyConnectedNode::BiasIdx}) &&
(NI.getInElemTy(FullyConnectedNode::BiasIdx) == ElemKind::Int32QTy);
((NI.getInElemTy(FullyConnectedNode::BiasIdx) ==
ElemKind::Int32QTy) ||
(NI.getInElemTy(FullyConnectedNode::BiasIdx) == ElemKind::FloatTy));

case Kinded::Kind::MaxPoolNodeKind:
return NI.allInputsAndOutputsHaveSameElemKind(
@@ -309,8 +314,10 @@ bool NNPIBackend::isOpSupported(const NodeInfo &NI) const {
ElemKind::FloatTy) &&
(NI.getInElemTy(RowwiseQuantizedFullyConnectedNode::OffsetsIdx) ==
ElemKind::Int32ITy) &&
(NI.getInElemTy(RowwiseQuantizedFullyConnectedNode::BiasIdx) ==
ElemKind::Int32QTy) &&
((NI.getInElemTy(RowwiseQuantizedFullyConnectedNode::BiasIdx) ==
ElemKind::Int32QTy) ||
(NI.getInElemTy(RowwiseQuantizedFullyConnectedNode::BiasIdx) ==
ElemKind::FloatTy)) &&
(NI.getOutElemTy(RowwiseQuantizedFullyConnectedNode::ResultIdx) ==
ElemKind::Int8QTy);

@@ -370,7 +377,8 @@ bool NNPIBackend::isOpSupported(const NodeInfo &NI) const {
auto resultK =
NI.getOutElemTy(EmbeddingBagByteRowwiseOffsetsNode::ResultIdx);
return (dataK == ElemKind::UInt8FusedQTy ||
dataK == ElemKind::UInt8FusedFP16QTy) &&
dataK == ElemKind::UInt8FusedFP16QTy ||
dataK == ElemKind::UInt4FusedFP16QTy) &&
(resultK == ElemKind::FloatTy || resultK == ElemKind::Float16Ty) &&
(indicesK == ElemKind::Int64ITy) && (offsetsK == ElemKind::Int64ITy);
}
@@ -500,6 +508,7 @@ bool NNPIBackend::shouldLower(const Node *N) const {
case Kinded::Kind::AdaptiveAvgPoolNodeKind:
case Kinded::Kind::EmbeddingBagNodeKind:
case Kinded::Kind::EmbeddingBagByteRowwiseOffsetsNodeKind:
case Kinded::Kind::LayerNormalizationNodeKind:
return false;
case Kinded::Kind::FusedRowwiseQuantizedSparseLengthsSumNodeKind: {
const FusedRowwiseQuantizedSparseLengthsSumNode *SLSN =
@@ -510,7 +519,6 @@ bool NNPIBackend::shouldLower(const Node *N) const {
return true;
}
}
case Kinded::Kind::LayerNormalizationNodeKind:
case Kinded::Kind::SparseLengthsSumNodeKind:
// Workaround: lower until ICE-T implements it.
if (NNPIBackend::backendOptions_.useIceT ||
@@ -1133,8 +1141,6 @@ traversePostOrder(const runtime::DAGNode *root,
Error NNPIBackend::bindContexts(
llvm::ArrayRef<runtime::ContextBinding> bindings,
const runtime::DAGNode *root, bool enableP2P, bool enableDRT) {
LOG(INFO) << "enableP2P/DRT not yet implemented. enableDRT = " << enableDRT
<< ", enableP2P = " << enableP2P << ".\n";
if (backendOptions_.dumpRuntime) {
DotWriter::clear();
DotWriter::addSubGraph("Host", "Host");
@@ -1154,10 +1160,12 @@ Error NNPIBackend::bindContexts(
nnpiDM->addPlaceholderUsageCount(cb.networkName, phUsage);
}

for (const auto &usage : phUsage) {
for (auto &usage : phUsage) {
LOG_IF_NOT_RETURN_LLVMERROR(
usage.second.numWriters < 2,
"Multiple writes to the same placeholder not suported");
usage.second.disableP2P = !enableP2P;
usage.second.disableDRT = !enableDRT;
}

for (auto *dagNode : postOrder) {
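Two behavioral changes in NNPI.cpp are easy to miss in the diff: quantized Convolution/Convolution3D/FullyConnected (including the rowwise-quantized variant) now accept a float bias in addition to an int32-quantized one, and bindContexts() now records the enableP2P/enableDRT toggles on each placeholder usage instead of logging them as unimplemented. A reduced sketch under those assumptions; ElemKind and PlaceholderUsage here are trimmed stand-ins for the real Glow types:

```cpp
#include <cassert>

enum class ElemKind { Int8QTy, Int32QTy, FloatTy };

// The relaxed bias predicate applied in isOpSupported() above.
bool isSupportedBiasKind(ElemKind bias) {
  return bias == ElemKind::Int32QTy || bias == ElemKind::FloatTy;
}

struct PlaceholderUsage {
  int numWriters = 0;
  bool disableP2P = false;
  bool disableDRT = false;
};

// bindContexts() now propagates the feature toggles onto each usage entry.
void applyTransferToggles(PlaceholderUsage &usage, bool enableP2P,
                          bool enableDRT) {
  usage.disableP2P = !enableP2P; // P2P: device-to-device transfers
  usage.disableDRT = !enableDRT; // DRT: device-resident tensors
}

int main() {
  assert(isSupportedBiasKind(ElemKind::Int32QTy)); // previously the only kind
  assert(isSupportedBiasKind(ElemKind::FloatTy));  // newly accepted
  assert(!isSupportedBiasKind(ElemKind::Int8QTy));

  PlaceholderUsage usage;
  applyTransferToggles(usage, /*enableP2P=*/false, /*enableDRT=*/true);
  assert(usage.disableP2P && !usage.disableDRT);
  return 0;
}
```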
26 changes: 16 additions & 10 deletions lib/Backends/NNPI/NNPICompiledFunction.cpp
@@ -362,16 +362,16 @@ Error NNPICompiledFunction::compile(Function *F, const BackendOptions &opts) {
compilationFileName_.c_str(), NULL),
"Failed NNPI Compile");
}
}
if (compilationOptions_.inferOnDevice) {
DBG_MEM_USAGE("NNPICompiledFunction destroy network");
// NNPINetwork is not needed anymore on the inference API path.
// Once the compiled stream is loaded, queries on the network can be done
// using the host network instead.
LOG_NNPI_IF_ERROR(nnpiNetworkDestroy(network_),
"Failed NNPI Network Destroy");
network_ = NNPI_INVALID_NNPIHANDLE;
DBG_MEM_USAGE("NNPICompiledFunction destroy network done");
if (compilationOptions_.inferOnDevice) {
DBG_MEM_USAGE("NNPICompiledFunction destroy network");
// NNPINetwork is not needed anymore on the inference API path.
// Once the compiled stream is loaded, queries on the network can be done
// using the host network instead.
LOG_NNPI_IF_ERROR(nnpiNetworkDestroy(network_),
"Failed NNPI Network Destroy");
network_ = NNPI_INVALID_NNPIHANDLE;
DBG_MEM_USAGE("NNPICompiledFunction destroy network done");
}
}

// Determine and save what inputs can be treated as partial. Need to do this
@@ -390,6 +390,12 @@ Error NNPICompiledFunction::compile(Function *F, const BackendOptions &opts) {
return Error::success();
}

NNPICompiledFunction::NNPICompiledFunction(Function *F)
: CompiledFunction(runtime::RuntimeBundle::create(*F)),
compilationOptions_({}) {
std::memset(&config_, 0, sizeof(config_));
}

NNPICompiledFunction::~NNPICompiledFunction() {
if (network_ != NNPI_INVALID_NNPIHANDLE) {
LOG_NNPI_IF_ERROR(nnpiNetworkDestroy(network_),
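The new out-of-line constructor zeroes the NNPI config struct before compilation options are applied; since the config is a plain C struct filled in by the NNPI API, memset gives unset fields well-defined defaults. A minimal sketch, with NNPICompilationConfig as a hypothetical stand-in for the real NNPI type:

```cpp
#include <cstring>

// Hypothetical reduced config; the real type is a plain C struct from the
// NNPI headers.
struct NNPICompilationConfig {
  int numIceCores;
  char compiledFile[256];
};

class CompiledFunctionSketch {
  NNPICompilationConfig config_;

public:
  CompiledFunctionSketch() {
    // Zero-initialize so fields not set by options read as defaults.
    std::memset(&config_, 0, sizeof(config_));
  }
};

int main() {
  CompiledFunctionSketch f; // config_ starts fully zeroed
  (void)f;
  return 0;
}
```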