diff --git a/lib/Backends/NNPI/CMakeLists.txt b/lib/Backends/NNPI/CMakeLists.txt
index b57cace4c3..95c5297e31 100644
--- a/lib/Backends/NNPI/CMakeLists.txt
+++ b/lib/Backends/NNPI/CMakeLists.txt
@@ -83,14 +83,14 @@ if(NOT NNPI_INFERENCE_API)
   message(FATAL_ERROR "nnpi_inference include files  not found at ${NNPI_INF_LIB_DIR}")
 endif()
 
-find_path(NNPI_MG_API nnpiml.h ${NNPI_MG_SEARCH_PATH})
+find_path(NNPI_MG_API nnpi_ice_caps.h ${NNPI_MG_SEARCH_PATH})
 if(NOT NNPI_MG_API)
-  message(FATAL_ERROR "nnpiml include files not found at ${NNPI_MG_API_DIR}")
+  message(FATAL_ERROR "nnpi_ice_caps include files not found at ${NNPI_MG_API_DIR}")
 endif()
 
-find_library(NNPI_MG_LIB nnpiml ${NNPI_MG_LIB_SEARCH_PATH})
+find_library(NNPI_MG_LIB nnpi_icecaps ${NNPI_MG_LIB_SEARCH_PATH})
 if(NOT NNPI_MG_LIB)
-  message(FATAL_ERROR "nnpiml library not found at ${NNPI_MG_LIB_SEARCH_PATH}")
+  message(FATAL_ERROR "nnpi_icecaps library not found at ${NNPI_MG_LIB_SEARCH_PATH}")
 endif()
 
 message(STATUS "[NNPI] NNPI_API_DIR                = ${NNPI_API_DIR}")
diff --git a/lib/Backends/NNPI/ClassGen/NNPISpecificNodes.h b/lib/Backends/NNPI/ClassGen/NNPISpecificNodes.h
index 1ec7c80421..39638f3a6c 100644
--- a/lib/Backends/NNPI/ClassGen/NNPISpecificNodes.h
+++ b/lib/Backends/NNPI/ClassGen/NNPISpecificNodes.h
@@ -26,6 +26,13 @@ BB.newNode("NNPICustomDSP")
     .setDocstring("This is an experimental NNPI-specific node representing a "
                   "custom DSP op");
 
-BB.includeBackendSpecificVerification("glow/NNPISpecificNodesVerification.h");
+BB.newNode("NNPICustomIA")
+    .addMember(MemberType::VectorNodeValue, "Inputs")
+    .addResultFromCtorArg() // Only a single output is used for now.
+    .addMember(MemberType::String, "KernelName")
+    .addMember(MemberType::String, "IAPath")
+    .setDocstring("This is an experimental NNPI-specific node representing a "
+                  "custom IA op");
 
+BB.includeBackendSpecificVerification("glow/NNPISpecificNodesVerification.h");
 #endif // GLOW_WITH_NNPI
diff --git a/lib/Backends/NNPI/ClassGen/NNPISpecificNodesVerification.h b/lib/Backends/NNPI/ClassGen/NNPISpecificNodesVerification.h
index 31c6442c29..97cf81d1a1 100644
--- a/lib/Backends/NNPI/ClassGen/NNPISpecificNodesVerification.h
+++ b/lib/Backends/NNPI/ClassGen/NNPISpecificNodesVerification.h
@@ -19,4 +19,8 @@ bool NNPICustomDSPNode::verify() const {
   return true; // actual verification to happen in the backend
 }
 
+bool NNPICustomIANode::verify() const {
+  return true; // No checks here; verification is left to the backend.
+}
+
 #endif // GLOW_WITH_NNPI
diff --git a/lib/Backends/NNPI/DebugMacros.h b/lib/Backends/NNPI/DebugMacros.h
index 9f6988dc36..83c97b1dfe 100644
--- a/lib/Backends/NNPI/DebugMacros.h
+++ b/lib/Backends/NNPI/DebugMacros.h
@@ -21,6 +21,7 @@
 #include "nnpi_transformer.h"
 #include <chrono>
 #include <glog/logging.h>
+#include <sstream>
 #include <string>
 
 // Macro for memory instrumentation.
@@ -273,4 +274,14 @@ GetNNPIInferenceErrorDesc(NNPIInferenceErrorCode err) {
                "\n";                                                           \
   }
 
+// Logs msg line by line, one LOG(level) entry per line, so that a long
+// multi-line message is not cut off (Glog limits to 30k chars).
+#define LONG_LOG(level, msg)                                                   \
+  {                                                                            \
+    std::istringstream msgStream(msg);                                         \
+    for (std::string msgLine; std::getline(msgStream, msgLine);) {             \
+      LOG(level) << msgLine;                                                   \
+    }                                                                          \
+  }
+
 #endif // GLOW_NNPI_DEBUG_MACROS_H
diff --git a/lib/Backends/NNPI/Importer.cpp b/lib/Backends/NNPI/Importer.cpp
index 6b4578d30b..c5b45475bd 100644
--- a/lib/Backends/NNPI/Importer.cpp
+++ b/lib/Backends/NNPI/Importer.cpp
@@ -23,6 +23,7 @@
 #include "nnpi_transformer.h"
 #include <cmath>
 #include <cstdio>
+#include <fstream>
 #include <limits>
 
 using namespace glow;
@@ -215,7 +216,7 @@ void glow::NNPIImporter::updateDescDimsFromGlow(
     desc.layout = NNPI_LAYOUT_ANY;
     break;
   case 5:
-    desc.layout = NNPI_LAYOUT_ANY;
+    desc.layout = alternativeLayout ? NNPI_LAYOUT_NDHWC : NNPI_LAYOUT_ANY;
     break;
   case 4:
     desc.layout = alternativeLayout ? NNPI_LAYOUT_NHWC : NNPI_LAYOUT_ANY;
@@ -403,6 +404,7 @@ bool glow::NNPIImporter::isVariableUsingAlternativeLayout(Storage *v) {
   for (const auto &user : v->getUsers()) {
     switch (user.getUser()->getKind()) {
     case Kinded::Kind::ConvolutionNodeKind:
+    case Kinded::Kind::Convolution3DNodeKind:
     case Kinded::Kind::AvgPoolNodeKind:
     case Kinded::Kind::MaxPoolNodeKind:
       return true;
@@ -415,6 +417,31 @@ bool glow::NNPIImporter::isVariableUsingAlternativeLayout(Storage *v) {
   return false;
 }
 
+/// Records \p extPath as an IA extension file that is to be loaded later.
+///
+/// \param extPath path of the extension file; it must be non-empty and the
+///        file must be openable for reading when this is called.
+/// \returns NNPI_INVALID_PARAM for an empty path, NNPI_INVALID_RESOURCE_NAME
+///          if the file cannot be opened, NNPI_NO_ERROR otherwise.
+NNPIErrorCode
+glow::NNPIImporter::addIAExtentionPath(const std::string &extPath) {
+  LOG_AND_RETURN_IF(ERROR, extPath.empty(), "Empty IA extension path.",
+                    NNPI_INVALID_PARAM);
+  // Only probe that the file can be opened; its content is not read here.
+  std::ifstream extensionFile(extPath.c_str());
+  LOG_AND_RETURN_IF_NOT(ERROR, extensionFile, "IA extension path not found.",
+                        NNPI_INVALID_RESOURCE_NAME);
+  // Several custom IA nodes may share one extension file: keep each path only
+  // once so that consumers of the list do not handle the same file repeatedly.
+  for (const auto &knownPath : iaExtensionPaths_) {
+    if (knownPath == extPath) {
+      return NNPI_NO_ERROR;
+    }
+  }
+  iaExtensionPaths_.push_back(extPath);
+  return NNPI_NO_ERROR;
+}
+
 NNPINetwork glow::NNPIImporter::importFunction(Function *F,
                                                 const BackendOptions &opts) {
   // Clear internals.
@@ -507,33 +520,45 @@ NNPINetwork glow::NNPIImporter::importFunction(Function *F,
 }
 
 // Node Importers ////////////////////////////////////////////////////////
+template <class ConvType = ConvolutionNode, size_t convDims = 2>
 class ConvolutionNodeImporter : public INNPINodeImporter {
 public:
   NNPIErrorCode importNode(Node *n, NNPIImporter &importer) override {
-    auto *glowConv = llvm::dyn_cast<ConvolutionNode>(n);
+    auto *glowConv = llvm::dyn_cast<ConvType>(n);
+
+    std::string convStr = (convDims == 2) ? "Conv" : "Conv3D";
     LOG_AND_RETURN_IF_NOT(ERROR, glowConv, "Bad node type", NNPI_INVALID_PARAM);
 
-    const uint32_t SPATIAL_DIMS2 = 2;
-    LOG_AND_RETURN_IF_NOT(ERROR, glowConv->getKernels().size() == SPATIAL_DIMS2,
-                          "[Conv] Invalid number of kernel sizes",
+    LOG_AND_RETURN_IF_NOT(ERROR, glowConv->getKernels().size() == convDims,
+                          "[" + convStr + "] Invalid number of kernel sizes",
                           NNPI_INVALID_PARAM);
-    LOG_AND_RETURN_IF_NOT(ERROR,
-                          glowConv->getPads().size() == 2 * SPATIAL_DIMS2,
-                          "[Conv] Invalid number of pads", NNPI_INVALID_PARAM);
-    LOG_AND_RETURN_IF_NOT(ERROR, glowConv->getStrides().size() == SPATIAL_DIMS2,
-                          "[Conv] Invalid number of strides",
+    LOG_AND_RETURN_IF_NOT(ERROR, glowConv->getPads().size() == 2 * convDims,
+                          "[" + convStr + "] Invalid number of pads",
+                          NNPI_INVALID_PARAM);
+    LOG_AND_RETURN_IF_NOT(ERROR, glowConv->getStrides().size() == convDims,
+                          "[" + convStr + "] Invalid number of strides",
                           NNPI_INVALID_PARAM);
 
-    uint32_t kernel[SPATIAL_DIMS2] = {glowConv->getKernels()[0],
-                                      glowConv->getKernels()[1]};
-    uint32_t paddingStart[SPATIAL_DIMS2] = {glowConv->getPads()[0],
-                                            glowConv->getPads()[1]};
-    uint32_t paddingEnd[SPATIAL_DIMS2] = {glowConv->getPads()[2],
-                                          glowConv->getPads()[3]};
-    uint32_t stride[SPATIAL_DIMS2] = {glowConv->getStrides()[0],
-                                      glowConv->getStrides()[1]};
-    uint32_t dilation[SPATIAL_DIMS2] = {glowConv->getDilation(),
-                                        glowConv->getDilation()};
+    uint32_t kernel[convDims];
+    uint32_t paddingStart[convDims];
+    uint32_t paddingEnd[convDims];
+    uint32_t stride[convDims];
+    uint32_t dilation[convDims];
+
+    ConvolutionNode *conv2DNode = llvm::dyn_cast<ConvolutionNode>(glowConv);
+    for (size_t i = 0; i < convDims; i++) {
+      kernel[i] = glowConv->getKernels()[i];
+      stride[i] = glowConv->getStrides()[i];
+      if (conv2DNode) {
+        paddingStart[i] = glowConv->getPads()[i];
+        paddingEnd[i] = glowConv->getPads()[convDims + i];
+        dilation[i] = conv2DNode->getDilation();
+      } else {
+        paddingStart[i] = glowConv->getPads()[i * 2];
+        paddingEnd[i] = glowConv->getPads()[i * 2 + 1];
+        dilation[i] = 1;
+      }
+    }
 
     LOG_NNPI_IF_ERROR_RETURN_VALUE(
         importer.addTensor(nodeValueName(glowConv->getFilter()),
@@ -567,7 +592,7 @@ class ConvolutionNodeImporter : public INNPINodeImporter {
         nodeValueName(glowConv->getFilter()).c_str(),
         glowConv->getBias() ? nodeValueName(glowConv->getBias()).c_str()
                             : nullptr,
-        kernel, paddingStart, paddingEnd, stride, dilation, SPATIAL_DIMS2,
+        kernel, paddingStart, paddingEnd, stride, dilation, convDims,
         glowConv->getGroup());
   }
 };
@@ -1430,74 +1455,12 @@ class RQFCNodeImporter : public INNPINodeImporter {
     auto *glowRowwiseFC = llvm::dyn_cast<RowwiseQuantizedFullyConnectedNode>(n);
     LOG_AND_RETURN_IF_NOT(ERROR, glowRowwiseFC, "Bad node type",
                           NNPI_INVALID_PARAM);
-    LOG_AND_RETURN_IF_NOT(
-        ERROR, glowRowwiseFC->getInput().getType()->getOffset() == 0.f,
-        (std::string("Bad input offset value") +
-         std::to_string(glowRowwiseFC->getInput().getType()->getOffset())),
-        NNPI_INVALID_PARAM);
-    LOG_AND_RETURN_IF_NOT(
-        ERROR, glowRowwiseFC->getResult().getType()->getOffset() == 0.f,
-        (std::string("Bad result offset value") +
-         std::to_string(glowRowwiseFC->getResult().getType()->getOffset())),
-        NNPI_INVALID_PARAM);
     LOG_AND_RETURN_IF_NOT(
         ERROR,
         !(glowRowwiseFC->getOffsets()) ||
             importer.zeroes(nodeValueName(glowRowwiseFC->getOffsets()).c_str()),
         "Bad offset value", NNPI_INVALID_PARAM);
 
-    // Add internal tensor for Symlowp input.
-    std::string symlowpInputName =
-        NNPIImporter::internalName_ +
-        nodeValueName(glowRowwiseFC->getInput()).c_str() + "_symlowp";
-    auto *inType = glowRowwiseFC->getInput().getType();
-    LOG_NNPI_IF_ERROR_RETURN_VALUE(
-        importer.addValue(symlowpInputName, inType,
-                          /* alternativeLayout */ inType->dims().size() == 4,
-                          /* input */ false, /* output */ false, {}, {},
-                          /* forceSymlowp */ true),
-        "Failed to add value");
-
-    // Add internal tensor for Symlowp output.
-    std::string symlowpOutputName =
-        NNPIImporter::internalName_ +
-        nodeValueName(glowRowwiseFC->getResult()).c_str() + "_symlowp";
-    auto *outType = glowRowwiseFC->getResult().getType();
-    LOG_NNPI_IF_ERROR_RETURN_VALUE(
-        importer.addValue(symlowpOutputName, outType,
-                          /* alternativeLayout */ outType->dims().size() == 4,
-                          /* input */ false, /* output */ false, {}, {},
-                          /* forceSymlowp */ true),
-        "Failed to add value");
-
-    // Add convert op from Gemmlowp input to Symlowp.
-    std::string convertInputName = NNPIImporter::internalName_ +
-                                   glowRowwiseFC->getName().begin() +
-                                   "_convert_input";
-    std::string convertInputInputName =
-        nodeValueName(glowRowwiseFC->getInput());
-    if (!importer.hasChannelWiseConverter(convertInputInputName)) {
-      LOG_NNPI_IF_ERROR_RETURN_VALUE(
-          nnpiNetworkAddConvertOp(
-              importer.getNetwork(), convertInputName.c_str(),
-              convertInputInputName.c_str(), symlowpInputName.c_str()),
-          "Failed to add layer");
-      importer.addChannelWiseConverter(convertInputInputName);
-    }
-
-    // Add convert op from Symlowp output to Gemmlowp.
-    std::string convertOutputName = NNPIImporter::internalName_ +
-                                    glowRowwiseFC->getName().begin() +
-                                    "_convert_output";
-    std::string convertOutputOutputName =
-        nodeValueName(glowRowwiseFC->getResult());
-    LOG_NNPI_IF_ERROR_RETURN_VALUE(
-        nnpiNetworkAddConvertOp(
-            importer.getNetwork(), convertOutputName.c_str(),
-            symlowpOutputName.c_str(), convertOutputOutputName.c_str()),
-        "Failed to add layer");
-    importer.addChannelWiseConverter(convertOutputOutputName);
-
     // Create the weights with no offset tensor.
     // Assert weights & biases have no offset or all zeroes.
 
@@ -1534,17 +1497,14 @@ class RQFCNodeImporter : public INNPINodeImporter {
             nodeValueName(glowRowwiseFC->getInput()),
             nodeValueName(glowRowwiseFC->getWeights()),
             nodeValueName(glowRowwiseFC->getBias()),
-            symlowpInputName,
-            symlowpOutputName,
         },
         {
             nodeValueName(glowRowwiseFC->getResult()),
-            symlowpInputName,
-            symlowpOutputName,
         });
     return nnpiNetworkAddFullyConnectedOp(
         importer.getNetwork(), glowRowwiseFC->getName().begin(),
-        symlowpInputName.c_str(), symlowpOutputName.c_str(),
+        nodeValueName(glowRowwiseFC->getInput()).c_str(),
+        nodeValueName(glowRowwiseFC->getResult()).c_str(),
         nodeValueName(glowRowwiseFC->getWeights()).c_str(),
         glowRowwiseFC->getBias()
             ? nodeValueName(glowRowwiseFC->getBias()).c_str()
@@ -1560,7 +1520,6 @@ class ChannelwiseQuantizedConvolutionNodeImporter : public INNPINodeImporter {
         llvm::dyn_cast<ChannelwiseQuantizedConvolutionNode>(n);
     LOG_AND_RETURN_IF_NOT(ERROR, glowChannelwiseQuantizedConv, "Bad node type",
                           NNPI_INVALID_PARAM);
-
     LOG_AND_RETURN_IF_NOT(
         ERROR,
         !(glowChannelwiseQuantizedConv->getFilterOffsets()) ||
@@ -1597,60 +1556,6 @@ class ChannelwiseQuantizedConvolutionNodeImporter : public INNPINodeImporter {
         glowChannelwiseQuantizedConv->getStrides()[1]};
     uint32_t dilation[SPATIAL_DIMS2] = {1, 1}; // No dilation, default values
 
-    // Add internal tensor for Symlowp input.
-    std::string symlowpInputName =
-        NNPIImporter::internalName_ +
-        nodeValueName(glowChannelwiseQuantizedConv->getInput()).c_str() +
-        "_symlowp";
-    auto *inType = glowChannelwiseQuantizedConv->getInput().getType();
-    LOG_NNPI_IF_ERROR_RETURN_VALUE(
-        importer.addValue(symlowpInputName, inType,
-                          /* alternativeLayout */ inType->dims().size() == 4,
-                          /* input */ false, /* output */ false, {}, {},
-                          /* forceSymlowp */ true),
-        "Failed to add value");
-
-    // Add internal tensor for Symlowp output.
-    std::string symlowpOutputName =
-        NNPIImporter::internalName_ +
-        nodeValueName(glowChannelwiseQuantizedConv->getResult()).c_str() +
-        "_symlowp";
-    auto *outType = glowChannelwiseQuantizedConv->getResult().getType();
-    LOG_NNPI_IF_ERROR_RETURN_VALUE(
-        importer.addValue(symlowpOutputName, outType,
-                          /* alternativeLayout */ outType->dims().size() == 4,
-                          /* input */ false, /* output */ false, {}, {},
-                          /* forceSymlowp */ true),
-        "Failed to add value");
-
-    // Add convert op from Gemmlowp input to Symlowp.
-    std::string convertInputName =
-        NNPIImporter::internalName_ +
-        glowChannelwiseQuantizedConv->getName().begin() + "_convert_input";
-    std::string convertInputInputName =
-        nodeValueName(glowChannelwiseQuantizedConv->getInput());
-    if (!importer.hasChannelWiseConverter(convertInputInputName)) {
-      LOG_NNPI_IF_ERROR_RETURN_VALUE(
-          nnpiNetworkAddConvertOp(
-              importer.getNetwork(), convertInputName.c_str(),
-              convertInputInputName.c_str(), symlowpInputName.c_str()),
-          "Failed to add layer");
-      importer.addChannelWiseConverter(convertInputInputName);
-    }
-
-    // Add convert op from Symlowp output to Gemmlowp.
-    std::string convertOutputName =
-        NNPIImporter::internalName_ +
-        glowChannelwiseQuantizedConv->getName().begin() + "_convert_output";
-    std::string convertOutputOutputName =
-        nodeValueName(glowChannelwiseQuantizedConv->getResult());
-    LOG_NNPI_IF_ERROR_RETURN_VALUE(
-        nnpiNetworkAddConvertOp(
-            importer.getNetwork(), convertOutputName.c_str(),
-            symlowpOutputName.c_str(), convertOutputOutputName.c_str()),
-        "Failed to add layer");
-    importer.addChannelWiseConverter(convertOutputOutputName);
-
     // Create the weights with no offset tensor.
     // Assert weights & biases have no offset or all zeroes.
 
@@ -1694,18 +1599,15 @@ class ChannelwiseQuantizedConvolutionNodeImporter : public INNPINodeImporter {
             nodeValueName(glowChannelwiseQuantizedConv->getInput()),
             nodeValueName(glowChannelwiseQuantizedConv->getFilter()),
             nodeValueName(glowChannelwiseQuantizedConv->getBias()),
-            symlowpInputName,
-            symlowpOutputName,
         },
         {
             nodeValueName(glowChannelwiseQuantizedConv->getResult()),
-            symlowpInputName,
-            symlowpOutputName,
         });
 
     return nnpiNetworkAddConvolutionOp(
         importer.getNetwork(), glowChannelwiseQuantizedConv->getName().begin(),
-        symlowpInputName.c_str(), symlowpOutputName.c_str(),
+        nodeValueName(glowChannelwiseQuantizedConv->getInput()).c_str(),
+        nodeValueName(glowChannelwiseQuantizedConv->getResult()).c_str(),
         nodeValueName(glowChannelwiseQuantizedConv->getFilter()).c_str(),
         glowChannelwiseQuantizedConv->getBias()
             ? nodeValueName(glowChannelwiseQuantizedConv->getBias()).c_str()
@@ -1956,6 +1858,54 @@ class BatchOneHotNodeImporter : public INNPINodeImporter {
   }
 };
 
+/// Imports a Glow NNPICustomIANode as an NNPI custom IA op.
+class NNPICustomIANodeImporter : public INNPINodeImporter {
+public:
+  /// Registers the tensors used by \p n with \p importer, records the IA
+  /// extension path of the node and adds the custom IA op to the network.
+  /// \returns NNPI_INVALID_PARAM if \p n is not an NNPICustomIANode or its
+  /// extension path is rejected, else the result of nnpiNetworkAddCustomIAOp.
+  NNPIErrorCode importNode(Node *n, NNPIImporter &importer) override {
+    auto *glowIA = llvm::dyn_cast<NNPICustomIANode>(n);
+    LOG_AND_RETURN_IF_NOT(ERROR, glowIA, "Bad node type", NNPI_INVALID_PARAM);
+
+    // strncpy() leaves the destination unterminated when the source fills it,
+    // so copy at most size - 1 bytes and always write the terminator.
+    auto copyName = [](NNPIObjectName &dst, const std::string &src) {
+      strncpy(dst, src.c_str(), sizeof(NNPIObjectName) - 1);
+      dst[sizeof(NNPIObjectName) - 1] = '\0';
+    };
+
+    // Heap storage is used because variable-length arrays are not standard
+    // C++; an empty input list simply yields an empty array.
+    const auto numInputs = glowIA->getInputs().size();
+    std::unique_ptr<NNPIObjectName[]> inputs(new NNPIObjectName[numInputs]());
+    std::unordered_set<std::string> inputTensors;
+    uint32_t i = 0;
+    for (const auto &nv : glowIA->getInputs()) {
+      const auto nvName = nodeValueName(nv);
+      copyName(inputs[i++], nvName);
+      inputTensors.insert(nvName);
+    }
+
+    // For now a single output is used.
+    constexpr uint32_t numOutputs = 1;
+    NNPIObjectName outputs[numOutputs] = {};
+    const auto resultName = nodeValueName(glowIA->getResult());
+    copyName(outputs[0], resultName);
+    std::unordered_set<std::string> outputTensors = {resultName};
+
+    importer.setUsedTensors(inputTensors, outputTensors);
+    NNPIErrorCode error = importer.addIAExtentionPath(glowIA->getIAPath());
+    LOG_AND_RETURN_IF_NOT(ERROR, error == NNPI_NO_ERROR,
+                          "Failed to store IA extension", NNPI_INVALID_PARAM);
+
+    return nnpiNetworkAddCustomIAOp(
+        importer.getNetwork(), glowIA->getName().begin(), numInputs,
+        inputs.get(), numOutputs, outputs, glowIA->getKernelName().c_str());
+  }
+};
+
 class NNPICustomDSPNodeImporter : public INNPINodeImporter {
 public:
   NNPIErrorCode importNode(Node *n, NNPIImporter &importer) override {
@@ -2079,7 +2017,10 @@ std::unordered_map<
     std::string,
     std::unique_ptr<INNPINodeImporter>>::value_type importerInit[] = {
     {"", nullptr},
-    {"Convolution", glow::make_unique<ConvolutionNodeImporter>()},
+    {"Convolution",
+     glow::make_unique<ConvolutionNodeImporter<ConvolutionNode, 2>>()},
+    {"Convolution3D",
+     glow::make_unique<ConvolutionNodeImporter<Convolution3DNode, 3>>()},
     {"Transpose", glow::make_unique<TransposeNodeImporter>()},
     {"MaxPool",
      glow::make_unique<PoolNodeImporter<glow::MaxPoolNode, NNPI_POOL_MAX>>()},
@@ -2160,6 +2101,7 @@ std::unordered_map<
     {"LengthsRangeFill", glow::make_unique<LengthsRangeFillNodeImporter>()},
     {"BatchOneHot", glow::make_unique<BatchOneHotNodeImporter>()},
     {"NNPICustomDSP", glow::make_unique<NNPICustomDSPNodeImporter>()},
+    {"NNPICustomIA", glow::make_unique<NNPICustomIANodeImporter>()},
     {"SpaceToDepth", glow::make_unique<SpaceToDepthNodeImporter>()},
     {"Clip", glow::make_unique<ClipNodeImporter>()},
     {"BatchNormalization", glow::make_unique<BatchNormalizationNodeImporter>()},
diff --git a/lib/Backends/NNPI/Importer.h b/lib/Backends/NNPI/Importer.h
index 539b7d8887..712bb1c1c8 100644
--- a/lib/Backends/NNPI/Importer.h
+++ b/lib/Backends/NNPI/Importer.h
@@ -106,6 +106,16 @@ class NNPIImporter {
     channelwiseConverters_.emplace(s);
   }
 
+  /// Add a path to an IA extension (that will be loaded by the inference API).
+  /// Fails if no file exists at this path; the validity of the file itself is
+  /// checked only when the extension is loaded.
+  NNPIErrorCode addIAExtentionPath(const std::string &extPath);
+
+  /// Get IA extension paths.
+  const std::vector<std::string> &getIAExtensionPaths() const {
+    return iaExtensionPaths_;
+  }
+
 private:
   /// Map of named external tensors (inputs, outputs, weights, etc...).
   std::unordered_map<std::string, const Tensor *> constants_;
@@ -135,6 +145,9 @@ class NNPIImporter {
   /// an input is feeding into more than one channelwise ops. 2. an output of
   /// one channelwise op is consumed by another channelwise op.
   std::unordered_set<std::string> channelwiseConverters_;
+
+  /// A list of IA extensions that need to be loaded by the device.
+  std::vector<std::string> iaExtensionPaths_;
 };
 
 /// Interface class for all node specific importers.
diff --git a/lib/Backends/NNPI/InferenceContext.cpp b/lib/Backends/NNPI/InferenceContext.cpp
index 6790ebe3ce..68c0d0604c 100644
--- a/lib/Backends/NNPI/InferenceContext.cpp
+++ b/lib/Backends/NNPI/InferenceContext.cpp
@@ -30,7 +30,7 @@ namespace runtime {
 InferenceContext::InferenceContext()
     : nnpiNetwork_(NNPI_INVALID_NNPIHANDLE), device_(NNPI_INVALID_NNPIHANDLE),
       inferCmd_(NNPI_INVALID_NNPIHANDLE), commandList_(NNPI_INVALID_NNPIHANDLE),
-      deviceTracing_(nullptr), deviceOptions_(nullptr) {}
+      deviceOptions_(nullptr) {}
 
 InferenceContext::~InferenceContext() {
   if (deviceOptions_ && deviceOptions_->inferOnDevice) {
@@ -52,7 +52,6 @@ bool InferenceContext::init(
     NNPIDeviceContext device,
     const std::unordered_set<const Placeholder *> &partialInputs,
     const std::unordered_set<const Placeholder *> &staticInputs,
-    std::shared_ptr<NNPIDeviceTracing> deviceTracing,
     StaticPlaceholderMap *staticPlaceholderMap,
     std::shared_ptr<NNPIDeviceOptions> deviceOptions,
     const std::string &functionName, unsigned deviceId,
@@ -63,7 +62,6 @@ bool InferenceContext::init(
   device_ = device;
   compilationConfig_ = config;
   partialInputs_ = &partialInputs;
-  deviceTracing_ = deviceTracing;
   functionName_ = functionName;
 
   // Initialize trace context titles with device ID.
@@ -309,6 +307,8 @@ bool InferenceContext::init(
 void InferenceContext::execute(RunIdentifierTy runId,
                                std::unique_ptr<ExecutionContext> ctx,
                                runtime::ResultCBTy resultCB) {
+  std::string traceBackendExecuteStr =
+      llvm::formatv("{0} {1:x}", traceBackendExecuteContextName_, ctx.get());
 
   std::map<std::string, std::string> attributes;
 
@@ -319,7 +319,7 @@ void InferenceContext::execute(RunIdentifierTy runId,
   }
 
   TRACE_EVENT_SCOPE_NAMED(ctx->getTraceContext(), TraceLevel::REQUEST,
-                          traceBackendExecuteContextName_, traceBlock);
+                          traceBackendExecuteStr, traceBlock);
   for (const auto &iter : attributes) {
     traceBlock.addArg(iter.first, iter.second);
   }
@@ -329,9 +329,6 @@ void InferenceContext::execute(RunIdentifierTy runId,
         llvm::formatv("Inf ctx - device: {0}: {1}", deviceId_, functionName_)
             .str());
   }
-  if (deviceTracing_) {
-    deviceTracing_->start(ctx->getTraceContext(), device_);
-  }
 
   // Pre inference input preparation.
   PlaceholderBindings &bindings = *ctx->getPlaceholderBindings();
@@ -393,15 +390,16 @@ void InferenceContext::execute(RunIdentifierTy runId,
     }
     rawInputs.push_back(in->getHostPtr());
   }
-
+  std::string inferContext = traceInferenceContextName_;
   // Inference.
   if (deviceOptions_->inferOnDevice) {
     if (deviceOptions_->enabledCommandLists < 1) {
       // No command lists (schedule individual commands).
+      inferContext = llvm::formatv("{0} {1:x}", inferContext, inferCmd_);
       TRACE_EVENT_END(ctx->getTraceContext(), TraceLevel::COPY,
                       tracePreProcessContextName_);
       TRACE_EVENT_BEGIN_ATTR(ctx->getTraceContext(), TraceLevel::OPERATOR,
-                             traceInferenceContextName_, attributes);
+                             inferContext, attributes);
       // Queue inference.
       LOG_AND_CALLBACK_EXECUTE_NNPI_INF_IF_ERROR(
           nnpiInferCommandQueue(inferCmd_, 0), "Failed to queue infer command.",
@@ -429,11 +427,11 @@ void InferenceContext::execute(RunIdentifierTy runId,
           usedConfigs++;
         }
       }
-
+      inferContext = llvm::formatv("{0} {1:x}", inferContext, commandList_);
       TRACE_EVENT_END(ctx->getTraceContext(), TraceLevel::COPY,
                       tracePreProcessContextName_);
       TRACE_EVENT_BEGIN_ATTR(ctx->getTraceContext(), TraceLevel::OPERATOR,
-                             traceInferenceContextName_, attributes);
+                             inferContext, attributes);
       // Queue Command list
       LOG_AND_CALLBACK_EXECUTE_NNPI_INF_IF_ERROR(
           nnpiCommandListQueue(commandList_, &(cmdConfigs_.at(0)), usedConfigs),
@@ -482,15 +480,18 @@ void InferenceContext::execute(RunIdentifierTy runId,
   } else if (!deviceOptions_->useIceT) {
     // Infer on ice-ref.
 
+    // Ice-ref is not re-entrant. Drop this lock once ICE-29869 is implemented.
+    static std::mutex icerefMutex;
+    std::lock_guard<std::mutex> guard(icerefMutex);
+
     for (auto &out : outputResources_) {
       // Collect output ptrs for ICE-Ref
       rawOutputs.push_back(out->getHostPtr());
     }
-
     TRACE_EVENT_END(ctx->getTraceContext(), TraceLevel::COPY,
-                    TRACING_PRE_PROCESS);
+                    tracePreProcessContextName_);
     TRACE_EVENT_BEGIN_ATTR(ctx->getTraceContext(), TraceLevel::OPERATOR,
-                           TRACING_INFERENCE, attributes);
+                           inferContext, attributes);
     LOG_AND_CALLBACK_EXECUTE_NNPI_IF_ERROR(
         nnpiNetworkInferOnHost(nnpiNetwork_, &(rawInputs[0]), rawInputs.size(),
                                &(rawOutputs[0]), rawOutputs.size(),
@@ -501,11 +502,9 @@ void InferenceContext::execute(RunIdentifierTy runId,
     // Nothing else to do here.
   }
 
-  TRACE_EVENT_END(ctx->getTraceContext(), TraceLevel::OPERATOR,
-                  traceInferenceContextName_);
+  TRACE_EVENT_END(ctx->getTraceContext(), TraceLevel::OPERATOR, inferContext);
   TRACE_EVENT_BEGIN_ATTR(ctx->getTraceContext(), TraceLevel::COPY,
                          tracePostProcessContextName_, attributes);
-
   // Post inference output handling.
   for (unsigned i = 0, e = outputResources_.size(); i < e; ++i) {
     auto *t = bindings.get(netOutputPlaceholders_[i]);
@@ -518,9 +517,6 @@ void InferenceContext::execute(RunIdentifierTy runId,
 
   TRACE_EVENT_END(ctx->getTraceContext(), TraceLevel::COPY,
                   tracePostProcessContextName_);
-  if (deviceTracing_) {
-    deviceTracing_->stopAndUpdate(ctx->getTraceContext(), device_);
-  }
   TRACE_EVENT_SCOPE_END_NAMED(traceBlock); // we move context in the line below
 
   // Invoke CB.
diff --git a/lib/Backends/NNPI/InferenceContext.h b/lib/Backends/NNPI/InferenceContext.h
index eae9dee660..a67a3d69de 100644
--- a/lib/Backends/NNPI/InferenceContext.h
+++ b/lib/Backends/NNPI/InferenceContext.h
@@ -52,9 +52,6 @@ class InferenceContext {
   /// Set of inputs that are static tensors.
   std::unordered_set<const Placeholder *> staticInputs_;
 
-  /// Device tracing handler.
-  std::shared_ptr<NNPIDeviceTracing> deviceTracing_;
-
   /// NNPI Device configuration.
   std::shared_ptr<NNPIDeviceOptions> deviceOptions_;
 
@@ -95,7 +92,6 @@ class InferenceContext {
             NNPIDeviceContext device,
             const std::unordered_set<const Placeholder *> &partialInputs,
             const std::unordered_set<const Placeholder *> &staticInputs,
-            std::shared_ptr<NNPIDeviceTracing> deviceTracing,
             StaticPlaceholderMap *staticPlaceholderMap,
             std::shared_ptr<NNPIDeviceOptions> deviceOptions,
             const std::string &functionName, unsigned deviceId,
diff --git a/lib/Backends/NNPI/InferencePool.cpp b/lib/Backends/NNPI/InferencePool.cpp
index 4311dc6057..387ecf775b 100644
--- a/lib/Backends/NNPI/InferencePool.cpp
+++ b/lib/Backends/NNPI/InferencePool.cpp
@@ -27,6 +27,34 @@
 namespace glow {
 namespace runtime {
 
+/// \returns true if \p cfg has disableECC == 0 and every PnP hint (ring, ICEBO
+/// and IA frequency priorities, DDR bandwidth) equal to zero.
+static bool isEmptyDeviceNetworkConfig(const NNPIDeviceNetworkConfig &cfg) {
+  if (cfg.disableECC != 0) {
+    return false;
+  }
+
+  const auto &hints = cfg.pnpHints;
+  if (hints.ringFrequencyPrio != 0.f || hints.DDRBandwidth != 0.f) {
+    return false;
+  }
+
+  // Range-based loops cover the fixed-size hint arrays without hand-computed
+  // element counts, so no signed bound is compared to an unsigned index.
+  for (const auto &prio : hints.iceBOFrequencyPrio) {
+    if (prio != 0.f) {
+      return false;
+    }
+  }
+  for (const auto &prio : hints.IAFrequencyPrio) {
+    if (prio != 0.f) {
+      return false;
+    }
+  }
+
+  return true;
+}
+
 InferencePoolEnv::InferencePoolEnv()
     : deviceOptions_(nullptr), nnpiCompiledFunction_(nullptr),
       staticPlaceholderMap_(nullptr) {}
@@ -42,7 +74,6 @@ InferencePoolEnv::~InferencePoolEnv() {
 }
 
 Error InferencePoolEnv::init(NNPIAdapter adapter, NNPIDeviceContext device,
-                             std::shared_ptr<NNPIDeviceTracing> deviceTracing,
                              CompiledFunction *compiledFunction,
                              StaticPlaceholderMap *staticPlaceholderMap,
                              std::shared_ptr<NNPIDeviceOptions> deviceOptions,
@@ -64,7 +95,6 @@ Error InferencePoolEnv::init(NNPIAdapter adapter, NNPIDeviceContext device,
   size_t numWorkers = deviceOptions_->inferOnDevice ? optionsNumWorkers : 1;
   workersPool_ = glow::make_unique<folly::CPUThreadPoolExecutor>(
       numWorkers, std::make_shared<folly::NamedThreadFactory>("NNPI-worker"));
-  deviceTracing_ = deviceTracing;
   staticPlaceholderMap_ = staticPlaceholderMap;
 
   inferenceContexts_.resize(numWorkers);
@@ -76,6 +106,16 @@ Error InferencePoolEnv::init(NNPIAdapter adapter, NNPIDeviceContext device,
   // Create host network.
   NNPIHostNetwork hostNetwork(NNPI_INVALID_NNPIHANDLE);
   if (deviceOptions_->inferOnDevice) {
+    // Load IA extensions.
+    for (auto &extensionPath : nnpiCompiledFunction_->getIAExtensionPaths()) {
+      NNPIExtension ext;
+      LOG_NNPI_INF_IF_ERROR_RETURN_LLVMERROR(
+          nnpiExtensionCreate(extensionPath.c_str(), &ext),
+          "Failed to create NNPI IA Extension object");
+      LOG_NNPI_INF_IF_ERROR_RETURN_LLVMERROR(
+          nnpiDeviceContextLoadExtension(device, ext),
+          "Failed to load NNPI IA Extension object");
+    }
     // Create NNPI host network (load compiled binary).
     auto filename = nnpiCompiledFunction_->getCompilationFilename();
     if (filename.empty()) // Create network from memory.
@@ -106,9 +146,32 @@ Error InferencePoolEnv::init(NNPIAdapter adapter, NNPIDeviceContext device,
     }
 
     DBG_MEM_USAGE("call nnpiDeviceNetworkCreate");
+    NNPIDeviceNetworkConfig cfg =
+        nnpiCompiledFunction_->getDeviceNetworkConfig();
+    NNPIDeviceNetworkConfig *pCfg = nullptr;
+    if (!isEmptyDeviceNetworkConfig(cfg)) {
+      pCfg = &cfg;
+      LOG(INFO) << "DeviceNetwork PnP: "
+                << "\n";
+      LOG(INFO) << "  Ring: " << cfg.pnpHints.ringFrequencyPrio << "\n";
+      LOG(INFO) << "  ICEBO 0: " << cfg.pnpHints.iceBOFrequencyPrio[0] << "\n";
+      LOG(INFO) << "  ICEBO 1: " << cfg.pnpHints.iceBOFrequencyPrio[1] << "\n";
+      LOG(INFO) << "  ICEBO 2: " << cfg.pnpHints.iceBOFrequencyPrio[2] << "\n";
+      LOG(INFO) << "  ICEBO 3: " << cfg.pnpHints.iceBOFrequencyPrio[3] << "\n";
+      LOG(INFO) << "  ICEBO 4: " << cfg.pnpHints.iceBOFrequencyPrio[4] << "\n";
+      LOG(INFO) << "  ICEBO 5: " << cfg.pnpHints.iceBOFrequencyPrio[5] << "\n";
+      LOG(INFO) << "  IA 0: " << cfg.pnpHints.IAFrequencyPrio[0] << "\n";
+      LOG(INFO) << "  IA 1: " << cfg.pnpHints.IAFrequencyPrio[1] << "\n";
+      LOG(INFO) << "  DDR: " << cfg.pnpHints.DDRBandwidth << "\n";
+      LOG(INFO)
+          << "  Resource reservation: "
+          << nnpiCompiledFunction_->getCompilationOptions().reserveResources
+          << "\n";
+    }
+
     // Create NNPI device network (deploy to device).
     LOG_NNPI_INF_IF_ERROR_RETURN_LLVMERROR(
-        nnpiDeviceNetworkCreate(device, hostNetwork, nullptr, &deviceNetwork_),
+        nnpiDeviceNetworkCreate(device, hostNetwork, pCfg, &deviceNetwork_),
         "Failed to create NNPI device network");
     DBG_MEM_USAGE("done nnpiDeviceNetworkCreate");
     if (nnpiCompiledFunction_->getCompilationOptions().reserveResources) {
@@ -187,8 +250,8 @@ Error InferencePoolEnv::init(NNPIAdapter adapter, NNPIDeviceContext device,
         nnpiCompiledFunction_->getCompiledNetworkHandle(),
         nnpiCompiledFunction_->getCompilationConfig(), deviceNetwork_, adapter,
         device, nnpiCompiledFunction_->getPartialInputs(),
-        nnpiCompiledFunction_->getStaticInputs(), deviceTracing_,
-        staticPlaceholderMap_, deviceOptions_, functionName_, deviceId_);
+        nnpiCompiledFunction_->getStaticInputs(), staticPlaceholderMap_,
+        deviceOptions_, functionName_, deviceId_);
     if (!success) {
       return MAKE_ERR("Failed to initialize inferece context");
     }
@@ -264,14 +327,13 @@ InferencePoolEnv::createDetachedInferenceContext(PlaceholderUsageMap &phUsage) {
 
   InferenceContext *infCtx = new InferenceContext();
 
-  if (!infCtx->init(inputDesc_, outputDesc_,
-                    nnpiCompiledFunction_->getCompiledNetworkHandle(),
-                    nnpiCompiledFunction_->getCompilationConfig(),
-                    deviceNetwork_, adapter_, device_,
-                    nnpiCompiledFunction_->getPartialInputs(),
-                    nnpiCompiledFunction_->getStaticInputs(), deviceTracing_,
-                    staticPlaceholderMap_, deviceOptions_, functionName_,
-                    deviceId_, &phUsage)) {
+  if (!infCtx->init(
+          inputDesc_, outputDesc_,
+          nnpiCompiledFunction_->getCompiledNetworkHandle(),
+          nnpiCompiledFunction_->getCompilationConfig(), deviceNetwork_,
+          adapter_, device_, nnpiCompiledFunction_->getPartialInputs(),
+          nnpiCompiledFunction_->getStaticInputs(), staticPlaceholderMap_,
+          deviceOptions_, functionName_, deviceId_, &phUsage)) {
     delete infCtx;
     ASSERT_WITH_MSG(infCtx, "Failed to initialize detached inference context");
     return nullptr;
diff --git a/lib/Backends/NNPI/InferencePool.h b/lib/Backends/NNPI/InferencePool.h
index 2510f96f85..2aa32c45ea 100644
--- a/lib/Backends/NNPI/InferencePool.h
+++ b/lib/Backends/NNPI/InferencePool.h
@@ -38,7 +38,6 @@ class InferencePoolEnv {
   std::vector<InferenceContext *> freeContexts_;
   std::mutex freeContextsLock_;
   NNPIDeviceNetwork deviceNetwork_;
-  std::shared_ptr<NNPIDeviceTracing> deviceTracing_;
   std::shared_ptr<NNPIDeviceOptions> deviceOptions_;
   unsigned deviceId_;
   ResourceDescVec inputDesc_;
@@ -53,7 +52,6 @@ class InferencePoolEnv {
   InferencePoolEnv();
   ~InferencePoolEnv();
   Error init(NNPIAdapter adapter, NNPIDeviceContext device,
-             std::shared_ptr<NNPIDeviceTracing> deviceTracing,
              CompiledFunction *compiledFunction,
              StaticPlaceholderMap *staticPlaceholderMap,
              std::shared_ptr<NNPIDeviceOptions> deviceOptions,
diff --git a/lib/Backends/NNPI/NNPI.cpp b/lib/Backends/NNPI/NNPI.cpp
index 1863c0768b..543c6e5c19 100644
--- a/lib/Backends/NNPI/NNPI.cpp
+++ b/lib/Backends/NNPI/NNPI.cpp
@@ -465,6 +465,7 @@ bool NNPIBackend::isOpSupported(const NodeInfo &NI) const {
            (NI.getInElemTy(BatchOneHotNode::LengthsIdx) == ElemKind::Int32ITy);
 
   case Kinded::Kind::NNPICustomDSPNodeKind:
+  case Kinded::Kind::NNPICustomIANodeKind:
     return true;
 
   case Kinded::Kind::SpaceToDepthNodeKind:
@@ -499,6 +500,7 @@ bool NNPIBackend::shouldLower(const Node *N) const {
   case Kinded::Kind::TanhNodeKind:
   case Kinded::Kind::ReluNodeKind:
   case Kinded::Kind::ConvolutionNodeKind:
+  case Kinded::Kind::Convolution3DNodeKind:
   case Kinded::Kind::TileNodeKind:
   case Kinded::Kind::LogNodeKind:
   case Kinded::Kind::ReplaceNaNNodeKind:
diff --git a/lib/Backends/NNPI/NNPICompiledFunction.cpp b/lib/Backends/NNPI/NNPICompiledFunction.cpp
index 68320e8a7c..e67dfe9edd 100644
--- a/lib/Backends/NNPI/NNPICompiledFunction.cpp
+++ b/lib/Backends/NNPI/NNPICompiledFunction.cpp
@@ -61,6 +61,24 @@ static void trySetDeviceVersion(NNPICompilationOptions &compilationOptions) {
   compilationOptions.deviceVersion.setVal(*devVerOrErr + 1);
 }
 
+/// Build a zeroed device network config carrying the options' PnP hints.
+static NNPIDeviceNetworkConfig parseDeviceNetworkConfig(
+    const glow::NNPICompilationOptions &compilationOptions) {
+  NNPIDeviceNetworkConfig cfg;
+  std::memset(&cfg, 0, sizeof(cfg)); // Fields not set below stay zero.
+  cfg.pnpHints.ringFrequencyPrio = compilationOptions.ringPrio;
+  cfg.pnpHints.iceBOFrequencyPrio[0] = compilationOptions.iceBOPrio0;
+  cfg.pnpHints.iceBOFrequencyPrio[1] = compilationOptions.iceBOPrio1;
+  cfg.pnpHints.iceBOFrequencyPrio[2] = compilationOptions.iceBOPrio2;
+  cfg.pnpHints.iceBOFrequencyPrio[3] = compilationOptions.iceBOPrio3;
+  cfg.pnpHints.iceBOFrequencyPrio[4] = compilationOptions.iceBOPrio4;
+  cfg.pnpHints.iceBOFrequencyPrio[5] = compilationOptions.iceBOPrio5;
+  cfg.pnpHints.IAFrequencyPrio[0] = compilationOptions.iaPrio0;
+  cfg.pnpHints.IAFrequencyPrio[1] = compilationOptions.iaPrio1;
+  cfg.pnpHints.DDRBandwidth = compilationOptions.ddrBandwidth;
+  return cfg;
+}
+
 Error NNPICompiledFunction::updateCompilationConfigFromOptions(
     NNPICompilationOptions &compilationOptions) {
   if (compilationOptions.showVars) {
@@ -103,6 +121,11 @@ Error NNPICompiledFunction::updateCompilationConfigFromOptions(
             compilationOptions.debugCompileConfigFile.get().c_str(),
             sizeof(config_.debugConfigFile));
   }
+
+  config_.disableSLSOnIA = compilationOptions.disableSLSOnIA;
+  config_.enableLightweightCompilation = compilationOptions.lightCompilation;
+  config_.dumpDotFiles = compilationOptions.dumpDotFiles;
+
   return Error::success();
 }
 
@@ -254,6 +277,7 @@ Error NNPICompiledFunction::compile(Function *F, const BackendOptions &opts) {
 
   NNPIImporter importer(compilationOptions_);
   network_ = importer.importFunction(F, newOpts);
+  iaExtensionPaths_ = importer.getIAExtensionPaths();
 
   LOG_IF_INVALID_HANDLE_RETURN_LLVMERROR(network_, "Failed to import function");
   // Setting the network name.
@@ -350,6 +374,18 @@ Error NNPICompiledFunction::compile(Function *F, const BackendOptions &opts) {
                                    compilationFileName_.c_str(), NULL),
           "Failed NNPI Compile");
     }
+
+    // Update compilation info after NNPI compilation.
+    if (compilationOptions_.dumpCompilationInfo ||
+        compilationOptions_.lightCompilation) {
+      if (!updateCompilationInfo()) {
+        // Only issuing a warning (soft fail)
+        LOG(WARNING) << "Failed to update NNPI compilation info";
+      } else if (compilationOptions_.dumpCompilationInfo) {
+        LONG_LOG(INFO, compilationInfo_.dump(networkName));
+      }
+    }
+
     if (compilationOptions_.inferOnDevice) {
       DBG_MEM_USAGE("NNPICompiledFunction destroy network");
       // NNPINetwork is not needed anymore on the inferfence api path.
@@ -375,6 +411,10 @@ Error NNPICompiledFunction::compile(Function *F, const BackendOptions &opts) {
       staticInputs_.insert(P);
     }
   }
+
+  // Update device network config.
+  devNetConfig_ = parseDeviceNetworkConfig(compilationOptions_);
+
   return Error::success();
 }
 
@@ -382,6 +422,7 @@ NNPICompiledFunction::NNPICompiledFunction(Function *F)
     : CompiledFunction(runtime::RuntimeBundle::create(*F)),
       compilationOptions_({}) {
   std::memset(&config_, 0, sizeof(config_));
+  std::memset(&devNetConfig_, 0, sizeof(devNetConfig_));
 };
 
 NNPICompiledFunction::~NNPICompiledFunction() {
@@ -413,3 +454,165 @@ void NNPICompiledFunction::freeCompilationResources() {
   unlockCompiledStream();
   DBG_MEM_USAGE("[After] freeCompilationResources ");
 }
+
+bool NNPICompiledFunction::updateCompilationInfo() {
+  // Clear existing info.
+  compilationInfo_.clear();
+
+  if (network_ == NNPI_INVALID_NNPIHANDLE) {
+    LOG(ERROR) << "Invalid NNPINetwork";
+    return false;
+  }
+
+  // Collect operators.
+  uint64_t numOps = 0;
+  LOG_NNPI_IF_ERROR_RETURN_FALSE(nnpiNetworkGetOpNum(network_, &numOps),
+                                 "Failed to get num ops");
+  for (uint64_t op = 0; op < numOps; op++) {
+    NNPIOpInfo opInfo;
+    LOG_NNPI_IF_ERROR_RETURN_FALSE(nnpiNetworkGetOpInfo(network_, op, &opInfo),
+                                   "Failed to get op info");
+    NNPICompiledOp compiledOp;
+    compiledOp.name = std::string(opInfo.name);
+    compiledOp.type = std::string(opInfo.type);
+    compiledOp.coreIndex = opInfo.coreIndex;
+    compiledOp.iceBo = opInfo.iceBo;
+    compiledOp.execType = opInfo.executionType;
+    for (uint32_t t = 0; t < opInfo.numTensors; t++) {
+      NNPITensorInfo tensorInfo;
+      LOG_NNPI_IF_ERROR_RETURN_FALSE(
+          nnpiNetworkGetOpTensorInfo(network_, op, t, &tensorInfo),
+          "Failed to get tensor info");
+      NNPICompiledTensor compiledTensor;
+      compiledTensor.name = std::string(tensorInfo.name);
+      compiledTensor.type = std::string(tensorInfo.type);
+      compiledTensor.allocType = tensorInfo.allocation;
+      for (uint32_t d = 0; d < tensorInfo.numDims; d++) {
+        compiledTensor.shape.push_back(tensorInfo.dims[d]);
+      }
+      switch (tensorInfo.usage) {
+      case NNPI_TENSOR_USAGE_INPUT:
+        compiledOp.inputs.push_back(compiledTensor);
+        break;
+      case NNPI_TENSOR_USAGE_OUTPUT:
+        compiledOp.outputs.push_back(compiledTensor);
+        break;
+      default:
+        LOG(WARNING) << "Invalid tensor usage";
+        break;
+      }
+    }
+    compilationInfo_.ops.insert({compiledOp.name, compiledOp});
+  }
+
+  // Collect dependencies.
+  uint64_t numDeps = 0;
+  LOG_NNPI_IF_ERROR_RETURN_FALSE(
+      nnpiNetworkGetOpDependenciesNum(network_, &numDeps),
+      "Failed to get num dependencies");
+
+  for (uint64_t dep = 0; dep < numDeps; dep++) {
+    NNPIObjectName src;
+    NNPIObjectName dst;
+    LOG_NNPI_IF_ERROR_RETURN_FALSE(
+        nnpiNetworkGetOpDependency(network_, dep, src, dst),
+        "Failed to get op dependency");
+    compilationInfo_.opDependencies.push_back(
+        {std::string(src), std::string(dst)});
+  }
+
+  return true;
+}
+
+std::string NNPICompiledTensor::dump() const {
+  std::stringstream stream;
+  stream << "name: " << name << ", type: " << type << " (";
+  for (const auto &d : shape) {
+    stream << d << ",";
+  }
+  if (shape.size() > 0) {
+    stream.seekp(-1, stream.cur); // Back up so ')' overwrites the last ','.
+  }
+  stream << "), allocation: ";
+  switch (allocType) {
+  case NNPI_ALLOCATION_DEFAULT:
+    stream << "Default";
+    break;
+  case NNPI_ALLOCATION_DRAM:
+    stream << "DRAM";
+    break;
+  case NNPI_ALLOCATION_ECC_DRAM:
+    stream << "ECC DRAM";
+    break;
+  case NNPI_ALLOCATION_LLC:
+  case NNPI_ALLOCATION_LLC_CLOS0:
+  case NNPI_ALLOCATION_LLC_CLOS1:
+  case NNPI_ALLOCATION_LLC_CLOS2:
+  case NNPI_ALLOCATION_LLC_CLOS3:
+    stream << "LLC"; // All LLC variants (incl. CLOS0-3) share one label.
+    break;
+  case NNPI_ALLOCATION_SRAM:
+    stream << "SRAM";
+    break;
+  case NNPI_ALLOCATION_INTERNAL:
+    stream << "Internal";
+    break;
+  default:
+    stream << "Unknown";
+    break;
+  }
+  return stream.str();
+}
+
+std::string NNPICompiledOp::dump() const {
+  std::stringstream stream;
+  stream << "  [Op] name: " << name << ", type: " << type << ", exec: ";
+  switch (execType) {
+  case NNPI_EXECUTION_IA:
+    stream << "IA";
+    break;
+  case NNPI_EXECUTION_DSP:
+    stream << "DSP";
+    break;
+  case NNPI_EXECUTION_DELPHI:
+    stream << "Delphi";
+    break;
+  case NNPI_EXECUTION_DSE:
+    stream << "DSE";
+    break;
+  case NNPI_EXECUTION_COMBINED:
+    stream << "Combined";
+    break;
+  case NNPI_EXECUTION_NOT_SET:
+    stream << "NotSet";
+    break;
+  default:
+    stream << "Unknown";
+    break;
+  }
+  stream << ", core: " << coreIndex << ", iceBo: " << iceBo << "\n";
+  for (const auto &in : inputs) {
+    stream << "    [Input] " << in.dump() << "\n";
+  }
+  for (const auto &out : outputs) {
+    stream << "    [Output] " << out.dump() << "\n";
+  }
+
+  return stream.str();
+}
+
+std::string NNPICompilationInfo::dump(const std::string &functionName) const {
+  std::stringstream stream;
+  stream << "[Start] NNPI Compilation Info for function: \"" << functionName
+         << "\":\n";
+  for (const auto &op : ops) {
+    stream << op.second.dump();
+  }
+  for (const auto &dep : opDependencies) {
+    stream << "  [Dep] " << dep.first << " -> " << dep.second << "\n";
+  }
+  stream << "[End] NNPI Compilation Info for function: \"" << functionName
+         << "\":\n";
+
+  return stream.str();
+}
diff --git a/lib/Backends/NNPI/NNPICompiledFunction.h b/lib/Backends/NNPI/NNPICompiledFunction.h
index d4838981c2..a9ec1de6b2 100644
--- a/lib/Backends/NNPI/NNPICompiledFunction.h
+++ b/lib/Backends/NNPI/NNPICompiledFunction.h
@@ -21,6 +21,7 @@
 #include "glow/Backend/CompiledFunction.h"
 #include "glow/Backends/BackendOptions.h"
 #include "glow/ExecutionContext/ExecutionContext.h"
+#include "nnpi_inference_types.h"
 #include "nnpi_transformer.h"
 #include <map>
 #include <memory>
@@ -28,6 +29,38 @@
 
 namespace glow {
 
+/// Struct containing details exported for a compiled tensor.
+struct NNPICompiledTensor {
+  std::string name;
+  std::string type;
+  std::vector<uint32_t> shape;
+  NNPI_ALLOCATION_TYPE allocType;
+  std::string dump() const;
+};
+
+/// Struct containing details exported for a compiled operator.
+struct NNPICompiledOp {
+  std::string name;
+  std::string type;
+  NNPI_EXECUTION_TYPE execType;
+  int32_t coreIndex;
+  int32_t iceBo;
+  std::vector<NNPICompiledTensor> inputs;
+  std::vector<NNPICompiledTensor> outputs;
+  std::string dump() const;
+};
+
+/// Collection of exported details for compiled functions.
+struct NNPICompilationInfo {
+  std::map<std::string, NNPICompiledOp> ops;
+  std::vector<std::pair<std::string, std::string>> opDependencies;
+  std::string dump(const std::string &functionName) const;
+  void clear() {
+    ops.clear();
+    opDependencies.clear();
+  }
+};
+
 /// Function "compiled" for execution by the NNPI backend.
 class NNPICompiledFunction final : public CompiledFunction {
 public:
@@ -85,6 +118,18 @@ class NNPICompiledFunction final : public CompiledFunction {
     return outputNames_;
   }
 
+  NNPIDeviceNetworkConfig getDeviceNetworkConfig() const {
+    return devNetConfig_;
+  }
+
+  const std::vector<std::string> &getIAExtensionPaths() const {
+    return iaExtensionPaths_;
+  }
+
+  const NNPICompilationInfo &getCompilationInfo() const {
+    return compilationInfo_;
+  }
+
 private:
   NNPINetwork network_;
   NNPICompilationConfig config_;
@@ -96,6 +141,9 @@ class NNPICompiledFunction final : public CompiledFunction {
   std::string compilationFileName_;
   std::vector<std::string> inputNames_;
   std::vector<std::string> outputNames_;
+  NNPIDeviceNetworkConfig devNetConfig_;
+  std::vector<std::string> iaExtensionPaths_;
+  NNPICompilationInfo compilationInfo_;
 
   Error updateCompilationConfigFromOptions(
       NNPICompilationOptions &compilationOptions);
@@ -105,6 +153,9 @@ class NNPICompiledFunction final : public CompiledFunction {
   Error
   setupCompilationHints(const Function *F,
                         const BackendSpecificNodeInfo &backendSpecificNodeInfo);
+
+  /// Update the internal compilation info object. Return true iff successful.
+  bool updateCompilationInfo();
   ///@}
 };
 } // end namespace glow
diff --git a/lib/Backends/NNPI/NNPIDeviceManager.cpp b/lib/Backends/NNPI/NNPIDeviceManager.cpp
index a3915184f9..995fec18fd 100644
--- a/lib/Backends/NNPI/NNPIDeviceManager.cpp
+++ b/lib/Backends/NNPI/NNPIDeviceManager.cpp
@@ -119,9 +119,6 @@ Error NNPIDeviceManager::init() {
     LOG_NNPI_INF_IF_ERROR_RETURN_LLVMERROR(
         nnpiDeviceContextCreate(adapter_, deviceId_, &device_),
         "Failed to create NNPI Device");
-    if (deviceOptions_->enabledDeviceTracing) {
-      deviceTracing_ = NNPIDeviceTracing::getForDevice(deviceId_);
-    }
     NNPIDeviceInfo deviceInfo;
     LOG_NNPI_INF_IF_ERROR_RETURN_LLVMERROR(
         nnpiDeviceGetInfo(deviceId_, &deviceInfo),
@@ -187,8 +184,8 @@ void NNPIDeviceManager::addNetwork(const Module *module,
     functions_.emplace(func.first, func.second);
     usedMemoryBytes_ += functionCost_; // TODO:: static moduleSize.
     auto err = inferenceEnvs_[func.first].init(
-        adapter_, device_, deviceTracing_, func.second, &staticPlaceholders_,
-        deviceOptions_, func.first, deviceId_);
+        adapter_, device_, func.second, &staticPlaceholders_, deviceOptions_,
+        func.first, deviceId_);
     if (err) {
       functions_.erase(func.first);
       lock.unlock();
@@ -278,7 +275,13 @@ uint64_t NNPIDeviceManager::getAvailableMemory() const {
       LOG_NNPI_INF_IF_ERROR(res, "Failed to read available memory from device.")
       return 0;
     }
-    return static_cast<uint64_t>(devStatus.availableUnprotectedMemory) * KB;
+    const auto availableMem =
+        static_cast<uint64_t>(devStatus.availableUnprotectedMemory) * KB;
+    if (availableMem == 0) {
+      LOG(WARNING) << "NNPI Device " << deviceId_
+                   << " available memory: " << availableMem;
+    }
+    return availableMem;
   }
   auto freeMemory = getMaximumMemory();
   for (const auto &p : functions_) {
@@ -309,8 +312,9 @@ void NNPIDeviceManager::transferStaticPlaceholderToDevice(
 };
 
 Error NNPIDeviceManager::startDeviceTrace(TraceContext *traceContext) {
-  if (!NNPIDeviceTracing::getForDevice(deviceId_)->start(traceContext,
-                                                         device_)) {
+  if (!NNPIDeviceTracing::getForDevice(deviceId_)->start(
+          traceContext, device_, true /* Software traces are always enabled. */,
+          deviceOptions_->hardwareTraces)) {
     return MAKE_ERR("Failed to start NNPI device trace.");
   }
   return Error::success();
diff --git a/lib/Backends/NNPI/NNPIDeviceManager.h b/lib/Backends/NNPI/NNPIDeviceManager.h
index 2ad2a717eb..26fbef6b4a 100644
--- a/lib/Backends/NNPI/NNPIDeviceManager.h
+++ b/lib/Backends/NNPI/NNPIDeviceManager.h
@@ -68,8 +68,6 @@ class NNPIDeviceManager : public DeviceManager {
   NNPIDeviceContext device_;
   /// Lock to synchronize function adding/removing to/from the device manager.
   std::mutex functionMapMutex_;
-  /// Device Tracing control.
-  std::shared_ptr<NNPIDeviceTracing> deviceTracing_;
   /// Static placeholders known by the device manager (the device manager
   /// doesn't own a ref on static resources, only networks added to the device
   /// manager).
diff --git a/lib/Backends/NNPI/NNPIMLTraceWrapper.cpp b/lib/Backends/NNPI/NNPIMLTraceWrapper.cpp
index 6a57d36dee..c3398d533d 100644
--- a/lib/Backends/NNPI/NNPIMLTraceWrapper.cpp
+++ b/lib/Backends/NNPI/NNPIMLTraceWrapper.cpp
@@ -15,6 +15,8 @@
 
 #include "NNPIMLTraceWrapper.h"
 #include "DebugMacros.h"
+#include "nnpi_ice_caps_hwtrace.h"
+#include "nnpi_ice_caps_swtrace.h"
 #include "nnpi_inference.h"
 #include <chrono>
 #include <cstring>
@@ -26,8 +28,7 @@
 #include <unordered_map>
 #include <vector>
 
-#define MAX_TRACE_BUFFER_SIZE (1024 * 1024 * 5)
-#define TRACE_READ_BUFFER_SIZE (1024 * 10)
+#define MAX_TRACE_BUFFER_SIZE (1024 * 1024 * 100)
 
 static inline uint64_t secondsToMicroseconds(double seconds) {
   return (uint64_t)(seconds * 1e6f);
@@ -43,166 +44,30 @@ static uint64_t inline getNow() {
       .count();
 }
 
-enum NNPITraceColumnIndex {
-  NNPI_TRACE_PID_IDX = 0,
-  NNPI_TRACE_CPU_IDX = 1,
-  NNPI_TRACE_FLAG_IDX = 2,
-  NNPI_TRACE_TIMESTAMP_IDX = 3,
-  NNPI_TRACE_FUNCTION_IDX = 4,
-  NNPI_TRACE_DETAILS_IDX = 5
-};
-
-class NNPITraceParser {
-public:
-  void parseLine(std::string line, NNPITraceEntry &entry) {
-    size_t idx = 0;
-    std::istringstream linestream(line);
-    do {
-      std::string part;
-      linestream >> part;
-
-      switch (idx) {
-      case NNPI_TRACE_PID_IDX: {
-        entry.processID = getPID(part);
-        break;
-      }
-      case NNPI_TRACE_CPU_IDX: {
-        entry.cpuID = getCPUID(part);
-        break;
-      }
-      case NNPI_TRACE_FLAG_IDX: {
-        getFlags(part, entry.flags_);
-        break;
-      }
-      case NNPI_TRACE_TIMESTAMP_IDX: {
-        entry.deviceUpTime = getOriginTime(part);
-        entry.hostTime = entry.deviceUpTime;
-        break;
-      }
-      case NNPI_TRACE_FUNCTION_IDX: {
-        entry.traceType = getType(part);
-        break;
-      }
-      case NNPI_TRACE_DETAILS_IDX: {
-        // NNPI_TRACE_MARK lines (identified at NNPI_TRACE_FUNCTION_IDX column)
-        // has a sub level function type.
-        if (entry.traceType == NNPI_TRACE_MARK &&
-            part[part.size() - 1] == ':') {
-          entry.traceType = getType(part);
-          break;
-        }
-        // Not NNPI_TRACE_MARK: consider as params.
-      }
-      default: // Params.
-      {
-        addParam(part, entry);
-      }
-      }
-      idx++;
-    } while (linestream);
-  }
-
-protected:
-  uint32_t getPID(std::string part) {
-    std::istringstream partSplitStream(part);
-    std::string pid;
-    while (std::getline(partSplitStream, pid, '-'))
-      ;
-    return std::stoi(pid);
-  }
-
-  uint32_t getCPUID(std::string part) {
-    std::string cpuStr = part.substr(1, part.size() - 2);
-    return std::stoi(cpuStr);
-  }
-
-  uint64_t getOriginTime(std::string part) {
-    double dNumber = std::stod(part.substr(0, part.size() - 1));
-    return secondsToMicroseconds(dNumber);
-  }
-
-  void getFlags(std::string part, char *flags) {
-    if (part.size() != 4) {
-      return;
-    }
-    part.copy(flags, 4);
-  }
-
-  NNPITraceType getType(std::string part) {
-    if (part == "dma:") {
-      return NNPI_TRACE_DMA;
-    } else if (part == "copy:") {
-      return NNPI_TRACE_COPY;
-    } else if (part == "cmdlist:") {
-      return NNPI_TRACE_CMDLIST;
-    } else if (part == "icedrvExecuteNetwork:") {
-      return NNPI_TRACE_NETEXEC;
-    } else if (part == "runtime-subgraph:") {
-      return NNPI_TRACE_SUBGRAPH;
-    } else if (part == "infreq:") {
-      return NNPI_TRACE_INFER;
-    } else if (part == "clock_sync:") {
-      return NNPI_TRACE_CLOCK_SYNC;
-    } else if (part == "tracing_mark_write:") {
-      return NNPI_TRACE_MARK;
-    } else if (part == "vtune_time_sync:") {
-      return NNPI_TARCE_TIME_SYNC;
-    } else if (part == "runtime-infer-request:") {
-      return NNPI_TRACE_RUNTIME_INFER;
-    } else if (part == "icedrvScheduleJob:") {
-      return NNPI_TRACE_ICED_SCHED_JOB;
-    } else if (part == "icedrvCreateNetwork:") {
-      return NNPI_TARCE_ICED_CREAT_NET;
-    } else if (part == "icedrvNetworkResource:") {
-      return NNPI_TARCE_ICED_NET_RES;
-    } else if (part == "icedrvEventGeneration:") {
-      return NNPI_TARCE_ICED_NET_GEN;
-    } else if (part == "user_data:") {
-      return NNPI_TARCE_USER_DATA;
-    }
-    return NNPI_TRACE_OTHER;
-  }
-
-  bool addParam(std::string part, NNPITraceEntry &entry) {
-    std::string name;
-    std::string value;
-    std::istringstream partSplitStream(part);
-    std::getline(partSplitStream, name, '=');
-    std::getline(partSplitStream, value, '=');
-
-    while (value[value.size() - 1] == ',') {
-      value = value.substr(0, value.size() - 2);
-    }
-    entry.params[name] = value;
-    return true;
-  }
-};
-
-#define NNPI_SOFTWARE_EVENTS                                                   \
-  "cmdlist,copy,cpylist_create,icedrvCreateContext,icedrvCreateNetwork,"       \
-  "icedrvDestroyContext,icedrvDestroyNetwork,icedrvEventGeneration,"           \
-  "icedrvExecuteNetwork,icedrvNetworkResource,icedrvScheduleJob,inf_net_"      \
-  "subres,infreq,runtime_sw_events.runtime.infer,runtime_sw_events.runtime."   \
-  "subgraph,user_data"
+static eIceCapsSwTraceEvent swEventTypes[] = {
+    eIceCapsSwTraceEvent::ICE_CAPS_SW_EVENT_CMDLIST,
+    eIceCapsSwTraceEvent::ICE_CAPS_SW_EVENT_COPY,
+    eIceCapsSwTraceEvent::ICE_CAPS_SW_EVENT_CPYLIST_CREATE,
+    eIceCapsSwTraceEvent::ICE_CAPS_SW_EVENT_ICE_DRV,
+    eIceCapsSwTraceEvent::ICE_CAPS_SW_EVENT_INFR_SUBRES,
+    eIceCapsSwTraceEvent::ICE_CAPS_SW_EVENT_INFR_CREATE,
+    eIceCapsSwTraceEvent::ICE_CAPS_SW_EVENT_INFR_REQ,
+    eIceCapsSwTraceEvent::ICE_CAPS_SW_EVENT_RUNTIME,
+    eIceCapsSwTraceEvent::ICE_CAPS_SW_EVENT_USER_DATA};
 
 NNPITraceContext::NNPITraceContext(unsigned devID)
-    : traceCtx_(0), devID_(devID), devIDSet_(false),
-      events_(NNPI_SOFTWARE_EVENTS) {}
+    : capsSession_(0), devID_(devID), devIDSet_(false) {}
 
 NNPITraceContext::~NNPITraceContext() { destroyInternalContext(); }
 
-bool NNPITraceContext::startCapture(NNPIDeviceContext deviceContext) {
-  if (!createInternalContext()) {
+bool NNPITraceContext::startCapture(NNPIDeviceContext deviceContext,
+                                    bool swTracess, bool hwTraces) {
+  if (!createInternalContext(swTracess, hwTraces)) {
     LOG(WARNING) << "nnpi_trace: Failed to create trace device context.";
     return false;
   }
-  nnpimlTraceOptions traceOptions;
-  std::memset(&traceOptions, 0, sizeof(nnpimlTraceOptions));
-  traceOptions.max_bytes = MAX_TRACE_BUFFER_SIZE;
-  traceOptions.max_bytes_valid = true;
 
-  nnpimlStatus mlStatus =
-      nnpimlTraceStart(traceCtx_, devID_, &traceOptions, events_.c_str());
+  nnpimlStatus mlStatus = nnpiIceCapsStart(capsSession_);
   if (mlStatus != NNPIML_SUCCESS) {
     LOG(WARNING) << "nnpi_trace: Failed to start trace, err=" << mlStatus;
     return false;
@@ -215,88 +80,141 @@ bool NNPITraceContext::startCapture(NNPIDeviceContext deviceContext) {
 }
 
 bool NNPITraceContext::stopCapture(NNPIDeviceContext deviceContext) const {
-  uint32_t outBytes, discardEvents;
   LOG_NNPI_INF_IF_ERROR(
       nnpiDeviceContextTraceUserData(deviceContext, "EN", getNow()),
       "Failed to inject trace timestamp - device trace may not be "
       "synchronized");
-  nnpimlStatus mlStatus =
-      nnpimlTraceStop(traceCtx_, devID_, &outBytes, &discardEvents);
+  nnpimlStatus mlStatus = nnpiIceCapsStop(capsSession_);
   if (mlStatus != NNPIML_SUCCESS) {
     return false;
   }
   return true;
 }
 
-bool NNPITraceContext::readTraceOutput(std::stringstream &inputStream) {
-  char readData[TRACE_READ_BUFFER_SIZE + 1];
-  uint32_t size = TRACE_READ_BUFFER_SIZE;
-  uint32_t actualSize = size;
-  // Read trace bytes into stream.
-  uint32_t offset = 0;
-  while (actualSize >= size) {
-    nnpimlStatus mlStatus =
-        nnpimlTraceRead(traceCtx_, devID_, offset, size, readData, &actualSize);
-    inputStream.write(readData, actualSize);
-    offset += actualSize;
-    if (mlStatus != NNPIML_SUCCESS) {
-      // Failed to read trace.
-      return false;
-    }
+bool NNPITraceContext::readTraceOutput() {
+  nnpimlStatus mlStatus = nnpiIceCapsRead(capsSession_);
+  if (mlStatus != NNPIML_SUCCESS) {
+    // Failed to read trace.
+    LOG(WARNING) << "nnpi_trace: Failed to read traces from device, err="
+                 << mlStatus;
+    return false;
+  }
+  mlStatus = nnpiIceCapsParse(capsSession_);
+  if (mlStatus != NNPIML_SUCCESS) {
+    // Failed to parse trace.
+    LOG(WARNING) << "nnpi_trace: Failed to parse traces on device, err="
+                 << mlStatus;
+    return false;
   }
-  return true;
-}
-
-bool NNPITraceContext::load() {
-  entries_.clear();
-  std::stringstream inputStream;
 
-  if (!readTraceOutput(inputStream)) {
-    destroyInternalContext();
+  mlStatus = nnpiIceCapsProcess(capsSession_);
+  if (mlStatus != NNPIML_SUCCESS) {
+    // Failed to process trace.
+    LOG(WARNING) << "nnpi_trace: Failed to process traces on device, err="
+                 << mlStatus;
+    return false;
+  }
+  size_t entryCount = 0;
+  mlStatus = nnpiIceCapsGetEntriesCount(capsSession_, &entryCount);
+  if (mlStatus != NNPIML_SUCCESS) {
+    // Failed to get trace entry count.
+    LOG(WARNING) << "nnpi_trace: Failed to read traces count, err=" << mlStatus;
     return false;
   }
-  destroyInternalContext();
 
-  // Handle stream.
-  std::string line;
-  NNPITraceParser parser;
   bool started = false;
   uint64_t glowStart = 0;
   uint64_t glowEnd = 0;
-  uint64_t nnpiStart = 0;
-  uint64_t nnpiEnd = 0;
+  uint64_t deviceStart = 0;
+  uint64_t deviceEnd = 0;
+  uint64_t hostStart = 0;
+  uint64_t hostEnd = 0;
+  for (size_t i = 0; i < entryCount; i++) {
+    IceCapsEntry entry;
+    NNPITraceEntry traceEntry;
+    std::stringstream entryStrRep;
+    mlStatus = nnpiIceCapsGetEntry(capsSession_, i, &entry);
+    if (mlStatus != NNPIML_SUCCESS) {
+      // Failed to read trace.
+      LOG(WARNING) << "nnpi_trace: Failed to read trace entries, err="
+                   << mlStatus;
+      return false;
+    }
 
-  while (std::getline(inputStream, line)) {
-    if (line.find("#", 0) == 0) {
-      // Skip comment.
-      continue;
+    // Set parameters.
+    traceEntry.params["name"] = entry.event_name;
+    traceEntry.params["state"] = entry.state;
+    traceEntry.hostTime = entry.timestamp;
+    traceEntry.engineTime = entry.engine_timestamp;
+    traceEntry.params["engine"] =
+        ((entry.engine == eIceCapsEngine::ICE_CAPS_SW_TRACE)
+             ? std::string("SW")
+             : std::string("HW"));
+    traceEntry.params["event_key"] = std::to_string(entry.event_key);
+    traceEntry.params["device_id"] = std::to_string(entry.device_id);
+    traceEntry.params["context_id"] = std::to_string(entry.context_id);
+    traceEntry.params["network_id"] = std::to_string(entry.network_id);
+    traceEntry.params["infer_id"] = std::to_string(entry.infer_id);
+    traceEntry.params["ice_id"] = std::to_string(entry.ice_id);
+    traceEntry.params["core_id"] = std::to_string(entry.core_id);
+    traceEntry.params["network_name"] = entry.network_name;
+    traceEntry.params["kernel_name"] = entry.kernel_name;
+    traceEntry.params["opcode"] = entry.opcode;
+
+    std::stringstream params;
+    for (size_t p = 0; p < entry.params_count; p++) {
+      IceCapsParam param;
+      mlStatus = nnpiIceCapsGetEntryParam(capsSession_, i, p, &param);
+      if (mlStatus != NNPIML_SUCCESS) {
+        // Failed to read params.
+        LOG(WARNING) << "nnpi_trace: Failed to read trace entry params, err="
+                     << mlStatus;
+        break;
+      }
+      traceEntry.params[param.name] = param.value;
+      params << param.name << ":" << param.value << ", ";
     }
-    NNPITraceEntry entry;
-    parser.parseLine(line, entry);
-    if (entry.traceType == NNPI_TARCE_USER_DATA) {
-      if (!started && entry.params["key"] == "BG") {
-        auto p = entry.params["user_data"];
-        glowStart = std::stol(entry.params["user_data"]);
-        nnpiStart = entry.deviceUpTime;
+
+    if (entry.state == "created" || entry.state == "queued" ||
+        entry.state == "req" || entry.state == "add") {
+      entry.state = "q";
+    } else if (entry.state == "executed" || entry.state == "cbs" ||
+               entry.state == "start") {
+      entry.state = "s";
+    } else if (entry.state == "completed" || entry.state == "cbc") {
+      entry.state = "c";
+    }
+    traceEntry.params["state"] = entry.state;
+    entries_.push_back(traceEntry);
+    if (entry.event_name == "user_data" &&
+        traceEntry.params.count("user_data") > 0 &&
+        traceEntry.params.count("key") > 0) {
+      if (!started && traceEntry.params["key"] == "BG") {
+        glowStart = std::stol(traceEntry.params["user_data"]);
+        deviceStart = entry.engine_timestamp;
+        hostStart = entry.timestamp;
         started = true;
-      } else if (entry.params["key"] == "EN") {
-        auto p = entry.params["user_data"];
-        glowEnd = std::stol(entry.params["user_data"]);
-        nnpiEnd = entry.deviceUpTime;
-        started = false;
+      } else if (traceEntry.params["key"] == "EN") {
+        glowEnd = std::stol(traceEntry.params["user_data"]);
+        deviceEnd = entry.engine_timestamp;
+        hostEnd = entry.timestamp;
       }
     }
-    if (started) {
-      entries_.push_back(entry);
-    }
   }
-  if (glowStart > 0 && glowEnd > 0 && nnpiStart > 0 && nnpiEnd > 0) {
-    // Calculate host time function.
-    double m = (double)(glowEnd - glowStart) / (double)(nnpiEnd - nnpiStart);
-    int64_t C = glowStart - m * nnpiStart;
+  // Sync clocks:
+  if (glowStart > 0 && glowEnd > 0 && hostStart > 0 && hostEnd > 0 &&
+      deviceStart > 0 && deviceEnd > 0) {
+    // Calculate host time function for host time.
+    double hostM =
+        (double)(glowEnd - glowStart) / (double)(hostEnd - hostStart);
+    double deviceM =
+        (double)(glowEnd - glowStart) / (double)(deviceEnd - deviceStart);
+    int64_t hostC = glowStart - hostM * hostStart;
+    int64_t deviceC = glowStart - deviceM * deviceStart;
     // Update host time.
     for (NNPITraceEntry &entry : entries_) {
-      entry.hostTime = entry.deviceUpTime * m + C;
+      entry.hostTime = entry.hostTime * hostM + hostC;
+      entry.engineTime = entry.engineTime * deviceM + deviceC;
     }
   } else {
     LOG(WARNING) << "Failed to synchronize glow and nnpi device traces.";
@@ -304,38 +222,100 @@ bool NNPITraceContext::load() {
   return true;
 }
 
+/// Replace entries_ with the traces read by readTraceOutput(), then close
+/// the capture session. The session is closed on success and on failure;
+/// the result of destroyInternalContext() is deliberately not propagated.
+/// \returns true if readTraceOutput() succeeded.
+bool NNPITraceContext::load() {
+  entries_.clear();
+
+  const bool loaded = readTraceOutput();
+  destroyInternalContext();
+  return loaded;
+}
+
 bool NNPITraceContext::destroyInternalContext() {
-  if (traceCtx_ == 0) {
+  if (capsSession_ == 0) {
     return false;
   }
-  nnpimlStatus mlStatus = nnpimlDestroyTraceContext(traceCtx_);
-  traceCtx_ = 0;
+  nnpimlStatus mlStatus = nnpiIceCapsCloseSession(capsSession_);
+  capsSession_ = 0;
   if (mlStatus != NNPIML_SUCCESS) {
-    LOG(WARNING) << "nnpi_trace: Failed to stop device trace, err=" << mlStatus;
-    traceCtx_ = 0;
+    LOG(WARNING) << "nnpi_trace: Failed to stop device trace session, err="
+                 << mlStatus;
+    capsSession_ = 0;
     return false;
   }
 
   return true;
 }
 
-bool NNPITraceContext::createInternalContext() {
-  if (traceCtx_ != 0) {
+bool NNPITraceContext::createInternalContext(bool swTraces, bool hwTraces) {
+  if (capsSession_ != 0) {
     return false;
   }
-  devMask_ = 1UL << devID_;
-  nnpimlStatus mlStatus =
-      nnpimlCreateTraceContext(devMask_, &traceCtx_, &devMask_);
+  nnpimlStatus mlStatus = nnpiIceCapsOpenSession(&capsSession_);
   if (mlStatus != NNPIML_SUCCESS) {
-    LOG(WARNING) << "nnpi_trace: Failed to start device trace, err="
-                 << mlStatus;
-    traceCtx_ = 0;
+    LOG(WARNING) << "nnpi_trace: Failed to trace session, err=" << mlStatus;
+    capsSession_ = 0;
     return false;
   }
-  if (!(1UL << devID_ & devMask_)) {
-    destroyInternalContext();
-    LOG(WARNING) << "nnpi_trace: Cloud not open trace for device " << devID_;
-    return false;
+  devMask_ = 1UL << devID_;
+  if (swTraces) {
+    size_t swEventsCount = sizeof(swEventTypes) / sizeof(swEventTypes[0]);
+    size_t idx = 0;
+    IceCapsSwTraceConfig traceConfigs[1 + swEventsCount];
+    traceConfigs[idx].traceOptions.config_type =
+        eIceCapsSwTraceConfigType::ICE_CAPS_SWTRACE_OPTIONS;
+    traceConfigs[idx].traceOptions.device_mask = devMask_;
+    traceConfigs[idx].traceOptions.max_bytes = MAX_TRACE_BUFFER_SIZE;
+    idx++;
+    for (size_t i = 0; i < swEventsCount; i++) {
+      traceConfigs[idx].traceEvent.config_type =
+          eIceCapsSwTraceConfigType::ICE_CAPS_SWTRACE_EVENT;
+      traceConfigs[idx].traceEvent.event = swEventTypes[i];
+      idx++;
+    }
+
+    IceCapsConfig iceSWCapsConfig;
+    iceSWCapsConfig.engine = eIceCapsEngine::ICE_CAPS_SW_TRACE;
+    iceSWCapsConfig.size = sizeof(traceConfigs);
+    iceSWCapsConfig.buffer = traceConfigs;
+    mlStatus = nnpiIceCapsPrepare(capsSession_, &iceSWCapsConfig);
+    if (mlStatus != NNPIML_SUCCESS) {
+      LOG(WARNING)
+          << "nnpi_trace: Failed to set device Software trace options, err="
+          << mlStatus;
+      destroyInternalContext();
+      return false;
+    }
+  }
+  if (hwTraces) {
+    IceCapsHwTraceConfig traceConfigs[2];
+    traceConfigs[0].traceOptions.config_type =
+        eIceCapsHwTraceConfigType::ICE_CAPS_HWTRACE_OPTIONS;
+    traceConfigs[0].traceOptions.device_mask = devMask_;
+    traceConfigs[0].traceOptions.max_trace_size = MAX_TRACE_BUFFER_SIZE;
+    traceConfigs[1].iceFilter.config_type =
+        eIceCapsHwTraceConfigType::ICE_CAPS_HWTRACE_FILTER;
+    traceConfigs[1].iceFilter.ice_mask = 0xFFF; // All ICEs.
+    traceConfigs[1].iceFilter.filter_type =
+        eIceCapsHwTraceFilter::ICE_CAPS_HWTRACE_CAPTURE_ALL;
+
+    IceCapsConfig iceHWCapsConfig;
+    iceHWCapsConfig.engine = eIceCapsEngine::ICE_CAPS_HW_TRACE;
+    iceHWCapsConfig.size = sizeof(traceConfigs);
+    iceHWCapsConfig.buffer = traceConfigs;
+
+    mlStatus = nnpiIceCapsPrepare(capsSession_, &iceHWCapsConfig);
+    if (mlStatus != NNPIML_SUCCESS) {
+      LOG(WARNING)
+          << "nnpi_trace: Failed to set device Hardware trace options, err="
+          << mlStatus;
+      destroyInternalContext();
+      return false;
+    }
   }
+
   return true;
 }
\ No newline at end of file
diff --git a/lib/Backends/NNPI/NNPIMLTraceWrapper.h b/lib/Backends/NNPI/NNPIMLTraceWrapper.h
index 3d113c5304..42509163b2 100644
--- a/lib/Backends/NNPI/NNPIMLTraceWrapper.h
+++ b/lib/Backends/NNPI/NNPIMLTraceWrapper.h
@@ -17,37 +17,13 @@
 #define NNPI_NNPITRACING_ML_WRAPPER_H
 
 #include <map>
+#include <nnpi_ice_caps.h>
 #include <nnpi_inference.h>
-#include <nnpiml.h>
 #include <vector>
 
-enum NNPITraceType {
-  NNPI_TRACE_UNKNOWN = 0x0000,
-  NNPI_TRACE_DMA = 0x0001,
-  NNPI_TRACE_INFER = 0x0002,
-  NNPI_TRACE_COPY = 0x0004,
-  NNPI_TRACE_MARK = 0x0008,
-  NNPI_TRACE_CLOCK_SYNC = 0x0010,
-  NNPI_TRACE_CMDLIST = 0x0020,
-  NNPI_TRACE_NETEXEC = 0x0040,
-  NNPI_TRACE_SUBGRAPH = 0x0080,
-  NNPI_TARCE_TIME_SYNC = 0x0100,
-  NNPI_TRACE_RUNTIME_INFER = 0x0200,
-  NNPI_TRACE_ICED_SCHED_JOB = 0x0400,
-  NNPI_TARCE_ICED_CREAT_NET = 0x0800,
-  NNPI_TARCE_ICED_NET_RES = 0x1000,
-  NNPI_TARCE_ICED_NET_GEN = 0x1001,
-  NNPI_TARCE_USER_DATA = 0x4000,
-  NNPI_TRACE_OTHER = 0x8000
-};
-
 struct NNPITraceEntry {
-  uint64_t deviceUpTime{0};
+  uint64_t engineTime{0};
   uint64_t hostTime{0};
-  NNPITraceType traceType{NNPI_TRACE_UNKNOWN};
-  uint32_t processID{0};
-  uint32_t cpuID{0};
-  char flags_[4];
   std::map<std::string, std::string> params;
 };
 
@@ -57,7 +33,8 @@ class NNPITraceContext {
   NNPITraceContext(unsigned devID);
   virtual ~NNPITraceContext();
   /// Start capturing traces from the HW device.
-  bool startCapture(NNPIDeviceContext deviceContext);
+  bool startCapture(NNPIDeviceContext deviceContext, bool swTraces,
+                    bool hwTraces);
   /// Start capturing.
   bool stopCapture(NNPIDeviceContext deviceContext) const;
   /// Load traces (valid only after stopCapture()).
@@ -76,14 +53,13 @@ class NNPITraceContext {
 
 private:
   bool destroyInternalContext();
-  bool createInternalContext();
-  bool readTraceOutput(std::stringstream &inputStream);
+  bool createInternalContext(bool swTraces, bool hwTraces);
+  bool readTraceOutput();
 
-  nnpimlTraceContext traceCtx_{0};
+  IceCaps_t capsSession_{0};
   uint64_t devMask_{0};
   unsigned devID_{0};
   bool devIDSet_{false};
-  std::string events_;
   std::vector<NNPITraceEntry> entries_;
 };
 
diff --git a/lib/Backends/NNPI/NNPIOptions.cpp b/lib/Backends/NNPI/NNPIOptions.cpp
index ebf4fb4da9..0d7e690d0f 100644
--- a/lib/Backends/NNPI/NNPIOptions.cpp
+++ b/lib/Backends/NNPI/NNPIOptions.cpp
@@ -79,6 +79,10 @@ template <> unsigned NNPIOptions::getStringAsType<unsigned>(std::string sVal) {
   return 0;
 }
 
+template <> float NNPIOptions::getStringAsType<float>(std::string sVal) {
+  return std::strtof(sVal.c_str(), nullptr);
+}
+
 std::string NNPIOptions::dumpStatus() {
   std::stringstream desc;
   desc << "\nNNPI " << getOptionsName().data() << " variables\n";
diff --git a/lib/Backends/NNPI/NNPIOptions.h b/lib/Backends/NNPI/NNPIOptions.h
index d9c30ddaba..a1c7206e53 100644
--- a/lib/Backends/NNPI/NNPIOptions.h
+++ b/lib/Backends/NNPI/NNPIOptions.h
@@ -80,15 +80,17 @@ class NNPIOptions {
   llvm::StringMap<std::string> supportedOptions_;
 };
 
-/// Explicit forward decleration of template type.
+/// Explicit forward declaration of template type.
 template <> bool NNPIOptions::getStringAsType<bool>(std::string sVal);
-/// Explicit forward decleration of template type.
+/// Explicit forward declaration of template type.
 template <>
 std::string NNPIOptions::getStringAsType<std::string>(std::string sVal);
-/// Explicit forward decleration of template type.
+/// Explicit forward declaration of template type.
 template <> int NNPIOptions::getStringAsType<int>(std::string sVal);
-/// Explicit forward decleration of template type.
+/// Explicit forward declaration of template type.
 template <> unsigned NNPIOptions::getStringAsType<unsigned>(std::string sVal);
+/// Explicit forward declaration of template type.
+template <> float NNPIOptions::getStringAsType<float>(std::string sVal);
 
 #define DECLARE_NNPI_OPTION(VAR_NAME, VAR_TYPE, OPT_NAME, OPT_DESC, OPT_ENV,   \
                             OPT_DEFAULT)                                       \
@@ -257,6 +259,52 @@ class NNPICompilationOptions : public NNPIOptions {
                       "Override the amount of worker threads allocated for the "
                       "network on the device.",
                       "NNPI_NUM_WORKERS", "2");
+  /// Power & Performance hints. See more details at:
+  /// https://github.com/IntelAI/nnpi-sw/blob/master/include/nnpi_inference_types.h
+  DECLARE_NNPI_OPTION(ringPrio, float, "RingPrio",
+                      "Set the ring frequency priority.", "NNPI_RING_PRIO",
+                      "0.f");
+  DECLARE_NNPI_OPTION(iceBOPrio0, float, "IceBOPrio0",
+                      "Set ICE-BO 0 frequency priority.", "NNPI_ICEBO_PRIO0",
+                      "0.f");
+  DECLARE_NNPI_OPTION(iceBOPrio1, float, "IceBOPrio1",
+                      "Set ICE-BO 1 frequency priority.", "NNPI_ICEBO_PRIO1",
+                      "0.f");
+  DECLARE_NNPI_OPTION(iceBOPrio2, float, "IceBOPrio2",
+                      "Set ICE-BO 2 frequency priority.", "NNPI_ICEBO_PRIO2",
+                      "0.f");
+  DECLARE_NNPI_OPTION(iceBOPrio3, float, "IceBOPrio3",
+                      "Set ICE-BO 3 frequency priority.", "NNPI_ICEBO_PRIO3",
+                      "0.f");
+  DECLARE_NNPI_OPTION(iceBOPrio4, float, "IceBOPrio4",
+                      "Set ICE-BO 4 frequency priority.", "NNPI_ICEBO_PRIO4",
+                      "0.f");
+  DECLARE_NNPI_OPTION(iceBOPrio5, float, "IceBOPrio5",
+                      "Set ICE-BO 5 frequency priority.", "NNPI_ICEBO_PRIO5",
+                      "0.f");
+  DECLARE_NNPI_OPTION(iaPrio0, float, "IAPrio0", "Set IA 0 frequency priority.",
+                      "NNPI_IA_PRIO0", "0.f");
+  DECLARE_NNPI_OPTION(iaPrio1, float, "IAPrio1", "Set IA 1 frequency priority.",
+                      "NNPI_IA_PRIO1", "0.f");
+  DECLARE_NNPI_OPTION(ddrBandwidth, float, "DDRBandwidth",
+                      "Set an estimated DDR bandwidth in GB/s.", "NNPI_DDR_BW",
+                      "0.f");
+  /// Disable SLS execution on IA so SLS runs on ICE instead (default: "1").
+  DECLARE_NNPI_OPTION(disableSLSOnIA, bool, "DisableSLSOnIA",
+                      "Disable SLS execution on IA (SLS will execute on ICE).",
+                      "NNPI_DISABLE_SLS_ON_IA", "1");
+  /// Enable lightweight compilation.
+  DECLARE_NNPI_OPTION(lightCompilation, bool, "LightCompilation",
+                      "Enable light compilation (only for gathering metadata).",
+                      "NNPI_LIGHT_COMPILATION", "0");
+  /// Dump compiler DOT files.
+  DECLARE_NNPI_OPTION(dumpDotFiles, bool, "DumpDotFiles",
+                      "Dump Dot files of the network during compilation.",
+                      "NNPI_DUMP_DOT", "0");
+  /// Dump compilation info.
+  DECLARE_NNPI_OPTION(dumpCompilationInfo, bool, "dumpCompilationInfo",
+                      "Dump the compilation info in text form.",
+                      "NNPI_DUMP_COMP_INFO", "0");
 
   NNPICompilationOptions(const BackendSpecificOptions &parameters) {
     INIT_NNPI_OPTIONS(useIceT, parameters);
@@ -274,6 +322,20 @@ class NNPICompilationOptions : public NNPIOptions {
     INIT_NNPI_OPTIONS(disableConstFolding, parameters);
     INIT_NNPI_OPTIONS(numWorkers, parameters);
     setLogLevel(this->compilationLogLevel);
+    INIT_NNPI_OPTIONS(ringPrio, parameters);
+    INIT_NNPI_OPTIONS(iceBOPrio0, parameters);
+    INIT_NNPI_OPTIONS(iceBOPrio1, parameters);
+    INIT_NNPI_OPTIONS(iceBOPrio2, parameters);
+    INIT_NNPI_OPTIONS(iceBOPrio3, parameters);
+    INIT_NNPI_OPTIONS(iceBOPrio4, parameters);
+    INIT_NNPI_OPTIONS(iceBOPrio5, parameters);
+    INIT_NNPI_OPTIONS(iaPrio0, parameters);
+    INIT_NNPI_OPTIONS(iaPrio1, parameters);
+    INIT_NNPI_OPTIONS(ddrBandwidth, parameters);
+    INIT_NNPI_OPTIONS(disableSLSOnIA, parameters);
+    INIT_NNPI_OPTIONS(lightCompilation, parameters);
+    INIT_NNPI_OPTIONS(dumpDotFiles, parameters);
+    INIT_NNPI_OPTIONS(dumpCompilationInfo, parameters);
   }
 
   virtual llvm::StringRef getOptionsName() const override {
@@ -315,12 +377,11 @@ class NNPIDeviceOptions : public NNPIOptions {
   DECLARE_NNPI_OPTION(deviceId, int, "DeviceID",
                       "Override the target device ID used to run (0,1,...).",
                       "NNPI_DEVICE_ID", "-1");
-  /// Setting this variable will enabled device tracing (host2device,
-  /// device2host copy infer etc.).
-  DECLARE_NNPI_OPTION(
-      enabledDeviceTracing, bool, "DeviceTracing",
-      "Enabled device tracing (host2device, device2host copy infer etc.).",
-      "NNPI_DEVICE_TRACING", "0");
+  /// Enable Hardware Trace.
+  DECLARE_NNPI_OPTION(hardwareTraces, bool, "hardwareTraces",
+                      "Enable hardware traces when device traces are started "
+                      "(default is disabled).",
+                      "NNPI_HW_TRACES", "0");
   /// Override the max NNPI device memory.
   DECLARE_NNPI_OPTION(
       deviceMemory, unsigned, "DeviceMemory",
@@ -362,7 +423,7 @@ class NNPIDeviceOptions : public NNPIOptions {
     INIT_NNPI_OPTIONS(inferOnDevice, parameters);
     INIT_NNPI_OPTIONS(showVars, parameters);
     INIT_NNPI_OPTIONS(deviceId, parameters);
-    INIT_NNPI_OPTIONS(enabledDeviceTracing, parameters);
+    INIT_NNPI_OPTIONS(hardwareTraces, parameters);
     INIT_NNPI_OPTIONS(deviceMemory, parameters);
     INIT_NNPI_OPTIONS(enabledCommandLists, parameters);
     INIT_NNPI_OPTIONS(dumpIOtoFiles, parameters);
diff --git a/lib/Backends/NNPI/NNPITracing.cpp b/lib/Backends/NNPI/NNPITracing.cpp
index 40281fd2f0..9d07e8e25c 100644
--- a/lib/Backends/NNPI/NNPITracing.cpp
+++ b/lib/Backends/NNPI/NNPITracing.cpp
@@ -21,148 +21,139 @@
 
 using namespace glow;
 
-NNPIDeviceTracing::NNPIDeviceTracing(unsigned deviceID) {
-  traceCtx_ = glow::make_unique<NNPITraceContext>(deviceID);
+std::map<std::string, int> NNPIDeviceTracing::activeAffinities_ = {};
+
+NNPIDeviceTracing::NNPIDeviceTracing(unsigned deviceId) : deviceId_(deviceId) {
+  traceCtx_ = glow::make_unique<NNPITraceContext>(deviceId_);
   deviceInfo_ =
-      std::string("[Device #") + std::to_string(deviceID) + std::string("] ");
+      std::string("[Device #") + std::to_string(deviceId_) + std::string("] ");
 }
 
 bool NNPIDeviceTracing::start(TraceContext *traceContext,
-                              NNPIDeviceContext deviceContext) {
+                              NNPIDeviceContext deviceContext, bool swTraces,
+                              bool hwTraces) {
   if (!traceContext ||
       !traceContext->shouldLog(TraceEvent::TraceLevel::OPERATOR)) {
     return false;
   }
   if (started_.test_and_set()) {
-    ASSERT_WITH_MSG(glowTraceCtx_ != traceContext,
-                    "Trying to start tracing for an already started context.");
     // Trace already started.
     return false;
   }
-  glowTraceCtx_ = traceContext;
-  if (!traceCtx_->startCapture(deviceContext)) {
-    LOG(WARNING) << "Failed to start trace capture";
+  bool isFirstToStart = NNPIDeviceTracing::isFirstToChangeCaptureStart(true);
+  if (!traceCtx_->startCapture(deviceContext, swTraces, hwTraces)) {
+    LOG(WARNING) << "Failed to start trace capture for device " << deviceId_
+                 << " is first = " << (isFirstToStart);
     return false;
   }
   return true;
 }
 
 std::string NNPIDeviceTracing::getEntryName(NNPITraceEntry &entry) {
-  std::stringstream name;
-  name << deviceInfo_;
-  switch (entry.traceType) {
-  case NNPI_TRACE_UNKNOWN:
-    name << "UnknownTrace";
-    break;
-  case NNPI_TRACE_DMA:
-    name << "DMA";
-    break;
-  case NNPI_TRACE_INFER:
-    name << "Infer";
-    break;
-  case NNPI_TRACE_COPY:
-    name << "Copy";
-    break;
-  case NNPI_TRACE_MARK:
-    name << "MarkTrace";
-    break;
-  case NNPI_TRACE_CLOCK_SYNC:
-    name << "ClockSync";
-    break;
-  case NNPI_TRACE_CMDLIST:
-    name << "CommandList";
-    break;
-  case NNPI_TRACE_NETEXEC:
-    name << "NetExecute";
-    break;
-  case NNPI_TRACE_SUBGRAPH:
-    name << "SubGraph";
-    break;
-  case NNPI_TRACE_RUNTIME_INFER:
-    name << "RunTimeInf";
-    break;
-  case NNPI_TRACE_ICED_SCHED_JOB:
-    name << "DSchedJob";
-    break;
-  case NNPI_TARCE_ICED_CREAT_NET:
-    name << "DCreateNet";
-    break;
-  case NNPI_TARCE_ICED_NET_RES:
-    name << "DNetRes";
-    break;
-  case NNPI_TARCE_ICED_NET_GEN:
-    name << "DNetGen";
-    break;
-  default:
-    name << "Othertrace";
+  std::string entryName = entry.params["name"];
+  if (entryName.rfind("icedrv", 0) == 0) {
+    entryName = entryName.substr(strlen("icedrv"));
+  } else if (entryName.rfind("runtime-", 0) == 0) {
+    entryName = entryName.substr(strlen("runtime-"));
+  }
+  if (entry.params.count("command") > 0) {
+    entryName = entry.params["command"];
   }
+
+  std::stringstream name;
+
+  name << entryName;
   if (entry.params.count("isC2H") > 0) {
     if (entry.params["isC2H"] == "1") {
-      name << "-Card2Host";
+      name << " Card2Host";
     } else {
-      name << "-Host2Card";
+      name << " Host2Card";
     }
   }
   auto params = entry.params;
-  if (entry.params.count("iceId") > 0) {
-    name << "-ICE_" << entry.params["iceId"];
+  if (entry.params.count("ice_id") > 0) {
+    name << " ICE_" << entry.params["ice_id"];
   }
-  if (entry.params.count("netID") > 0) {
-    name << "-NET_" << entry.params["netID"];
+  if (entry.params.count("network_id") > 0) {
+    name << " Net " << entry.params["network_id"];
   }
-  if (entry.params.count("reqID") > 0) {
-    name << "REQ_" << entry.params["reqID"];
+  if (entry.params.count("network_name") > 0 &&
+      entry.params["network_name"] != "NA") {
+    name << " NetName " << entry.params["network_name"];
   }
-  if (entry.params.count("ctxID") > 0) {
-    name << "-CTX_" << entry.params["ctxID"];
+  if (entry.params.count("context_id") > 0) {
+    name << " CTX 0x" << std::hex << std::stol(entry.params["context_id"]);
   }
   if (entry.params.count("subNetId") > 0) {
-    name << "-SUBNET_" << entry.params["subNetId"];
+    name << " Subnet " << entry.params["subNetId"];
   }
-  if (entry.params.count("inferID") > 0) {
-    name << "-INFR_" << entry.params["inferID"];
+  if (entry.params.count("infer_id") > 0) {
+    name << " InfID " << entry.params["infer_id"];
   }
   if (entry.params.count("subGraphID") > 0) {
-    name << "-SUBGRAPH_" << entry.params["subGraphID"];
+    name << " Subgraph " << entry.params["subGraphID"];
   }
   if (entry.params.count("agent") > 0) {
-    name << "-AGENT_" << entry.params["agent"];
+    name << " Agent " << entry.params["agent"];
   }
-  if (entry.params.count("copyID") > 0) {
-    name << "-CPID_" << entry.params["copyID"];
+  if (entry.params.count("kernel_name") > 0 &&
+      entry.params["kernel_name"] != "NA") {
+    name << " Krnl " << entry.params["kernel_name"];
   }
-  if (entry.params.count("size") > 0) {
-    name << "-SIZE_" << entry.params["size"];
+  if (entry.params.count("userHandle") > 0) {
+    name << " 0x" << std::hex << std::stol(entry.params["userHandle"]);
   }
+
   return name.str();
 }
 
-bool NNPIDeviceTracing::addTrace(NNPITraceEntry &entry) {
+int NNPIDeviceTracing::getAffinityID(NNPITraceEntry &entry, std::string name,
+                                     unsigned deviceId,
+                                     TraceContext *traceContext) {
+  // activeAffinities_ is shared by all devices, so guard every access.
+  static std::mutex affinityMutext;
+  std::lock_guard<std::mutex> lk(affinityMutext);
+
+  // Read-only lookup: operator[] would insert empty keys into entry.params,
+  // which addTrace later attaches to the logged trace events.
+  auto param = [&entry](const char *key) -> std::string {
+    auto it = entry.params.find(key);
+    return it == entry.params.end() ? std::string() : it->second;
+  };
+
+  // Row title: device, ICE, optional opcode, op name, optional queue marker.
+  std::stringstream title;
+  title << "Device #" << deviceId << " ICE #" << param("ice_id");
+  const std::string opcode = param("opcode");
+  if (!opcode.empty() && opcode != "NA") {
+    title << " opcode " << opcode;
+  }
+  title << " " << name.substr(0, name.find(' '));
+  if (param("state") == "q") {
+    title << " Queue";
+  }
+  // Reuse a known affinity; else allocate from 10000 up to avoid collisions.
+  const std::string key = title.str();
+  auto known = activeAffinities_.find(key);
+  if (known == activeAffinities_.end()) {
+    const int affinId = 10000 + static_cast<int>(activeAffinities_.size());
+    known = activeAffinities_.emplace(key, affinId).first;
+    traceContext->setThreadName(affinId, key);
+  }
+  return known->second;
+}
+
+bool NNPIDeviceTracing::addTrace(
+    NNPITraceEntry &entry, std::map<std::string, NNPITraceEntry> &inflight,
+    TraceContext *traceContext) {
+  std::stringstream entryLog;
+  for (auto const &paramEntry : entry.params) {
+    entryLog << paramEntry.first << ":" << paramEntry.second << " ,";
+  }
   // Filter traces.
-  switch (entry.traceType) {
-  case NNPI_TRACE_INFER:
-  case NNPI_TRACE_COPY:
-  case NNPI_TRACE_CMDLIST:
-  case NNPI_TRACE_NETEXEC:
-  case NNPI_TRACE_SUBGRAPH:
-  case NNPI_TRACE_RUNTIME_INFER:
-  case NNPI_TRACE_ICED_SCHED_JOB:
-  case NNPI_TARCE_ICED_CREAT_NET:
-  case NNPI_TARCE_ICED_NET_RES:
-  case NNPI_TARCE_ICED_NET_GEN:
-    break;
-  case NNPI_TRACE_UNKNOWN:
-  case NNPI_TRACE_DMA:
-  case NNPI_TRACE_MARK:
-  case NNPI_TRACE_CLOCK_SYNC:
-  case NNPI_TARCE_TIME_SYNC:
-  case NNPI_TARCE_USER_DATA:
-    return false;
-  default:
-    LOG(WARNING) << "Trying to add unsupported trace type:" << entry.traceType;
+  if (entry.params["state"] == "NA") {
     return false;
   }
-
   std::string name = getEntryName(entry);
 
   if (entry.params.count("state") <= 0) {
@@ -170,29 +161,39 @@ bool NNPIDeviceTracing::addTrace(NNPITraceEntry &entry) {
   }
   std::string state = entry.params["state"];
 
-  if (state == "q" || state == "queued") {
+  // Calculate affinity - use the trace thread id to make sections in the
+  // representation.
+  int affinId =
+      NNPIDeviceTracing::getAffinityID(entry, name, deviceId_, traceContext);
+  if (affinId <= 0) {
+    LOG(WARNING) << "Found unexpected affinity ID " << affinId << " for "
+                 << name;
+  }
+  // Add events.
+  if (state == "q") {
     name += "-Queue";
-    glowTraceCtx_->logTraceEvent(name, TraceLevel::OPERATOR,
-                                 TraceEvent::InstantType, entry.hostTime, {});
-  } else if (state == "s" || state == "cbs" || state == "executed") {
-    glowTraceCtx_->logTraceEvent(name, TraceLevel::OPERATOR,
-                                 TraceEvent::BeginType, entry.hostTime, {});
-  } else if (state == "c" || state == "cbc" || state == "completed") {
-    glowTraceCtx_->logTraceEvent(name, TraceLevel::OPERATOR,
-                                 TraceEvent::EndType, entry.hostTime, {});
-  } else if (state == "cbs") {
-    glowTraceCtx_->logTraceEvent(name, TraceLevel::OPERATOR,
-                                 TraceEvent::BeginType, entry.hostTime, {});
-  } else if (state == "cbc") {
-    glowTraceCtx_->logTraceEvent(name, TraceLevel::OPERATOR,
-                                 TraceEvent::EndType, entry.hostTime, {});
-  } else if (state == "cbnwc") {
-    glowTraceCtx_->logTraceEvent(name, TraceLevel::OPERATOR,
-                                 TraceEvent::InstantType, entry.hostTime, {});
-  } else if (state == "req") {
-    name += "-Req";
-    glowTraceCtx_->logTraceEvent(name, TraceLevel::OPERATOR,
-                                 TraceEvent::InstantType, entry.hostTime, {});
+    traceContext->logTraceEvent(name, TraceLevel::OPERATOR,
+                                TraceEvent::InstantType, entry.hostTime,
+                                entry.params, affinId);
+  } else if (state == "s" && inflight.count(name) <= 0) {
+    inflight[name] = entry;
+  } else if (state == "c" && inflight.count(name) > 0) {
+    // Add only complete events.
+    if (entry.hostTime > inflight[name].hostTime) {
+      traceContext->logTraceEvent(
+          name, TraceLevel::OPERATOR, TraceEvent::BeginType,
+          inflight[name].hostTime, inflight[name].params, affinId);
+      traceContext->logTraceEvent(name, TraceLevel::OPERATOR,
+                                  TraceEvent::EndType, entry.hostTime,
+                                  entry.params, affinId);
+    } else {
+      LOG(WARNING) << "Fount incomplete trace event " << name;
+    }
+    inflight.erase(name);
+  } else if (state == "po") {
+    traceContext->logTraceEvent(name, TraceLevel::OPERATOR,
+                                TraceEvent::InstantType, entry.hostTime,
+                                entry.params, affinId);
   }
 
   return true;
@@ -200,25 +201,25 @@ bool NNPIDeviceTracing::addTrace(NNPITraceEntry &entry) {
 
 bool NNPIDeviceTracing::stopAndUpdate(TraceContext *traceContext,
                                       NNPIDeviceContext deviceContext) {
-  if (glowTraceCtx_ !=
-          nullptr && // For null glowTraceCtx assume global context (per device)
-      (glowTraceCtx_ != traceContext)) {
-    // Ignore stop from other contexts.
+  if (traceContext == nullptr) {
+    LOG(WARNING) << "Failed to stop trace capture trace context is null.";
     return false;
   }
+  bool isFirstToStop = NNPIDeviceTracing::isFirstToChangeCaptureStart(false);
   if (!traceCtx_->stopCapture(deviceContext)) {
-    LOG(WARNING) << "Failed to stop trace capture";
+    LOG(WARNING) << "Failed to stop trace capture (first device stop ="
+                 << isFirstToStop;
     return false;
   }
 
   if (!traceCtx_->load()) {
-    LOG(WARNING) << "Failed to stop trace capture";
+    LOG(WARNING) << "Failed to stop trace capture =" << isFirstToStop;
     return false;
   }
   traceContext->setThreadName("NNPI_Trace");
+  std::map<std::string, NNPITraceEntry> inflight;
   for (auto entry : traceCtx_->getEntries()) {
-    std::map<std::string, std::string> params = entry.params;
-    addTrace(entry);
+    addTrace(entry, inflight, traceContext);
   }
   started_.clear();
   return true;
diff --git a/lib/Backends/NNPI/NNPITracing.h b/lib/Backends/NNPI/NNPITracing.h
index 7a1872b05a..2d118680a1 100644
--- a/lib/Backends/NNPI/NNPITracing.h
+++ b/lib/Backends/NNPI/NNPITracing.h
@@ -44,25 +44,42 @@ class NNPIDeviceTracing {
     return map[deviceId];
   }
 
+  /// \returns true only for the first caller that flips the shared capture
+  /// state to \p startCapture; false if the state already has that value.
+  static bool isFirstToChangeCaptureStart(bool startCapture) {
+    static std::mutex stateMutex;
+    static bool captureActive = false;
+    std::lock_guard<std::mutex> guard(stateMutex);
+    if (captureActive == startCapture) {
+      return false;
+    }
+    captureActive = startCapture;
+    return true;
+  }
+
   /// Dispose of tracing context.
   virtual ~NNPIDeviceTracing(){};
 
   /// Start recording events.
-  bool start(TraceContext *traceContext, NNPIDeviceContext deviceContext);
+  bool start(TraceContext *traceContext, NNPIDeviceContext deviceContext,
+             bool swTraces, bool hwTraces);
   /// Stop recording, read and update trace context.
   bool stopAndUpdate(TraceContext *traceContext,
                      NNPIDeviceContext deviceContext);
 
 protected:
   std::string getEntryName(NNPITraceEntry &entry);
-  bool addTrace(NNPITraceEntry &entry);
+  bool addTrace(NNPITraceEntry &entry,
+                std::map<std::string, NNPITraceEntry> &inflight,
+                TraceContext *traceContext);
+
+  /// Affinity has to be in a global for all devices.
+  static int getAffinityID(NNPITraceEntry &entry, std::string name,
+                           unsigned deviceId, TraceContext *traceContext);
 
 private:
   /// Per device tracing control.
   explicit NNPIDeviceTracing(unsigned deviceId);
-  /// Glow trace context. Used to identify start/stop and log traces (with
-  /// runId_).
-  TraceContext *glowTraceCtx_{nullptr};
   std::atomic_flag started_{false};
   /// NNPI Trace context.
   std::unique_ptr<NNPITraceContext> traceCtx_;
@@ -70,6 +87,9 @@ class NNPIDeviceTracing {
   unsigned deviceId_{0};
   /// Device id string prefix for event names.
   std::string deviceInfo_;
+
+  /// Trace active affinities.
+  static std::map<std::string, int> activeAffinities_;
 };
 
 } // namespace glow
diff --git a/lib/Backends/NNPI/ONNX/NNPIONNXModelWriter.cpp b/lib/Backends/NNPI/ONNX/NNPIONNXModelWriter.cpp
index 0694abb7ea..f135c0ed4e 100644
--- a/lib/Backends/NNPI/ONNX/NNPIONNXModelWriter.cpp
+++ b/lib/Backends/NNPI/ONNX/NNPIONNXModelWriter.cpp
@@ -18,3 +18,8 @@ Error ONNXModelWriter::writeNNPICustomDSP(glow::NNPICustomDSPNode const *,
                                           GraphType &graph) {
   return MAKE_ERR("Unsupported Op for ONNX");
 }
+
+Error ONNXModelWriter::writeNNPICustomIA(glow::NNPICustomIANode const *,
+                                         GraphType &graph) {
+  return MAKE_ERR("Unsupported Op for ONNX");
+}
diff --git a/lib/Backends/NNPI/tests/NNPIGradCheckTest.cpp b/lib/Backends/NNPI/tests/NNPIGradCheckTest.cpp
index b2f31b0aee..8ec7425f29 100644
--- a/lib/Backends/NNPI/tests/NNPIGradCheckTest.cpp
+++ b/lib/Backends/NNPI/tests/NNPIGradCheckTest.cpp
@@ -44,6 +44,7 @@ struct BlacklistInitializer {
             {"gradientCheckBatchedPairwiseDotProduct/0",
              TestBlacklist::AnyDeviceAnyEngine},
             {"gradientCheckFC2/0", TestBlacklist::AnyDeviceAnyEngine},
+            {"gradientCheckBatchMatMul/0", TestBlacklist::AnyDeviceHWEngine},
         };
     TestBlacklist::prepareBlacklist(testBlacklistedSetups,
                                     backendTestBlacklist);
diff --git a/lib/Backends/NNPI/tests/NNPIOperatorTest.cpp b/lib/Backends/NNPI/tests/NNPIOperatorTest.cpp
index af6591bf55..a31c6c0e87 100644
--- a/lib/Backends/NNPI/tests/NNPIOperatorTest.cpp
+++ b/lib/Backends/NNPI/tests/NNPIOperatorTest.cpp
@@ -103,6 +103,8 @@ struct BlacklistInitializer {
             {"mul_int64/0", TestBlacklist::AnyDeviceHWEngine},
             {"NonCubicKernelConv3DQuantized/0",
              TestBlacklist::AnyDeviceAnyEngine},
+            {"NonCubicPaddingConv3D/0", TestBlacklist::AnyDeviceAnyEngine},
+            {"GroupConv3D/0", TestBlacklist::AnyDeviceHWEngine},
             {"NonSquarePaddingAveragePool/0",
              TestBlacklist::AnyDeviceAnyEngine},
             {"NonSquarePaddingMaxPool/0", TestBlacklist::AnyDeviceAnyEngine},
@@ -135,6 +137,7 @@ struct BlacklistInitializer {
              TestBlacklist::AnyDeviceAnyEngine},
             {"rowwiseQuantizedFCTest_Int8_BiasInt8/0",
              TestBlacklist::AnyDeviceAnyEngine},
+            {"rowwiseQuantizedFCTestSymmetric/0", TestBlacklist::A0AnyEngine},
             {"ScatterAddNDimensionalDuplicatingIndices/0",
              TestBlacklist::AnyDeviceAnyEngine},
             {"ScatterAddNDimensionalSimple/0",
@@ -166,11 +169,6 @@ struct BlacklistInitializer {
              TestBlacklist::AnyDeviceAnyEngine},
             {"EmbeddingBag4BitRowwiseOffsets_Float16_AccumFloat/0",
              TestBlacklist::AnyDeviceAnyEngine},
-            {"EmbeddingBag4BitRowwiseOffsets_Float16_HasEndOffset/0",
-             TestBlacklist::AnyDeviceAnyEngine},
-            {"EmbeddingBag4BitRowwiseOffsets_Float16_HasEndOffset_AccumFloat/0",
-             TestBlacklist::AnyDeviceAnyEngine},
-
             {"SparseToDense_Float/0", TestBlacklist::AnyDeviceAnyEngine},
             {"SparseToDense_Int64/0", TestBlacklist::AnyDeviceAnyEngine},
             {"SparseToDenseMask1/0", TestBlacklist::AnyDeviceAnyEngine},
@@ -231,9 +229,6 @@ struct BlacklistInitializer {
             {"FusedRowwiseQuantizedSparseLengthsWeightedSum_ConvertedFloat16_"
              "NoFusedConvert_FP32Accum/0",
              TestBlacklist::AnyDeviceHWEngine},
-            {"FusedRowwiseQuantizedSparseLengthsSum_Fused4Bit_Float16_"
-             "AccumFloat16/0",
-             TestBlacklist::AnyDeviceHWEngine},
             {"to_back2/0", TestBlacklist::AnyDeviceHWEngine},
             {"GroupDilatedConvolution/0", TestBlacklist::AnyDeviceHWEngine},
             {"less_int32Cases/0", TestBlacklist::AnyDeviceHWEngine},
diff --git a/lib/Backends/NNPI/tests/TestBlacklist.h b/lib/Backends/NNPI/tests/TestBlacklist.h
index 519b967e85..dd0934eff7 100644
--- a/lib/Backends/NNPI/tests/TestBlacklist.h
+++ b/lib/Backends/NNPI/tests/TestBlacklist.h
@@ -39,6 +39,7 @@ const uint32_t AnyDeviceHWEngine =
     AnyDeviceAnyEngine ^ NNPI_EXECUTION_ENGINE_SW;
 const uint32_t AnyDeviceSWEngine =
     AnyDeviceAnyEngine ^ NNPI_EXECUTION_ENGINE_HW;
+const uint32_t A0AnyEngine = NNPI_DEVICE_VERSION_1 | NNPI_EXECUTION_ENGINE_ANY;
 
 static uint32_t getCurrentDeviceVersion() {
   static const std::map<std::string, NNPI_DEVICE_VERSION> devices = {