37 changes: 37 additions & 0 deletions caffe2/operators/reduce_ops.cc
@@ -202,6 +202,43 @@ print("Y:", workspace.FetchBlob("Y"))
</details>

)DOC")
.TensorInferenceFunction([](const OperatorDef& def,
const std::vector<TensorShape>& in) {
if (in.size() != 1) {
return std::vector<TensorShape>{
CreateTensorShape({}, TensorProto_DataType_UNDEFINED)};
}

const auto& dims = in.front().dims();
ArgumentHelper helper(def);
std::vector<TensorShape> out;
out.emplace_back();
auto& ts = out.back();
auto axis = helper.GetRepeatedArgument<int32_t>("axes");
std::sort(axis.begin(), axis.end());
auto keepdims = helper.GetSingleArgument<bool>("keepdims", true);
size_t cursor = 0;
size_t id = 0;
for (const auto d : dims) {
if (cursor < axis.size() && id == axis[cursor]) {
if (keepdims) {
ts.add_dims(d == 0 ? 0 : 1);
}
++cursor;
} else {
ts.add_dims(d);
}
++id;
}
if (ts.dims_size() == 0 && dims.size() != 0) {
ts.add_dims(1);
}
if (cursor != axis.size()) {
ts.set_unknown_shape(true);
}
ts.set_data_type(in.front().data_type());
return out;
})
.Arg("axes", "(*Tuple(int)*): list of axes to reduce")
.Arg(
"keepdims",
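The new `TensorInferenceFunction` encodes the reduction shape rule: every axis listed in `axes` either collapses to 1 (when `keepdims` is true) or is dropped, and a tensor reduced over all of its axes without `keepdims` still reports a single dimension of 1. Below is a minimal standalone sketch of that rule in plain C++; it is not the Caffe2 registration itself, and the helper name `reducedShape` is invented for illustration.

```cpp
#include <algorithm>
#include <cstdint>
#include <iostream>
#include <vector>

// Illustrative re-implementation of the reduction shape rule registered above.
std::vector<int64_t> reducedShape(
    std::vector<int64_t> dims,
    std::vector<int32_t> axes,
    bool keepdims) {
  std::sort(axes.begin(), axes.end());
  std::vector<int64_t> out;
  size_t cursor = 0;
  for (size_t id = 0; id < dims.size(); ++id) {
    if (cursor < axes.size() && id == static_cast<size_t>(axes[cursor])) {
      if (keepdims) {
        out.push_back(dims[id] == 0 ? 0 : 1); // reduced axis collapses to 1
      }
      ++cursor; // reduced axis is dropped when keepdims is false
    } else {
      out.push_back(dims[id]); // untouched axis is carried over
    }
  }
  if (out.empty() && !dims.empty()) {
    out.push_back(1); // reducing every axis without keepdims still yields {1}
  }
  return out;
}

int main() {
  for (int64_t d : reducedShape({4, 5, 6}, {0, 2}, /*keepdims=*/true)) {
    std::cout << d << ' '; // prints: 1 5 1
  }
  std::cout << '\n';
}
```

The registered function additionally marks the output shape as unknown when a requested axis is never matched (the `cursor != axis.size()` check), which this sketch omits.
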
8 changes: 4 additions & 4 deletions caffe2/opt/backend_transformer_base.cc
@@ -43,9 +43,9 @@ std::string BackendTransformerBase::getModelId(const NetDef& net) {
return model_id;
}

TensorProto BackendTransformerBase::wrapShapeInfoIntoTensorProto(
TensorProto wrapShapeInfoIntoTensorProto(
const std::string& name,
const ShapeInfo& shape_info) const {
const ShapeInfo& shape_info) {
TensorProto t;
t.set_name(name);
t.set_data_type(shape_info.shape.data_type());
@@ -58,9 +58,9 @@ TensorProto BackendTransformerBase::wrapShapeInfoIntoTensorProto(
return t;
}

QTensorProto BackendTransformerBase::wrapShapeInfoIntoQTensorProto(
QTensorProto wrapShapeInfoIntoQTensorProto(
const std::string& name,
const ShapeInfo& shape_info) const {
const ShapeInfo& shape_info) {
QTensorProto t;
CAFFE_ENFORCE(
shape_info.is_quantized == true,
20 changes: 10 additions & 10 deletions caffe2/opt/backend_transformer_base.h
@@ -29,6 +29,16 @@ struct BackendTransformOptions {
BoundShapeSpec bound_shape_spec;
};

// Wrap TensorShape into TensorProto
TensorProto wrapShapeInfoIntoTensorProto(
const std::string& name,
const ShapeInfo& shape_info);

// Wrap Quantized TensorShape into QTensorProto
QTensorProto wrapShapeInfoIntoQTensorProto(
const std::string& name,
const ShapeInfo& shape_info);

// This class contains some common functions for backend lowering and graph
// cutting
class BackendTransformerBase {
@@ -73,16 +83,6 @@ class BackendTransformerBase {
NetDef* pred_net,
const ShapeInfoMap& input_shape_hints);

// Wrap TensorShape into TensorProto
TensorProto wrapShapeInfoIntoTensorProto(
const std::string& name,
const ShapeInfo& shape_info) const;

// Wrap Quantized TensorShape into QTensorProto
QTensorProto wrapShapeInfoIntoQTensorProto(
const std::string& name,
const ShapeInfo& shape_info) const;

// Do bound shape inference and collect shape infos
ShapeInfoMap inferShapes(
Workspace* ws,
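Turning the two wrappers into free functions lets code outside `BackendTransformerBase` attach shape hints to a `NetDef` directly, which is exactly how `buildLoopTestNet` uses them further down in this PR. A hedged usage sketch follows; the helper name `attachInputShapeHints` is invented, and the calls simply mirror the pattern used in the transformer code.

```cpp
#include "caffe2/opt/backend_transformer_base.h" // declares the free wrappers after this change

namespace caffe2 {

// Illustrative helper: record a shape hint for every external input of a net
// as "input_shape_info"/"input_qshape_info" arguments.
void attachInputShapeHints(NetDef* net, const ShapeInfoMap& shape_hints) {
  auto* shape_arg = net->add_arg();
  auto* qshape_arg = net->add_arg();
  shape_arg->set_name("input_shape_info");
  qshape_arg->set_name("input_qshape_info");
  for (const auto& i : net->external_input()) {
    const auto& info = shape_hints.at(i);
    if (!info.is_quantized) {
      shape_arg->mutable_tensors()->Add()->CopyFrom(
          wrapShapeInfoIntoTensorProto(i, info));
    } else {
      qshape_arg->mutable_qtensors()->Add()->CopyFrom(
          wrapShapeInfoIntoQTensorProto(i, info));
    }
  }
}

} // namespace caffe2
```
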
4 changes: 3 additions & 1 deletion caffe2/opt/bound_shape_inferencer.cc
@@ -23,7 +23,9 @@ std::vector<TensorBoundShape::DimType> setDimTypeWithFirst(
uint32_t n) {
std::vector<TensorBoundShape::DimType> dimTypes(
n, TensorBoundShape_DimType_CONSTANT);
dimTypes[0] = firstDimType;
if (dimTypes.size() > 0) {
dimTypes[0] = firstDimType;
}
return dimTypes;
}

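The guard covers the degenerate call with `n == 0`: the vector is then empty, so the old unconditional write to `dimTypes[0]` would be out of bounds. A standalone sketch of the same pattern, using a placeholder `DimType` enum purely for illustration:

```cpp
#include <cstdint>
#include <vector>

enum class DimType { CONSTANT, BATCH };

std::vector<DimType> setDimTypeWithFirst(DimType first, uint32_t n) {
  std::vector<DimType> dimTypes(n, DimType::CONSTANT);
  if (!dimTypes.empty()) { // the added guard; without it, n == 0 writes past the end
    dimTypes[0] = first;
  }
  return dimTypes;
}
```
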
6 changes: 6 additions & 0 deletions caffe2/opt/custom/glow_net_transform.cc
@@ -12,6 +12,11 @@ C10_DEFINE_bool(
true,
"Attach AdjustBatch ops at input/outputs of the Onnxifi ops");

C10_DEFINE_bool(
onnxifi_loop_test_mode,
false,
"For test purpose only. Build a dummy net just to test the functionality");

C10_DEFINE_bool(
merge_fp32_inputs_into_fp16,
false,
@@ -121,6 +126,7 @@ void onnxifi(
opts.min_ops = FLAGS_onnxifi_min_ops;
opts.load_model_by_blob = load_model_by_blob;
opts.merge_fp32_inputs_into_fp16 = FLAGS_merge_fp32_inputs_into_fp16;
opts.loop_test = FLAGS_onnxifi_loop_test_mode;

auto more_shape_hints = shape_hints;
if (!FLAGS_onnxifi_shape_hints.empty()) {
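The new flag is consumed a few lines below, where it is copied into `opts.loop_test` and handed to the ONNXIFI transformer. A hedged sketch of toggling it from test code, assuming the usual c10 flag machinery; the wrapper function here is hypothetical:

```cpp
#include <c10/util/Flags.h>

C10_DECLARE_bool(onnxifi_loop_test_mode);

// Hypothetical test harness hook: enable loop-test mode so that onnxifi()
// builds the dummy net instead of lowering the real one.
void enableOnnxifiLoopTest() {
  FLAGS_onnxifi_loop_test_mode = true;
}
```
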
243 changes: 242 additions & 1 deletion caffe2/opt/onnxifi_transformer.cc
@@ -311,6 +311,235 @@ void mergeFp32InputsAndConvertToFp16(
}
}

NetDef buildLoopTestNet(
const NetDef& net,
const std::unordered_set<std::string>& initialization_list,
std::unordered_map<std::string, ShapeInfo>* shape_hints,
size_t batch_size) {
NetDef net_dummy;

// Add non-weight inputs only
for (const auto& i : net.external_input()) {
if (!initialization_list.count(i)) {
net_dummy.add_external_input(i);
}
}
for (const auto& o : net.external_output()) {
net_dummy.add_external_output(o);
}

// Now categorize the inputs into the following groups. We don't support
// handling of 3d inputs yet, but it can be done easily by converting n-d
// inputs into 2-d with Reshape or ReduceSum
std::unordered_set<std::string> batched_2d_inputs;
std::unordered_set<std::string> other_2d_inputs;
std::unordered_set<std::string> all_1d_inputs;
auto addCast = [&net_dummy](
const std::string& i,
std::string& in,
caffe2::TensorProto::DataType dtype) mutable {
int multiplier = 1;
if (dtype != caffe2::TensorProto::FLOAT) {
in += "_fp32";
net_dummy.add_op()->CopyFrom(CreateOperatorDef(
"Clip",
"",
{i},
{in},
{MakeArgument<float>("min", 0.0), MakeArgument<float>("max", 1.0)}));
if (dtype == caffe2::TensorProto::INT8 ||
dtype == caffe2::TensorProto::UINT8) {
multiplier = sizeof(float) / sizeof(int8_t);
} else if (
dtype == caffe2::TensorProto::INT16 ||
dtype == caffe2::TensorProto::UINT16 ||
dtype == caffe2::TensorProto::FLOAT16) {
multiplier = sizeof(float) / sizeof(int16_t);
} else if (dtype == caffe2::TensorProto::INT64) {
// Special case, it should really be 0.5
multiplier = 0;
}
}
return multiplier;
};
auto adjustDim = [](int d, int m, TensorShape& shape) {
if (m > 1) {
CAFFE_ENFORCE_EQ(shape.dims(d) % m, 0);
shape.set_dims(d, shape.dims(d) / m);
} else if (m == 0) {
shape.set_dims(d, shape.dims(d) * 2);
}
shape.set_data_type(caffe2::TensorProto::FLOAT);
};
size_t dim2 = 0;
for (const auto& i : net_dummy.external_input()) {
auto it = shape_hints->find(i);
CAFFE_ENFORCE(
it != shape_hints->end(), "Cannot find shape info for input ", i);
auto& shape = it->second.shape;
std::string in = i;
// Trick here: since backends like Glow don't support non-float
// arithmetic, we need to be creative and bitcast non-float data types into
// float while maintaining the same bit length. We do this by changing the
// shape dims so that we always load the same number of bits onto the
// backend. To avoid numeric complications, we add a Clip.
if (shape.dims_size() == 2) {
auto m = addCast(i, in, shape.data_type());
adjustDim(1, m, shape);
if (shape.dims(0) == batch_size) {
batched_2d_inputs.emplace(in);
dim2 += shape.dims(1);
} else {
other_2d_inputs.emplace(in);
}
} else if (shape.dims_size() == 1) {
auto m = addCast(i, in, shape.data_type());
adjustDim(0, m, shape);
all_1d_inputs.emplace(in);
} else {
const std::string fin = i + "_flatten";
net_dummy.add_op()->CopyFrom(
CreateOperatorDef("Flatten", "", {i}, {fin}, {}));
in = fin;
auto m = addCast(fin, in, shape.data_type());
auto last = shape.dims_size() - 1;
adjustDim(last, m, shape);
size_t ndim = 1;
for (unsigned k = 1; k < shape.dims_size(); ++k) {
ndim *= shape.dims(k);
}
if (shape.dims(0) == batch_size) {
batched_2d_inputs.emplace(in);
dim2 += ndim;
} else {
other_2d_inputs.emplace(in);
}
}
}

// Add adjusted shape hints
auto* shape_arg = net_dummy.add_arg();
auto* qshape_arg = net_dummy.add_arg();
shape_arg->set_name("input_shape_info");
qshape_arg->set_name("input_qshape_info");
for (const auto& i : net_dummy.external_input()) {
auto info = shape_hints->at(i);
if (!info.is_quantized) {
shape_arg->mutable_tensors()->Add()->CopyFrom(
wrapShapeInfoIntoTensorProto(i, info));
} else {
qshape_arg->mutable_qtensors()->Add()->CopyFrom(
wrapShapeInfoIntoQTensorProto(i, info));
}
}

// Collect all the inputs together into a 2d tensor of {batch_size, X}
std::vector<std::string> concat2d_batched(
batched_2d_inputs.begin(), batched_2d_inputs.end());
const std::string concat_out = "batch_2d_concat";
net_dummy.add_op()->CopyFrom(CreateOperatorDef(
"Concat",
"",
concat2d_batched,
{concat_out, "batch_2d_concat_split_info"},
{MakeArgument<int>("axis", 1)}));
std::vector<std::string> scalars;
for (const auto& i : other_2d_inputs) {
std::string o = i + "_reduced";
net_dummy.add_op()->CopyFrom(CreateOperatorDef(
"ReduceSum",
"",
{i},
{o},
{MakeArgument<std::vector<int>>("axes", {0, 1}),
MakeArgument<int>("keepdims", 0)}));
scalars.emplace_back(std::move(o));
}
for (const auto& i : all_1d_inputs) {
std::string o = i + "_reduced";
net_dummy.add_op()->CopyFrom(CreateOperatorDef(
"ReduceSum",
"",
{i},
{o},
{MakeArgument<std::vector<int>>("axes", {0}),
MakeArgument<int>("keepdims", 0)}));
scalars.emplace_back(std::move(o));
}
const std::string summed = "summed";
net_dummy.add_op()->CopyFrom(
CreateOperatorDef("Sum", "", scalars, {summed}, {}));
const std::string out = "result_out";
net_dummy.add_op()->CopyFrom(CreateOperatorDef(
"Add",
"",
{concat_out, summed},
{out},
{MakeArgument<int>("broadcast", 1)}));

for (const auto& o : net_dummy.external_output()) {
const auto it = shape_hints->find(o);
CAFFE_ENFORCE(
it != shape_hints->end(), "Cannot find shape info for output ", o);
const auto& shape = it->second.shape;
// TODO: all doable but I'm lazy
if (shape.data_type() != caffe2::TensorProto::FLOAT) {
CAFFE_THROW("We need a Cast op to match the output data type");
}
if (shape.dims_size() == 2) {
if (shape.dims(0) == batch_size) {
if (shape.dims(1) > dim2) {
CAFFE_THROW(
"We need Tile op to match the output dim ",
shape.dims(1),
" vs ",
dim2);
} else if (shape.dims(1) == dim2) {
net_dummy.add_op()->CopyFrom(
CreateOperatorDef("Copy", "", {out}, {o}, {}));
} else {
net_dummy.add_op()->CopyFrom(CreateOperatorDef(
"Slice",
"",
{out},
{o},
{MakeArgument<std::vector<int>>("starts", {0, 0}),
MakeArgument<std::vector<int>>(
"ends", {-1, static_cast<int>(shape.dims(1))})}));
}
}
} else if (shape.dims_size() == 1) {
if (shape.dims(0) == batch_size) {
const std::string oi = o + "_pre";
net_dummy.add_op()->CopyFrom(CreateOperatorDef(
"Slice",
"",
{out},
{oi},
{MakeArgument<std::vector<int>>("starts", {0, 0}),
MakeArgument<std::vector<int>>("ends", {-1, 1})}));
net_dummy.add_op()->CopyFrom(CreateOperatorDef(
"Reshape",
"",
{oi},
{o},
{MakeArgument<std::vector<int>>(
"shape", {static_cast<int>(batch_size)})}));
} else {
CAFFE_THROW(
"We need Slice and Tile op to match the output dim ",
shape.dims(0),
" vs ",
batch_size);
}
} else {
CAFFE_THROW("Only support 1D/2D outputs for now");
}
}

return net_dummy;
}

} // namespace

OnnxifiTransformer::OnnxifiTransformer(const OnnxifiTransformerOptions& opts)
@@ -506,6 +735,18 @@ NetDef OnnxifiTransformer::SubnetToOnnxifiOpViaC2(
output_shape_hints.emplace(o, shape);
}

// Rewrite the net into a dummy net in loop test mode
ShapeInfoMap new_shape_hints;
if (opts_.loop_test) {
new_shape_hints = shape_hints;
onnxifi_net = buildLoopTestNet(
onnxifi_net,
initialization_list,
&new_shape_hints,
opts_.bound_shape_spec.max_batch_size);
initialization_list.clear();
}

// Build ONNXIFI Op
std::vector<std::string> onnxifi_net_inputs(
onnxifi_net.external_input().begin(), onnxifi_net.external_input().end());
@@ -520,7 +761,7 @@ NetDef OnnxifiTransformer::SubnetToOnnxifiOpViaC2(
initialization_list,
onnxifi_net_inputs,
onnxifi_net_outputs,
shape_hints);
opts_.loop_test ? new_shape_hints : shape_hints);
NetDef net_opt = composeResultNet(onnxifi_op);

// Debugging stuff
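In loop-test mode the transformer swaps the real subnet for the dummy net built above: batched 2-D inputs are concatenated along axis 1, everything else is reduced to a scalar and summed, and the sum is broadcast-added onto the concatenation before being sliced or reshaped back to the declared output shapes. The piece that benefits most from a worked example is the "bitcast" bookkeeping in `addCast`/`adjustDim`: non-float inputs are clipped and fed to the backend as floats while keeping the total bit count constant, so the innermost dimension is rescaled by the width ratio between float and the original element type, with int64 doubling the dimension. A standalone sketch of that arithmetic in plain C++ (function name invented for illustration):

```cpp
#include <cassert>
#include <cstdint>

// Rescale the innermost dimension so that `dim` elements of a `width_in_bytes`
// type occupy the same number of bits as the returned count of 4-byte floats.
int64_t adjustedLastDim(int64_t dim, int width_in_bytes) {
  if (width_in_bytes == 1) {   // int8/uint8: 4 elements per float
    assert(dim % 4 == 0);
    return dim / 4;
  }
  if (width_in_bytes == 2) {   // int16/uint16/fp16: 2 elements per float
    assert(dim % 2 == 0);
    return dim / 2;
  }
  if (width_in_bytes == 8) {   // int64: each element spans two floats
    return dim * 2;
  }
  return dim;                  // already 4 bytes wide (float/int32)
}
```
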