Run pytorch mobile benchmark in PEP (#28437)
Summary:
Pull Request resolved: #28437

Add a target to build speed_benchmark_torch for PEP.
Add a new argument `--report_pep` that prints total-runtime information for PEP; per-op stats can be added under it later.
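
For reference, a hypothetical invocation might look like the following; the binary path, model file, input shape, and sample value are illustrative, the flags are the ones defined in speed_benchmark_torch.cc, and with `--report_pep=true` each iteration emits one PyTorchObserver line:

    ./speed_benchmark_torch --model=mobilenet.pt \
        --input_dims="1,3,224,224" --input_type=float \
        --warmup=2 --iter=10 --report_pep=true
    ...
    PyTorchObserver {"type": "NET", "unit": "us", "metric": "latency", "value": "8532"}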

Test Plan: https://our.intern.facebook.com/intern/aibench/details/664440309179004

Reviewed By: hl475

Differential Revision: D18062059

fbshipit-source-id: ca80e980ce8e48604782a15ac44dd8d403832817
supriyar authored and facebook-github-bot committed Oct 22, 2019
1 parent 5f15632 commit 2cc0f1b
Showing 1 changed file with 33 additions and 3 deletions.

binaries/speed_benchmark_torch.cc
@@ -24,6 +24,9 @@
 #include "torch/csrc/jit/import.h"
 #include "torch/script.h"
 
+#include <chrono>
+using namespace std::chrono;
+
 C10_DEFINE_string(model, "", "The given torch script model to benchmark.");
 C10_DEFINE_string(
     input_dims,
@@ -40,6 +43,23 @@ C10_DEFINE_bool(
     "Whether to print output with all one input tensor.");
 C10_DEFINE_int(warmup, 0, "The number of iterations to warm up.");
 C10_DEFINE_int(iter, 10, "The number of iterations to run.");
+C10_DEFINE_bool(
+    report_pep,
+    false,
+    "Whether to print performance stats for AI-PEP.");
+
+std::vector<std::string>
+split(char separator, const std::string& string, bool ignore_empty = true) {
+  std::vector<std::string> pieces;
+  std::stringstream ss(string);
+  std::string item;
+  while (getline(ss, item, separator)) {
+    if (!ignore_empty || !item.empty()) {
+      pieces.push_back(std::move(item));
+    }
+  }
+  return pieces;
+}
 
 int main(int argc, char** argv) {
   c10::SetUsageMessage(
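
As a side note on the split helper added above, here is a minimal standalone sketch of how it drives input parsing; the helper body is copied from this diff, while main and the sample strings are illustrative:

#include <iostream>
#include <sstream>
#include <string>
#include <vector>

// Copied from the diff: splits `string` on `separator`, dropping empty
// pieces unless ignore_empty is false.
std::vector<std::string>
split(char separator, const std::string& string, bool ignore_empty = true) {
  std::vector<std::string> pieces;
  std::stringstream ss(string);
  std::string item;
  while (getline(ss, item, separator)) {
    if (!ignore_empty || !item.empty()) {
      pieces.push_back(std::move(item));
    }
  }
  return pieces;
}

int main() {
  // ';' separates one input spec from the next; ',' separates the dims
  // within a spec, mirroring the FLAGS_input_dims handling below.
  for (const auto& spec : split(';', "1,3,224,224;1,3,300,300")) {
    for (const auto& dim : split(',', spec)) {
      std::cout << dim << " ";
    }
    std::cout << "\n";  // prints "1 3 224 224 " then "1 3 300 300 "
  }
  return 0;
}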
@@ -59,16 +79,16 @@ int main(int argc, char** argv) {
   CAFFE_ENFORCE_GE(FLAGS_input_dims.size(), 0, "Input dims must be specified.");
   CAFFE_ENFORCE_GE(FLAGS_input_type.size(), 0, "Input type must be specified.");
 
-  std::vector<std::string> input_dims_list = caffe2::split(';', FLAGS_input_dims);
-  std::vector<std::string> input_type_list = caffe2::split(';', FLAGS_input_type);
+  std::vector<std::string> input_dims_list = split(';', FLAGS_input_dims);
+  std::vector<std::string> input_type_list = split(';', FLAGS_input_type);
   CAFFE_ENFORCE_EQ(
       input_dims_list.size(),
       input_type_list.size(),
       "Input dims and type should have the same number of items.");
 
   std::vector<c10::IValue> inputs;
   for (size_t i = 0; i < input_dims_list.size(); ++i) {
-    auto input_dims_str = caffe2::split(',', input_dims_list[i]);
+    auto input_dims_str = split(',', input_dims_list[i]);
     std::vector<int64_t> input_dims;
     for (const auto& s : input_dims_str) {
       input_dims.push_back(c10::stoi(s));
@@ -112,11 +132,21 @@
       FLAGS_iter,
       ".");
   caffe2::Timer timer;
+  std::vector<float> times;
   auto millis = timer.MilliSeconds();
   for (int i = 0; i < FLAGS_iter; ++i) {
+    auto start = high_resolution_clock::now();
     module.forward(inputs);
+    auto stop = high_resolution_clock::now();
+    auto duration = duration_cast<microseconds>(stop - start);
+    times.push_back(duration.count());
   }
   millis = timer.MilliSeconds();
+  if (FLAGS_report_pep) {
+    for (auto t : times) {
+      std::cout << "PyTorchObserver {\"type\": \"NET\", \"unit\": \"us\", \"metric\": \"latency\", \"value\": \"" << t << "\"}" << std::endl;
+    }
+  }
   std::cout << "Main run finished. Milliseconds per iter: "
             << millis / FLAGS_iter
             << ". Iters per second: " << 1000.0 * FLAGS_iter / millis
