Force a sync on non-CPU tensors for the benchmark to reflect the timing accurately.

ghstack-source-id: 5c8e310984bf160719379dce88cdc697bed82241
Pull Request resolved: #47714
pytorch · Dec 4, 2020 · bac0c75 · bac0c75
1 parent cb28508
commit bac0c75
Showing 1 changed file with 10 additions and 2 deletions.
diff --git a/binaries/speed_benchmark_torch.cc b/binaries/speed_benchmark_torch.cc
@@ -217,7 +217,11 @@ int main(int argc, char** argv) {
       FLAGS_warmup,
       ".");
   for (int i = 0; i < FLAGS_warmup; ++i) {
-    module.forward(inputs);
+    if (FLAGS_vulkan) {
+      module.forward(inputs).toTensor().cpu();
+    } else {
+      module.forward(inputs);
+    }
   }
 
   std::cout << "Main runs." << std::endl;
@@ -231,7 +235,11 @@ int main(int argc, char** argv) {
   auto micros = timer.MicroSeconds();
   for (int i = 0; i < FLAGS_iter; ++i) {
     auto start = high_resolution_clock::now();
-    module.forward(inputs);
+    if (FLAGS_vulkan) {
+      module.forward(inputs).toTensor().cpu();
+    } else {
+      module.forward(inputs);
+    }
     auto stop = high_resolution_clock::now();
     auto duration = duration_cast<microseconds>(stop - start);
     times.push_back(duration.count());