Force a sync on non-CPU tensors for the benchmark to reflect the timing accurately.

ghstack-source-id: 758cd360497878e8d4415b208b28dd966bf141f5
Pull Request resolved: #47714
pytorch · Dec 2, 2020 · 58e4319 · 58e4319
1 parent fa4527a
commit 58e4319
Showing 1 changed file with 2 additions and 2 deletions.
diff --git a/binaries/speed_benchmark_torch.cc b/binaries/speed_benchmark_torch.cc
@@ -217,7 +217,7 @@ int main(int argc, char** argv) {
       FLAGS_warmup,
       ".");
   for (int i = 0; i < FLAGS_warmup; ++i) {
-    module.forward(inputs);
+    module.forward(inputs).toTensor().cpu();
   }
 
   std::cout << "Main runs." << std::endl;
@@ -231,7 +231,7 @@ int main(int argc, char** argv) {
   auto micros = timer.MicroSeconds();
   for (int i = 0; i < FLAGS_iter; ++i) {
     auto start = high_resolution_clock::now();
-    module.forward(inputs);
+    module.forward(inputs).toTensor().cpu();
     auto stop = high_resolution_clock::now();
     auto duration = duration_cast<microseconds>(stop - start);
     times.push_back(duration.count());