From 022c929145e96ff6992dd51ce1f58e9b541773be Mon Sep 17 00:00:00 2001
From: Taylor Robie <taylorrobie@fb.com>
Date: Wed, 2 Dec 2020 11:07:56 -0800
Subject: [PATCH] Revert "Revert D25199264: Enable callgrind collection for C++
 snippets" (#48720)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/48720

This reverts commit 6646ff122d3215b77909f669fc26cf6a927030db.

Test Plan: Imported from OSS

Reviewed By: malfet

Differential Revision: D25273994

Pulled By: malfet

fbshipit-source-id: 61743176dc650136622e1b8f2384bbfbd7a46294
---
 test/benchmark_utils/test_benchmark_utils.py  | 29 ++++++++++
 torch/utils/benchmark/utils/cpp_jit.py        | 15 ++++-
 torch/utils/benchmark/utils/timer.py          |  8 +--
 .../timer_callgrind_template.cpp              | 47 ++++++++++++++++
 .../utils/valgrind_wrapper/timer_interface.py | 56 ++++++++++++-------
 torch/utils/cpp_extension.py                  |  3 +
 6 files changed, 133 insertions(+), 25 deletions(-)
 create mode 100644 torch/utils/benchmark/utils/valgrind_wrapper/timer_callgrind_template.cpp

diff --git a/test/benchmark_utils/test_benchmark_utils.py b/test/benchmark_utils/test_benchmark_utils.py
index 779a704bfd92..e15a0fec338b 100644
--- a/test/benchmark_utils/test_benchmark_utils.py
+++ b/test/benchmark_utils/test_benchmark_utils.py
@@ -2,6 +2,7 @@
 import os
 import re
 import textwrap
+import timeit
 from typing import Any, List, Tuple
 import unittest
 
@@ -168,6 +169,7 @@ def test_cpp_timer(self):
         timer = benchmark_utils.Timer(
             "torch::Tensor y = x + 1;",
             setup="torch::Tensor x = torch::empty({1});",
+            timer=timeit.default_timer,
             language=benchmark_utils.Language.CPP,
         )
         t = timer.timeit(10)
@@ -449,6 +451,7 @@ class MockCudaTimer(benchmark_utils.Timer):
 
     @slowTest
     @unittest.skipIf(IS_WINDOWS, "Valgrind is not supported on Windows.")
+    @unittest.skipIf(IS_SANDCASTLE, "Valgrind is OSS only.")
     def test_collect_callgrind(self):
         with self.assertRaisesRegex(
             ValueError,
@@ -505,6 +508,32 @@ def add_one(x):
             "JIT'd bindings are only for back testing."
         )
 
+    @slowTest
+    @unittest.skipIf(IS_WINDOWS, "Valgrind is not supported on Windows.")
+    @unittest.skipIf(IS_SANDCASTLE, "Valgrind is OSS only.")
+    def test_collect_cpp_callgrind(self):
+        timer = benchmark_utils.Timer(
+            "x += 1;",
+            setup="torch::Tensor x = torch::ones({1});",
+            timer=timeit.default_timer,
+            language="c++",
+        )
+        stats = [
+            timer.collect_callgrind()
+            for _ in range(3)
+        ]
+        counts = [s.counts() for s in stats]
+
+        self.assertGreater(
+            min(counts), 0, "No stats were collected")
+        self.assertEqual(
+            min(counts), max(counts), "C++ Callgrind should be deterministic")
+
+        for s in stats:
+            self.assertEqual(
+                s.counts(denoise=True), s.counts(denoise=False),
+                "De-noising should not apply to C++.")
+
     def test_manipulate_callgrind_stats(self):
         stats_no_data, stats_with_data = load_callgrind_artifacts()
 
diff --git a/torch/utils/benchmark/utils/cpp_jit.py b/torch/utils/benchmark/utils/cpp_jit.py
index ebaa4213e027..3ec3b6c097d8 100644
--- a/torch/utils/benchmark/utils/cpp_jit.py
+++ b/torch/utils/benchmark/utils/cpp_jit.py
@@ -3,6 +3,7 @@
 import os
 import re
 import shutil
+import tempfile
 import textwrap
 import threading
 import uuid
@@ -30,8 +31,8 @@
 # `setup` and `stmt` do not change, so we can reuse the executable from the
 # first pass through the loop.
 BUILD_ROOT = os.path.join(
-    torch._appdirs.user_cache_dir(appname="benchmark_utils_jit"),
-    f"build_{uuid.uuid4()}".replace("-", "")
+    tempfile.gettempdir(),
+    f"benchmark_utils_jit_build_{uuid.uuid4()}".replace("-", "")
 )
 
 # BACK_TESTING_NOTE:
@@ -141,3 +142,13 @@ def compile_timeit_template(stmt: str, setup: str) -> TimeitModuleType:
     module = _compile_template(stmt, setup, src, is_standalone=False)
     assert isinstance(module, TimeitModuleType)
     return module
+
+
+def compile_callgrind_template(stmt: str, setup: str) -> str:
+    template_path: str = os.path.join(SOURCE_ROOT, "valgrind_wrapper", "timer_callgrind_template.cpp")
+    with open(template_path, "rt") as f:
+        src: str = f.read()
+
+    target = _compile_template(stmt, setup, src, is_standalone=True)
+    assert isinstance(target, str)
+    return target
diff --git a/torch/utils/benchmark/utils/timer.py b/torch/utils/benchmark/utils/timer.py
index 374b8bd6c6e0..5bf37a3c9dec 100644
--- a/torch/utils/benchmark/utils/timer.py
+++ b/torch/utils/benchmark/utils/timer.py
@@ -421,15 +421,15 @@ def collect_callgrind(
         if not isinstance(self._task_spec.stmt, str):
             raise ValueError("`collect_callgrind` currently only supports string `stmt`")
 
-        if self._language != Language.PYTHON:
-            raise NotImplementedError("C++ Callgrind is later in the stack.")
-
         # Check that the statement is valid. It doesn't guarantee success, but it's much
         # simpler and quicker to raise an exception for a faulty `stmt` or `setup` in
         # the parent process rather than the valgrind subprocess.
         self._timer.timeit(1)
+        is_python = (self._language == Language.PYTHON)
+        assert is_python or not self._globals
         return valgrind_timer_interface.wrapper_singleton().collect_callgrind(
             task_spec=self._task_spec,
             globals=self._globals,
             number=number,
-            collect_baseline=collect_baseline)
+            collect_baseline=collect_baseline and is_python,
+            is_python=is_python)
diff --git a/torch/utils/benchmark/utils/valgrind_wrapper/timer_callgrind_template.cpp b/torch/utils/benchmark/utils/valgrind_wrapper/timer_callgrind_template.cpp
new file mode 100644
index 000000000000..a64484f70924
--- /dev/null
+++ b/torch/utils/benchmark/utils/valgrind_wrapper/timer_callgrind_template.cpp
@@ -0,0 +1,47 @@
+/* C++ template for Timer.collect_callgrind
+
+This template will be consumed by `cpp_jit.py`, and will replace:
+    `SETUP_TEMPLATE_LOCATION`
+      and
+    `STMT_TEMPLATE_LOCATION`
+sections with user provided statements.
+*/
+
+#include <string>
+
+#include <callgrind.h>
+#include <torch/torch.h>
+
+#if defined(NVALGRIND)
+static_assert(false);
+#endif
+
+int main(int argc, char* argv[]) {
+    // This file should only be called inside of `Timer`, so we can adopt a
+    // very simple and rigid argument parsing scheme.
+    TORCH_CHECK(argc == 7);
+    TORCH_CHECK(std::string(argv[1]) == "--number");
+    auto number = std::stoi(argv[2]);
+
+    TORCH_CHECK(std::string(argv[3]) == "--number_warmup");
+    auto number_warmup = std::stoi(argv[4]);
+
+    TORCH_CHECK(std::string(argv[5]) == "--number_threads");
+    auto number_threads = std::stoi(argv[6]);
+    torch::set_num_threads(number_threads);
+
+    // Setup
+    // SETUP_TEMPLATE_LOCATION
+
+    // Warmup
+    for (int i = 0; i < number_warmup; i++) {
+        // STMT_TEMPLATE_LOCATION
+    }
+
+    // Main loop
+    CALLGRIND_TOGGLE_COLLECT;
+    for (int i = 0; i < number; i++) {
+        // STMT_TEMPLATE_LOCATION
+    }
+    CALLGRIND_TOGGLE_COLLECT;
+}
diff --git a/torch/utils/benchmark/utils/valgrind_wrapper/timer_interface.py b/torch/utils/benchmark/utils/valgrind_wrapper/timer_interface.py
index b8513671beb9..64c5d9793e80 100644
--- a/torch/utils/benchmark/utils/valgrind_wrapper/timer_interface.py
+++ b/torch/utils/benchmark/utils/valgrind_wrapper/timer_interface.py
@@ -483,10 +483,13 @@ def collect_callgrind(
         task_spec: common.TaskSpec,
         globals: Dict[str, Any],
         number: int,
-        collect_baseline: bool
+        collect_baseline: bool,
+        is_python: bool,
     ) -> CallgrindStats:
         """Collect stats, and attach a reference run which can be used to filter interpreter overhead."""
         self._validate()
+        assert is_python or not collect_baseline
+
         baseline_inclusive_stats = FunctionCounts((), inclusive=True)
         baseline_exclusive_stats = FunctionCounts((), inclusive=False)
         if collect_baseline:
@@ -496,15 +499,17 @@ def collect_callgrind(
                     common.TaskSpec(
                         stmt="pass",
                         setup="pass",
-                        num_threads=task_spec.num_threads
+                        num_threads=task_spec.num_threads,
                     ),
                     globals={},
                     number=number,
+                    is_python=True,
                 )
             baseline_inclusive_stats, baseline_exclusive_stats = \
                 self._baseline_cache[cache_key]
 
-        stmt_inclusive_stats, stmt_exclusive_stats = self._invoke(task_spec, globals, number)
+        stmt_inclusive_stats, stmt_exclusive_stats = self._invoke(
+            task_spec, globals, number, is_python)
         return CallgrindStats(
             task_spec=task_spec,
             number_per_run=number,
@@ -520,6 +525,7 @@ def _invoke(
         task_spec: common.TaskSpec,
         globals: Dict[str, Any],
         number: int,
+        is_python: bool,
     ) -> Tuple[FunctionCounts, FunctionCounts]:
         """Core invocation method for Callgrind collection.
 
@@ -565,20 +571,34 @@ def run(args: List[str], **kwargs: Any) -> Tuple[CompletedProcessType, str]:
                 f_stdout_stderr.close()
 
         try:
-            if self._bindings_module is not None:
-                shutil.copy(
-                    self._bindings_module.__file__,
-                    os.path.join(working_dir, os.path.split(self._bindings_module.__file__)[1])
+            if is_python:
+                if self._bindings_module is not None:
+                    shutil.copy(
+                        self._bindings_module.__file__,
+                        os.path.join(working_dir, os.path.split(self._bindings_module.__file__)[1])
+                    )
+
+                script_file = os.path.join(working_dir, "timer_callgrind.py")
+                with open(script_file, "wt") as f:
+                    f.write(self._construct_script(
+                        task_spec,
+                        globals=GlobalsBridge(globals, data_dir),
+                        number=number,
+                        error_log=error_log,
+                        stat_log=stat_log,
+                        bindings=self._bindings_module))
+                run_loop_cmd = ["python", script_file]
+            else:
+                run_loop_exec = cpp_jit.compile_callgrind_template(
+                    task_spec.stmt,
+                    task_spec.setup,
                 )
-
-            with open(script_file, "wt") as f:
-                f.write(self._construct_script(
-                    task_spec,
-                    globals=GlobalsBridge(globals, data_dir),
-                    number=number,
-                    error_log=error_log,
-                    stat_log=stat_log,
-                    bindings=self._bindings_module))
+                run_loop_cmd = [
+                    run_loop_exec,
+                    "--number", str(number),
+                    "--number_warmup", str(min(number, 10)),
+                    "--number_threads", str(task_spec.num_threads),
+                ]
 
             valgrind_invocation, valgrind_invocation_output = run([
                 "valgrind",
@@ -588,9 +608,7 @@ def run(args: List[str], **kwargs: Any) -> Tuple[CompletedProcessType, str]:
                 "--dump-instr=yes",
                 "--instr-atstart=yes",
                 "--collect-atstart=no",
-                "python",
-                script_file,
-            ])
+            ] + run_loop_cmd)
 
             if valgrind_invocation.returncode:
                 error_report = ""
diff --git a/torch/utils/cpp_extension.py b/torch/utils/cpp_extension.py
index 993b04ca23d8..22b20636d6c1 100644
--- a/torch/utils/cpp_extension.py
+++ b/torch/utils/cpp_extension.py
@@ -1393,6 +1393,9 @@ def _prepare_ldflags(extra_ldflags, with_cuda, verbose, is_standalone):
         if not is_standalone:
             extra_ldflags.append('-ltorch_python')
 
+        if is_standalone and "TBB" in torch.__config__.parallel_info():
+            extra_ldflags.append('-ltbb')
+
         if is_standalone:
             extra_ldflags.append(f"-Wl,-rpath,{TORCH_LIB_PATH}")