From 022c929145e96ff6992dd51ce1f58e9b541773be Mon Sep 17 00:00:00 2001 From: Taylor Robie Date: Wed, 2 Dec 2020 11:07:56 -0800 Subject: [PATCH] Revert "Revert D25199264: Enable callgrind collection for C++ snippets" (#48720) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/48720 This reverts commit 6646ff122d3215b77909f669fc26cf6a927030db. Test Plan: Imported from OSS Reviewed By: malfet Differential Revision: D25273994 Pulled By: malfet fbshipit-source-id: 61743176dc650136622e1b8f2384bbfbd7a46294 --- test/benchmark_utils/test_benchmark_utils.py | 29 ++++++++++ torch/utils/benchmark/utils/cpp_jit.py | 15 ++++- torch/utils/benchmark/utils/timer.py | 8 +-- .../timer_callgrind_template.cpp | 47 ++++++++++++++++ .../utils/valgrind_wrapper/timer_interface.py | 56 ++++++++++++------- torch/utils/cpp_extension.py | 3 + 6 files changed, 133 insertions(+), 25 deletions(-) create mode 100644 torch/utils/benchmark/utils/valgrind_wrapper/timer_callgrind_template.cpp diff --git a/test/benchmark_utils/test_benchmark_utils.py b/test/benchmark_utils/test_benchmark_utils.py index 779a704bfd92..e15a0fec338b 100644 --- a/test/benchmark_utils/test_benchmark_utils.py +++ b/test/benchmark_utils/test_benchmark_utils.py @@ -2,6 +2,7 @@ import os import re import textwrap +import timeit from typing import Any, List, Tuple import unittest @@ -168,6 +169,7 @@ def test_cpp_timer(self): timer = benchmark_utils.Timer( "torch::Tensor y = x + 1;", setup="torch::Tensor x = torch::empty({1});", + timer=timeit.default_timer, language=benchmark_utils.Language.CPP, ) t = timer.timeit(10) @@ -449,6 +451,7 @@ class MockCudaTimer(benchmark_utils.Timer): @slowTest @unittest.skipIf(IS_WINDOWS, "Valgrind is not supported on Windows.") + @unittest.skipIf(IS_SANDCASTLE, "Valgrind is OSS only.") def test_collect_callgrind(self): with self.assertRaisesRegex( ValueError, @@ -505,6 +508,32 @@ def add_one(x): "JIT'd bindings are only for back testing." ) + @slowTest + @unittest.skipIf(IS_WINDOWS, "Valgrind is not supported on Windows.") + @unittest.skipIf(IS_SANDCASTLE, "Valgrind is OSS only.") + def test_collect_cpp_callgrind(self): + timer = benchmark_utils.Timer( + "x += 1;", + setup="torch::Tensor x = torch::ones({1});", + timer=timeit.default_timer, + language="c++", + ) + stats = [ + timer.collect_callgrind() + for _ in range(3) + ] + counts = [s.counts() for s in stats] + + self.assertGreater( + min(counts), 0, "No stats were collected") + self.assertEqual( + min(counts), max(counts), "C++ Callgrind should be deterministic") + + for s in stats: + self.assertEqual( + s.counts(denoise=True), s.counts(denoise=False), + "De-noising should not apply to C++.") + def test_manipulate_callgrind_stats(self): stats_no_data, stats_with_data = load_callgrind_artifacts() diff --git a/torch/utils/benchmark/utils/cpp_jit.py b/torch/utils/benchmark/utils/cpp_jit.py index ebaa4213e027..3ec3b6c097d8 100644 --- a/torch/utils/benchmark/utils/cpp_jit.py +++ b/torch/utils/benchmark/utils/cpp_jit.py @@ -3,6 +3,7 @@ import os import re import shutil +import tempfile import textwrap import threading import uuid @@ -30,8 +31,8 @@ # `setup` and `stmt` do not change, so we can reuse the executable from the # first pass through the loop. BUILD_ROOT = os.path.join( - torch._appdirs.user_cache_dir(appname="benchmark_utils_jit"), - f"build_{uuid.uuid4()}".replace("-", "") + tempfile.gettempdir(), + f"benchmark_utils_jit_build_{uuid.uuid4()}".replace("-", "") ) # BACK_TESTING_NOTE: @@ -141,3 +142,13 @@ def compile_timeit_template(stmt: str, setup: str) -> TimeitModuleType: module = _compile_template(stmt, setup, src, is_standalone=False) assert isinstance(module, TimeitModuleType) return module + + +def compile_callgrind_template(stmt: str, setup: str) -> str: + template_path: str = os.path.join(SOURCE_ROOT, "valgrind_wrapper", "timer_callgrind_template.cpp") + with open(template_path, "rt") as f: + src: str = f.read() + + target = _compile_template(stmt, setup, src, is_standalone=True) + assert isinstance(target, str) + return target diff --git a/torch/utils/benchmark/utils/timer.py b/torch/utils/benchmark/utils/timer.py index 374b8bd6c6e0..5bf37a3c9dec 100644 --- a/torch/utils/benchmark/utils/timer.py +++ b/torch/utils/benchmark/utils/timer.py @@ -421,15 +421,15 @@ def collect_callgrind( if not isinstance(self._task_spec.stmt, str): raise ValueError("`collect_callgrind` currently only supports string `stmt`") - if self._language != Language.PYTHON: - raise NotImplementedError("C++ Callgrind is later in the stack.") - # Check that the statement is valid. It doesn't guarantee success, but it's much # simpler and quicker to raise an exception for a faulty `stmt` or `setup` in # the parent process rather than the valgrind subprocess. self._timer.timeit(1) + is_python = (self._language == Language.PYTHON) + assert is_python or not self._globals return valgrind_timer_interface.wrapper_singleton().collect_callgrind( task_spec=self._task_spec, globals=self._globals, number=number, - collect_baseline=collect_baseline) + collect_baseline=collect_baseline and is_python, + is_python=is_python) diff --git a/torch/utils/benchmark/utils/valgrind_wrapper/timer_callgrind_template.cpp b/torch/utils/benchmark/utils/valgrind_wrapper/timer_callgrind_template.cpp new file mode 100644 index 000000000000..a64484f70924 --- /dev/null +++ b/torch/utils/benchmark/utils/valgrind_wrapper/timer_callgrind_template.cpp @@ -0,0 +1,47 @@ +/* C++ template for Timer.collect_callgrind + +This template will be consumed by `cpp_jit.py`, and will replace: + `SETUP_TEMPLATE_LOCATION` + and + `STMT_TEMPLATE_LOCATION` +sections with user provided statements. +*/ + +#include + +#include +#include + +#if defined(NVALGRIND) +static_assert(false); +#endif + +int main(int argc, char* argv[]) { + // This file should only be called inside of `Timer`, so we can adopt a + // very simple and rigid argument parsing scheme. + TORCH_CHECK(argc == 7); + TORCH_CHECK(std::string(argv[1]) == "--number"); + auto number = std::stoi(argv[2]); + + TORCH_CHECK(std::string(argv[3]) == "--number_warmup"); + auto number_warmup = std::stoi(argv[4]); + + TORCH_CHECK(std::string(argv[5]) == "--number_threads"); + auto number_threads = std::stoi(argv[6]); + torch::set_num_threads(number_threads); + + // Setup + // SETUP_TEMPLATE_LOCATION + + // Warmup + for (int i = 0; i < number_warmup; i++) { + // STMT_TEMPLATE_LOCATION + } + + // Main loop + CALLGRIND_TOGGLE_COLLECT; + for (int i = 0; i < number; i++) { + // STMT_TEMPLATE_LOCATION + } + CALLGRIND_TOGGLE_COLLECT; +} diff --git a/torch/utils/benchmark/utils/valgrind_wrapper/timer_interface.py b/torch/utils/benchmark/utils/valgrind_wrapper/timer_interface.py index b8513671beb9..64c5d9793e80 100644 --- a/torch/utils/benchmark/utils/valgrind_wrapper/timer_interface.py +++ b/torch/utils/benchmark/utils/valgrind_wrapper/timer_interface.py @@ -483,10 +483,13 @@ def collect_callgrind( task_spec: common.TaskSpec, globals: Dict[str, Any], number: int, - collect_baseline: bool + collect_baseline: bool, + is_python: bool, ) -> CallgrindStats: """Collect stats, and attach a reference run which can be used to filter interpreter overhead.""" self._validate() + assert is_python or not collect_baseline + baseline_inclusive_stats = FunctionCounts((), inclusive=True) baseline_exclusive_stats = FunctionCounts((), inclusive=False) if collect_baseline: @@ -496,15 +499,17 @@ def collect_callgrind( common.TaskSpec( stmt="pass", setup="pass", - num_threads=task_spec.num_threads + num_threads=task_spec.num_threads, ), globals={}, number=number, + is_python=True, ) baseline_inclusive_stats, baseline_exclusive_stats = \ self._baseline_cache[cache_key] - stmt_inclusive_stats, stmt_exclusive_stats = self._invoke(task_spec, globals, number) + stmt_inclusive_stats, stmt_exclusive_stats = self._invoke( + task_spec, globals, number, is_python) return CallgrindStats( task_spec=task_spec, number_per_run=number, @@ -520,6 +525,7 @@ def _invoke( task_spec: common.TaskSpec, globals: Dict[str, Any], number: int, + is_python: bool, ) -> Tuple[FunctionCounts, FunctionCounts]: """Core invocation method for Callgrind collection. @@ -565,20 +571,34 @@ def run(args: List[str], **kwargs: Any) -> Tuple[CompletedProcessType, str]: f_stdout_stderr.close() try: - if self._bindings_module is not None: - shutil.copy( - self._bindings_module.__file__, - os.path.join(working_dir, os.path.split(self._bindings_module.__file__)[1]) + if is_python: + if self._bindings_module is not None: + shutil.copy( + self._bindings_module.__file__, + os.path.join(working_dir, os.path.split(self._bindings_module.__file__)[1]) + ) + + script_file = os.path.join(working_dir, "timer_callgrind.py") + with open(script_file, "wt") as f: + f.write(self._construct_script( + task_spec, + globals=GlobalsBridge(globals, data_dir), + number=number, + error_log=error_log, + stat_log=stat_log, + bindings=self._bindings_module)) + run_loop_cmd = ["python", script_file] + else: + run_loop_exec = cpp_jit.compile_callgrind_template( + task_spec.stmt, + task_spec.setup, ) - - with open(script_file, "wt") as f: - f.write(self._construct_script( - task_spec, - globals=GlobalsBridge(globals, data_dir), - number=number, - error_log=error_log, - stat_log=stat_log, - bindings=self._bindings_module)) + run_loop_cmd = [ + run_loop_exec, + "--number", str(number), + "--number_warmup", str(min(number, 10)), + "--number_threads", str(task_spec.num_threads), + ] valgrind_invocation, valgrind_invocation_output = run([ "valgrind", @@ -588,9 +608,7 @@ def run(args: List[str], **kwargs: Any) -> Tuple[CompletedProcessType, str]: "--dump-instr=yes", "--instr-atstart=yes", "--collect-atstart=no", - "python", - script_file, - ]) + ] + run_loop_cmd) if valgrind_invocation.returncode: error_report = "" diff --git a/torch/utils/cpp_extension.py b/torch/utils/cpp_extension.py index 993b04ca23d8..22b20636d6c1 100644 --- a/torch/utils/cpp_extension.py +++ b/torch/utils/cpp_extension.py @@ -1393,6 +1393,9 @@ def _prepare_ldflags(extra_ldflags, with_cuda, verbose, is_standalone): if not is_standalone: extra_ldflags.append('-ltorch_python') + if is_standalone and "TBB" in torch.__config__.parallel_info(): + extra_ldflags.append('-ltbb') + if is_standalone: extra_ldflags.append(f"-Wl,-rpath,{TORCH_LIB_PATH}")