Revert "Revert D25199264: Enable callgrind collection for C++ snippet…
Browse files Browse the repository at this point in the history
…s" (#48720)

Summary:
Pull Request resolved: #48720

This reverts commit 6646ff1.

Test Plan: Imported from OSS

Reviewed By: malfet

Differential Revision: D25273994

Pulled By: malfet

fbshipit-source-id: 61743176dc650136622e1b8f2384bbfbd7a46294
Taylor Robie authored and facebook-github-bot committed Dec 2, 2020
1 parent b2ec21a commit 022c929
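
For context, a minimal sketch of the usage this change enables, assembled from the new test_collect_cpp_callgrind test below (the statement and setup strings are the ones used there; nothing beyond the test is implied):

import timeit
import torch.utils.benchmark as benchmark_utils

timer = benchmark_utils.Timer(
    "x += 1;",
    setup="torch::Tensor x = torch::ones({1});",
    timer=timeit.default_timer,
    language="c++",
)
stats = timer.collect_callgrind()   # now supported for C++ snippets, not just Python
print(stats.counts(denoise=False))  # instruction counts; denoising is a no-op for C++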
Showing 6 changed files with 133 additions and 25 deletions.
29 changes: 29 additions & 0 deletions test/benchmark_utils/test_benchmark_utils.py
@@ -2,6 +2,7 @@
import os
import re
import textwrap
import timeit
from typing import Any, List, Tuple
import unittest

@@ -168,6 +169,7 @@ def test_cpp_timer(self):
timer = benchmark_utils.Timer(
"torch::Tensor y = x + 1;",
setup="torch::Tensor x = torch::empty({1});",
timer=timeit.default_timer,
language=benchmark_utils.Language.CPP,
)
t = timer.timeit(10)
@@ -449,6 +451,7 @@ class MockCudaTimer(benchmark_utils.Timer):

@slowTest
@unittest.skipIf(IS_WINDOWS, "Valgrind is not supported on Windows.")
@unittest.skipIf(IS_SANDCASTLE, "Valgrind is OSS only.")
def test_collect_callgrind(self):
with self.assertRaisesRegex(
ValueError,
@@ -505,6 +508,32 @@ def add_one(x):
"JIT'd bindings are only for back testing."
)

@slowTest
@unittest.skipIf(IS_WINDOWS, "Valgrind is not supported on Windows.")
@unittest.skipIf(IS_SANDCASTLE, "Valgrind is OSS only.")
def test_collect_cpp_callgrind(self):
timer = benchmark_utils.Timer(
"x += 1;",
setup="torch::Tensor x = torch::ones({1});",
timer=timeit.default_timer,
language="c++",
)
stats = [
timer.collect_callgrind()
for _ in range(3)
]
counts = [s.counts() for s in stats]

self.assertGreater(
min(counts), 0, "No stats were collected")
self.assertEqual(
min(counts), max(counts), "C++ Callgrind should be deterministic")

for s in stats:
self.assertEqual(
s.counts(denoise=True), s.counts(denoise=False),
"De-noising should not apply to C++.")

def test_manipulate_callgrind_stats(self):
stats_no_data, stats_with_data = load_callgrind_artifacts()

15 changes: 13 additions & 2 deletions torch/utils/benchmark/utils/cpp_jit.py
@@ -3,6 +3,7 @@
import os
import re
import shutil
import tempfile
import textwrap
import threading
import uuid
@@ -30,8 +31,8 @@
# `setup` and `stmt` do not change, so we can reuse the executable from the
# first pass through the loop.
BUILD_ROOT = os.path.join(
torch._appdirs.user_cache_dir(appname="benchmark_utils_jit"),
f"build_{uuid.uuid4()}".replace("-", "")
tempfile.gettempdir(),
f"benchmark_utils_jit_build_{uuid.uuid4()}".replace("-", "")
)

# BACK_TESTING_NOTE:
@@ -141,3 +142,13 @@ def compile_timeit_template(stmt: str, setup: str) -> TimeitModuleType:
module = _compile_template(stmt, setup, src, is_standalone=False)
assert isinstance(module, TimeitModuleType)
return module


def compile_callgrind_template(stmt: str, setup: str) -> str:
template_path: str = os.path.join(SOURCE_ROOT, "valgrind_wrapper", "timer_callgrind_template.cpp")
with open(template_path, "rt") as f:
src: str = f.read()

target = _compile_template(stmt, setup, src, is_standalone=True)
assert isinstance(target, str)
return target
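
A rough sketch of how this new helper is intended to be used (the stmt/setup values here are hypothetical; the authoritative call site is in timer_interface.py further down). Unlike compile_timeit_template above, it builds a standalone executable and returns its path rather than an importable module:

# Assumed example inputs; any valid C++ stmt/setup pair would do.
runner_path = compile_callgrind_template(
    stmt="torch::Tensor y = x + 1;",
    setup="torch::Tensor x = torch::empty({1});",
)
# runner_path points at a binary that accepts --number, --number_warmup, and
# --number_threads, matching the argument checks in timer_callgrind_template.cpp.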
8 changes: 4 additions & 4 deletions torch/utils/benchmark/utils/timer.py
@@ -421,15 +421,15 @@ def collect_callgrind(
if not isinstance(self._task_spec.stmt, str):
raise ValueError("`collect_callgrind` currently only supports string `stmt`")

if self._language != Language.PYTHON:
raise NotImplementedError("C++ Callgrind is later in the stack.")

# Check that the statement is valid. It doesn't guarantee success, but it's much
# simpler and quicker to raise an exception for a faulty `stmt` or `setup` in
# the parent process rather than the valgrind subprocess.
self._timer.timeit(1)
is_python = (self._language == Language.PYTHON)
assert is_python or not self._globals
return valgrind_timer_interface.wrapper_singleton().collect_callgrind(
task_spec=self._task_spec,
globals=self._globals,
number=number,
collect_baseline=collect_baseline)
collect_baseline=collect_baseline and is_python,
is_python=is_python)
47 changes: 47 additions & 0 deletions torch/utils/benchmark/utils/valgrind_wrapper/timer_callgrind_template.cpp
@@ -0,0 +1,47 @@
/* C++ template for Timer.collect_callgrind
This template will be consumed by `cpp_jit.py`, and will replace:
`SETUP_TEMPLATE_LOCATION`
and
`STMT_TEMPLATE_LOCATION`
sections with user provided statements.
*/

#include <string>

#include <callgrind.h>
#include <torch/torch.h>

#if defined(NVALGRIND)
static_assert(false);
#endif

int main(int argc, char* argv[]) {
// This file should only be called inside of `Timer`, so we can adopt a
// very simple and rigid argument parsing scheme.
TORCH_CHECK(argc == 7);
TORCH_CHECK(std::string(argv[1]) == "--number");
auto number = std::stoi(argv[2]);

TORCH_CHECK(std::string(argv[3]) == "--number_warmup");
auto number_warmup = std::stoi(argv[4]);

TORCH_CHECK(std::string(argv[5]) == "--number_threads");
auto number_threads = std::stoi(argv[6]);
torch::set_num_threads(number_threads);

// Setup
// SETUP_TEMPLATE_LOCATION

// Warmup
for (int i = 0; i < number_warmup; i++) {
// STMT_TEMPLATE_LOCATION
}

// Main loop
CALLGRIND_TOGGLE_COLLECT;
for (int i = 0; i < number; i++) {
// STMT_TEMPLATE_LOCATION
}
CALLGRIND_TOGGLE_COLLECT;
}
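
The rigid argument parsing above means the binary must be launched with exactly --number, --number_warmup, and --number_threads. A hedged sketch of the kind of command the Python side assembles (the flag list is abridged and the binary path is a placeholder; timer_interface.py below builds the real invocation):

import subprocess

subprocess.run([
    "valgrind",
    "--tool=callgrind",
    "--dump-instr=yes",
    "--instr-atstart=yes",
    "--collect-atstart=no",
    "/path/to/compiled_callgrind_runner",  # output of compile_callgrind_template
    "--number", "100",
    "--number_warmup", "10",
    "--number_threads", "1",
])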
56 changes: 37 additions & 19 deletions torch/utils/benchmark/utils/valgrind_wrapper/timer_interface.py
@@ -483,10 +483,13 @@ def collect_callgrind(
task_spec: common.TaskSpec,
globals: Dict[str, Any],
number: int,
collect_baseline: bool
collect_baseline: bool,
is_python: bool,
) -> CallgrindStats:
"""Collect stats, and attach a reference run which can be used to filter interpreter overhead."""
self._validate()
assert is_python or not collect_baseline

baseline_inclusive_stats = FunctionCounts((), inclusive=True)
baseline_exclusive_stats = FunctionCounts((), inclusive=False)
if collect_baseline:
@@ -496,15 +499,17 @@ def collect_callgrind(
common.TaskSpec(
stmt="pass",
setup="pass",
num_threads=task_spec.num_threads
num_threads=task_spec.num_threads,
),
globals={},
number=number,
is_python=True,
)
baseline_inclusive_stats, baseline_exclusive_stats = \
self._baseline_cache[cache_key]

stmt_inclusive_stats, stmt_exclusive_stats = self._invoke(task_spec, globals, number)
stmt_inclusive_stats, stmt_exclusive_stats = self._invoke(
task_spec, globals, number, is_python)
return CallgrindStats(
task_spec=task_spec,
number_per_run=number,
@@ -520,6 +525,7 @@ def _invoke(
task_spec: common.TaskSpec,
globals: Dict[str, Any],
number: int,
is_python: bool,
) -> Tuple[FunctionCounts, FunctionCounts]:
"""Core invocation method for Callgrind collection.
@@ -565,20 +571,34 @@ def run(args: List[str], **kwargs: Any) -> Tuple[CompletedProcessType, str]:
f_stdout_stderr.close()

try:
if self._bindings_module is not None:
shutil.copy(
self._bindings_module.__file__,
os.path.join(working_dir, os.path.split(self._bindings_module.__file__)[1])
if is_python:
if self._bindings_module is not None:
shutil.copy(
self._bindings_module.__file__,
os.path.join(working_dir, os.path.split(self._bindings_module.__file__)[1])
)

script_file = os.path.join(working_dir, "timer_callgrind.py")
with open(script_file, "wt") as f:
f.write(self._construct_script(
task_spec,
globals=GlobalsBridge(globals, data_dir),
number=number,
error_log=error_log,
stat_log=stat_log,
bindings=self._bindings_module))
run_loop_cmd = ["python", script_file]
else:
run_loop_exec = cpp_jit.compile_callgrind_template(
task_spec.stmt,
task_spec.setup,
)

with open(script_file, "wt") as f:
f.write(self._construct_script(
task_spec,
globals=GlobalsBridge(globals, data_dir),
number=number,
error_log=error_log,
stat_log=stat_log,
bindings=self._bindings_module))
run_loop_cmd = [
run_loop_exec,
"--number", str(number),
"--number_warmup", str(min(number, 10)),
"--number_threads", str(task_spec.num_threads),
]

valgrind_invocation, valgrind_invocation_output = run([
"valgrind",
@@ -588,9 +608,7 @@ def run(args: List[str], **kwargs: Any) -> Tuple[CompletedProcessType, str]:
"--dump-instr=yes",
"--instr-atstart=yes",
"--collect-atstart=no",
"python",
script_file,
])
] + run_loop_cmd)

if valgrind_invocation.returncode:
error_report = ""
3 changes: 3 additions & 0 deletions torch/utils/cpp_extension.py
@@ -1393,6 +1393,9 @@ def _prepare_ldflags(extra_ldflags, with_cuda, verbose, is_standalone):
if not is_standalone:
extra_ldflags.append('-ltorch_python')

if is_standalone and "TBB" in torch.__config__.parallel_info():
extra_ldflags.append('-ltbb')

if is_standalone:
extra_ldflags.append(f"-Wl,-rpath,{TORCH_LIB_PATH}")
