From de63a634ffe59f43aa7f23062cf50652906bbff7 Mon Sep 17 00:00:00 2001 From: Xu Zhao Date: Mon, 29 Aug 2022 20:23:23 -0400 Subject: [PATCH 1/4] Working on setting up cuda env for compile --- .github/scripts/run-config.py | 56 ++++-------------------------- bisection.py | 4 +++ utils/cuda_utils.py | 64 +++++++++++++++++++++++++++++++++++ 3 files changed, 74 insertions(+), 50 deletions(-) create mode 100644 utils/cuda_utils.py diff --git a/.github/scripts/run-config.py b/.github/scripts/run-config.py index 612ba74957..eb75ea2281 100644 --- a/.github/scripts/run-config.py +++ b/.github/scripts/run-config.py @@ -16,25 +16,12 @@ from bmutils import add_path from bmutils.summarize import analyze_result + REPO_DIR = str(Path(__file__).parent.parent.parent.resolve()) with add_path(REPO_DIR): from torchbenchmark import _list_model_paths - -CUDA_VERSION_MAP = { - "11.3": { - "pytorch_url": "cu113", - "magma_version": "magma-cuda113", - }, - "11.6": { - "pytorch_url": "cu116", - "magma_version": "magma-cuda116", - }, - "11.7": { - "pytorch_url": "cu117", - "magma_version": "magma-cuda117", - } -} + from utils.cuda_utils import prepare_cuda_env, install_pytorch_nightly @dataclass class BenchmarkModelConfig: @@ -130,43 +117,12 @@ def parse_bmconfigs(repo_path: Path, config_name: str) -> List[BenchmarkModelCon def prepare_bmconfig_env(config: BenchmarkModelConfig, repo_path: Path, dryrun=False): """Prepare the correct cuda version environment for the benchmarking.""" - env = os.environ if not config.cuda_version: - return env + return os.environ.copy() cuda_version = config.cuda_version - # step 1: setup CUDA path and environment variables - cuda_path = Path("/").joinpath("usr", "local", f"cuda-{cuda_version}") - assert cuda_path.exists() and cuda_path.is_dir(), f"Expected CUDA Library path {cuda_path} doesn't exist." - env["CUDA_ROOT"] = str(cuda_path) - env["CUDA_HOME"] = str(cuda_path) - env["PATH"] = f"{str(cuda_path)}/bin:{env['PATH']}" - env["LD_LIBRARY_PATH"] = f"{str(cuda_path)}/lib64:{str(cuda_path)}/extras/CUPTI/lib64:{env['LD_LIBRARY_PATH']}" - # step 2: test call nvcc to confirm the version - test_nvcc = ["nvcc", "--version"] - if not dryrun: - subprocess.check_call(test_nvcc) - # step 1: uninstall all pytorch packages - uninstall_torch_cmd = ["pip", "uninstall", "-y", "torch", "torchvision", "torchtext"] - print(f"Uninstall pytorch: {uninstall_torch_cmd}") - if not dryrun: - for _loop in range(3): - subprocess.check_call(uninstall_torch_cmd) - # step 2: install pytorch nightly with the correct cuda version - install_magma_cmd = ["conda", "install", "-c", "pytorch", CUDA_VERSION_MAP[cuda_version]['magma_version']] - print(f"Install magma: {install_magma_cmd}") - if not dryrun: - subprocess.check_call(install_magma_cmd) - pytorch_nightly_url = f"https://download.pytorch.org/whl/nightly/{CUDA_VERSION_MAP[cuda_version]['pytorch_url']}/torch_nightly.html" - install_torch_cmd = ["pip", "install", "--pre", "torch", "torchvision", "torchtext", "-f", pytorch_nightly_url] - print(f"Install pytorch nightly: {install_torch_cmd}") - if not dryrun: - subprocess.check_call(install_torch_cmd) - # step 3: install torchbench - install_torchbench_cmd = [sys.executable, "install.py"] - print(f"Install torchbench: {install_torchbench_cmd}") - if not dryrun: - subprocess.check_call(install_torchbench_cmd, cwd=repo_path) - return env + new_env = prepare_cuda_env(cuda_version=cuda_version) + install_pytorch_nightly(cuda_version=cuda_version, dryrun=dryrun) + return new_env def run_bmconfig(config: BenchmarkModelConfig, repo_path: Path, output_path: Path, dryrun=False): run_env = prepare_bmconfig_env(config, repo_path=repo_path, dryrun=dryrun) diff --git a/bisection.py b/bisection.py index e288a6b502..1692a49cc3 100644 --- a/bisection.py +++ b/bisection.py @@ -23,6 +23,10 @@ from typing import Optional, List, Dict, Tuple from torchbenchmark.util import gitutils +from utils.cuda_utils import prepare_cuda_env + +# defines the default CUDA version to compile against +DEFAULT_CUDA_VERSION = "11.6" TORCH_GITREPO="https://github.com/pytorch/pytorch.git" TORCHBENCH_GITREPO="https://github.com/pytorch/benchmark.git" diff --git a/utils/cuda_utils.py b/utils/cuda_utils.py new file mode 100644 index 0000000000..b243ef2c9d --- /dev/null +++ b/utils/cuda_utils.py @@ -0,0 +1,64 @@ +import os +import subprocess +from pathlib import Path + +CUDA_VERSION_MAP = { + "11.3": { + "pytorch_url": "cu113", + "magma_version": "magma-cuda113", + }, + "11.6": { + "pytorch_url": "cu116", + "magma_version": "magma-cuda116", + }, + "11.7": { + "pytorch_url": "cu117", + "magma_version": "magma-cuda117", + } +} + +def _nvcc_output_match(nvcc_output, target_cuda_version): + return False + +def prepare_cuda_env(cuda_version: str, dryrun=False): + assert cuda_version in CUDA_VERSION_MAP, f"Required CUDA version {cuda_version} doesn't exist in {CUDA_VERSION_MAP.keys()}." + env = os.environ.copy() + # step 1: setup CUDA path and environment variables + cuda_path = Path("/").joinpath("usr", "local", f"cuda-{cuda_version}") + assert cuda_path.exists() and cuda_path.is_dir(), f"Expected CUDA Library path {cuda_path} doesn't exist." + cuda_path_str = str(cuda_path.resolve()) + env["CUDA_ROOT"] = cuda_path_str + env["CUDA_HOME"] = cuda_path_str + env["PATH"] = f"{cuda_path_str}/bin:{env['PATH']}" + env["CMAKE_CUDA_COMPILER"] = str(cuda_path.joinpath('bin', 'nvcc').resolve()) + env["LD_LIBRARY_PATH"] = f"{cuda_path_str}/lib64:{cuda_path_str}/extras/CUPTI/lib64:{env['LD_LIBRARY_PATH']}" + if dryrun: + print(f"CUDA_HOME is set to {env['CUDA_HOME']}") + # step 2: test call to nvcc to confirm the version is correct + test_nvcc = ["nvcc", "--version"] + if dryrun: + print(f"Checking nvcc version, command {test_nvcc}") + else: + output = subprocess.check_output(test_nvcc) + assert _nvcc_output_match(output, cuda_version), f"Expected CUDA version {cuda_version}, getting nvcc test result {output}" + # step 3: install the correct magma version + install_magma_cmd = ["conda", "install", "-c", "pytorch", CUDA_VERSION_MAP[cuda_version]['magma_version']] + if dryrun: + print(f"Installing CUDA magma: {install_magma_cmd}") + subprocess.check_call(install_magma_cmd) + return env + +def install_pytorch_nightly(cuda_version: str, dryrun=False): + uninstall_torch_cmd = ["pip", "uninstall", "-y", "torch", "torchvision", "torchtext"] + if dryrun: + print(f"Uninstall pytorch: {uninstall_torch_cmd}") + else: + # uninstall multiple times to make sure the env is clean + for _loop in range(3): + subprocess.check_call(uninstall_torch_cmd) + pytorch_nightly_url = f"https://download.pytorch.org/whl/nightly/{CUDA_VERSION_MAP[cuda_version]['pytorch_url']}/torch_nightly.html" + install_torch_cmd = ["pip", "install", "--pre", "torch", "torchvision", "torchtext", "-f", pytorch_nightly_url] + if dryrun: + print(f"Install pytorch nightly: {install_torch_cmd}") + else: + subprocess.check_call(install_torch_cmd) From 3e039c3751704b96ad2990169319c1affcf77298 Mon Sep 17 00:00:00 2001 From: Xu Zhao Date: Tue, 30 Aug 2022 00:03:13 -0400 Subject: [PATCH 2/4] Support setting up CUDA versions in bisection script. --- .github/scripts/run-config.py | 2 +- bisection.py | 18 +++++++++++------- utils/cuda_utils.py | 13 ++++++++----- 3 files changed, 20 insertions(+), 13 deletions(-) diff --git a/.github/scripts/run-config.py b/.github/scripts/run-config.py index eb75ea2281..fead2329d7 100644 --- a/.github/scripts/run-config.py +++ b/.github/scripts/run-config.py @@ -121,7 +121,7 @@ def prepare_bmconfig_env(config: BenchmarkModelConfig, repo_path: Path, dryrun=F return os.environ.copy() cuda_version = config.cuda_version new_env = prepare_cuda_env(cuda_version=cuda_version) - install_pytorch_nightly(cuda_version=cuda_version, dryrun=dryrun) + install_pytorch_nightly(cuda_version=cuda_version, env=new_env, dryrun=dryrun) return new_env def run_bmconfig(config: BenchmarkModelConfig, repo_path: Path, output_path: Path, dryrun=False): diff --git a/bisection.py b/bisection.py index 1692a49cc3..e0e1430dd8 100644 --- a/bisection.py +++ b/bisection.py @@ -15,7 +15,6 @@ import shutil import yaml import argparse -import typing from tabulate import tabulate import re import subprocess @@ -136,6 +135,7 @@ class TorchSource: srcpath: str build_lazy: bool commits: List[Commit] + build_env: os._Environ # Map from commit SHA to index in commits commit_dict: Dict[str, int] def __init__(self, srcpath: str, build_lazy: bool): @@ -144,13 +144,14 @@ def __init__(self, srcpath: str, build_lazy: bool): self.commits = [] self.commit_dict = dict() - def prep(self) -> bool: + def prep(self, build_env: os._Environ) -> bool: repo_origin_url = gitutils.get_git_origin(self.srcpath) if not repo_origin_url == TORCH_GITREPO: print(f"WARNING: Unmatched repo origin url: {repo_origin_url} with standard {TORCH_GITREPO}") self.update_repos() # Clean up the existing packages self.cleanup() + self.build_env = build_env return True # Update pytorch, torchtext, and torchvision repo @@ -238,7 +239,7 @@ def build(self, commit: Commit): ctime = datetime.strptime(commit.ctime.split(" ")[0], "%Y-%m-%d") self.checkout_deps(ctime) # setup environment variables - build_env = self.setup_build_env(os.environ.copy()) + build_env = self.setup_build_env(self.build_env) # build pytorch print(f"Building pytorch commit {commit.sha} ...", end="", flush=True) # Check if version.py exists, if it does, remove it. @@ -279,6 +280,7 @@ class TorchBench: models: List[str] first_time: bool torch_src: TorchSource + bench_env: os._Environ def __init__(self, srcpath: str, torch_src: TorchSource, @@ -291,7 +293,8 @@ def __init__(self, srcpath: str, self.first_time = True self.models = list() - def prep(self) -> bool: + def prep(self, bench_env) -> bool: + self.bench_env = bench_env # Verify the code in srcpath is pytorch/benchmark repo_origin_url = gitutils.get_git_origin(self.srcpath) if not repo_origin_url == TORCHBENCH_GITREPO: @@ -306,7 +309,7 @@ def prep(self) -> bool: def _install_benchmark(self): "Install and build TorchBench dependencies" command = ["python", "install.py"] - subprocess.check_call(command, cwd=self.srcpath, shell=False) + subprocess.check_call(command, cwd=self.srcpath, env=self.bench_env, shell=False) def run_benchmark(self, commit: Commit, targets: List[str]) -> str: # Return the result json file path @@ -327,7 +330,7 @@ def run_benchmark(self, commit: Commit, targets: List[str]) -> str: print(f"Running TorchBench for commit: {commit.sha}, filter {bmfilter} ...", end="", flush=True) command = f"""bash .github/scripts/run.sh "{output_dir}" "{bmfilter}" 2>&1 | tee {output_dir}/benchmark.log""" try: - subprocess.check_call(command, cwd=self.srcpath, shell=True, timeout=self.timelimit * 60) + subprocess.check_call(command, cwd=self.srcpath, env=self.bench_env, shell=True, timeout=self.timelimit * 60) except subprocess.TimeoutExpired: print(f"Benchmark timeout for {commit.sha}. Result will be None.") return output_dir @@ -465,7 +468,8 @@ def regression(self, left: Commit, right: Commit, targets: List[str]) -> List[st return out def prep(self) -> bool: - if not self.torch_src.prep(): + base_build_env = prepare_cuda_env(cuda_version=DEFAULT_CUDA_VERSION) + if not self.torch_src.prep(base_build_env): return False if not self.torch_src.init_commits(self.start, self.end, self.abtest): return False diff --git a/utils/cuda_utils.py b/utils/cuda_utils.py index b243ef2c9d..9fccebcf40 100644 --- a/utils/cuda_utils.py +++ b/utils/cuda_utils.py @@ -1,4 +1,5 @@ import os +import re import subprocess from pathlib import Path @@ -18,7 +19,9 @@ } def _nvcc_output_match(nvcc_output, target_cuda_version): - return False + regex = 'release (.*),' + version = re.search(regex, nvcc_output).groups()[0] + return version == target_cuda_version def prepare_cuda_env(cuda_version: str, dryrun=False): assert cuda_version in CUDA_VERSION_MAP, f"Required CUDA version {cuda_version} doesn't exist in {CUDA_VERSION_MAP.keys()}." @@ -39,16 +42,16 @@ def prepare_cuda_env(cuda_version: str, dryrun=False): if dryrun: print(f"Checking nvcc version, command {test_nvcc}") else: - output = subprocess.check_output(test_nvcc) + output = subprocess.check_output(test_nvcc, stderr=subprocess.STDOUT, env=env).decode() assert _nvcc_output_match(output, cuda_version), f"Expected CUDA version {cuda_version}, getting nvcc test result {output}" # step 3: install the correct magma version install_magma_cmd = ["conda", "install", "-c", "pytorch", CUDA_VERSION_MAP[cuda_version]['magma_version']] if dryrun: print(f"Installing CUDA magma: {install_magma_cmd}") - subprocess.check_call(install_magma_cmd) + subprocess.check_call(install_magma_cmd, env=env) return env -def install_pytorch_nightly(cuda_version: str, dryrun=False): +def install_pytorch_nightly(cuda_version: str, env, dryrun=False): uninstall_torch_cmd = ["pip", "uninstall", "-y", "torch", "torchvision", "torchtext"] if dryrun: print(f"Uninstall pytorch: {uninstall_torch_cmd}") @@ -61,4 +64,4 @@ def install_pytorch_nightly(cuda_version: str, dryrun=False): if dryrun: print(f"Install pytorch nightly: {install_torch_cmd}") else: - subprocess.check_call(install_torch_cmd) + subprocess.check_call(install_torch_cmd, env=env) From 6f72d99c4011379215291293d27cf47bb841383f Mon Sep 17 00:00:00 2001 From: Xu Zhao Date: Tue, 30 Aug 2022 00:18:03 -0400 Subject: [PATCH 3/4] Fix a small bug --- bisection.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bisection.py b/bisection.py index e0e1430dd8..8e972245a8 100644 --- a/bisection.py +++ b/bisection.py @@ -473,7 +473,7 @@ def prep(self) -> bool: return False if not self.torch_src.init_commits(self.start, self.end, self.abtest): return False - if not self.bench.prep(): + if not self.bench.prep(base_build_env): return False left_commit = self.torch_src.commits[0] right_commit = self.torch_src.commits[-1] From 673f2640878dd9ecb199b5f8a267719dfd5d5f8e Mon Sep 17 00:00:00 2001 From: Xu Zhao Date: Tue, 30 Aug 2022 13:48:05 -0400 Subject: [PATCH 4/4] Add abtest run --- .github/scripts/abtest.py | 26 +++++++++++++++----------- bisection.py | 5 +---- utils/cuda_utils.py | 4 ++++ 3 files changed, 20 insertions(+), 15 deletions(-) diff --git a/.github/scripts/abtest.py b/.github/scripts/abtest.py index 86653785d1..285bdd5715 100644 --- a/.github/scripts/abtest.py +++ b/.github/scripts/abtest.py @@ -14,6 +14,7 @@ with add_path(REPO_ROOT): import torchbenchmark.util.gitutils as gitutils from userbenchmark import list_userbenchmarks + from utils.cuda_utils import prepare_cuda_env, DEFAULT_CUDA_VERSION USERBENCHMARK_OUTPUT_PATH = os.path.join(REPO_ROOT, ".userbenchmark") # only preserve the first 10 chars of the git hash @@ -28,14 +29,14 @@ def cleanup(): subprocess.check_call(command, shell=False) print("done") -def run_commit(repo_path: str, commit: str, bm_name: str, skip_build: bool=False) -> Path: +def run_commit(repo_path: str, env: os._Environ, commit: str, bm_name: str, skip_build: bool=False) -> Path: "Run the userbenchmark on the commit. Return the metrics output file path." # build the pytorch commit if required if not skip_build: cleanup() - build_pytorch_commit(repo_path, commit) + build_pytorch_commit(repo_path, commit, cuda_env=env) # run_benchmark - return run_benchmark(bm_name) + return run_benchmark(bm_name, cuda_env=env) def validate_benchmark_output(bm_output: Path, bm_name: str): with open(bm_output, "r") as bmobj: @@ -45,7 +46,7 @@ def validate_benchmark_output(bm_output: Path, bm_name: str): f"Missing pytorch git version in {bm_output}." assert "metrics" in output, f"Missing definition of metrics in {bm_output}." -def run_benchmark(bm_name: str) -> Path: +def run_benchmark(bm_name: str, cuda_env: os._Environ) -> Path: def find_latest_output(p: str) -> Optional[Path]: if not os.path.exists(p) or not os.path.isdir(p): return None @@ -55,7 +56,7 @@ def find_latest_output(p: str) -> Optional[Path]: return json_files[-1] command = [sys.executable, "run_benchmark.py", bm_name] try: - subprocess.check_call(command, cwd=REPO_ROOT, shell=False) + subprocess.check_call(command, env=cuda_env, cwd=REPO_ROOT, shell=False) except subprocess.CalledProcessError as e: print(f"Failed to call userbenchmark {command}. Error: {e}") sys.exit(1) @@ -78,7 +79,7 @@ def setup_build_env(env) -> Dict[str, str]: env["CMAKE_PREFIX_PATH"] = env["CONDA_PREFIX"] return env -def build_pytorch_commit(repo_path: str, commit: str): +def build_pytorch_commit(repo_path: str, commit: str, cuda_env: os._Environ): # checkout pytorch commit print(f"Checking out pytorch commit {commit} ...", end="", flush=True) if not gitutils.checkout_git_commit(repo_path, commit): @@ -95,16 +96,17 @@ def build_pytorch_commit(repo_path: str, commit: str): # some packages are not included in the wheel, so use `develop`, not `install` command = ["python", "setup.py", "develop"] # setup environment variables - build_env = setup_build_env(os.environ.copy()) + build_env = setup_build_env(cuda_env) subprocess.check_call(command, cwd=repo_path, env=build_env, shell=False) - command_testbuild = ["python", "-c", "'import torch'"] - subprocess.check_call(command_testbuild, cwd=os.environ["HOME"], env=build_env, shell=False) except subprocess.CalledProcessError: # If failed, remove the build directory, then try again build_path = os.path.join(repo_path, "build") if os.path.exists(build_path): shutil.rmtree(build_path) subprocess.check_call(command, cwd=repo_path, env=build_env, shell=False) + finally: + command_testbuild = ["python", "-c", "'import torch'"] + subprocess.check_call(command_testbuild, cwd=os.environ["HOME"], env=build_env, shell=False) print("done") def process_test_result(result_a: Path, result_b: Path, output_dir: str) -> str: @@ -154,7 +156,9 @@ def validate_results(a, b) -> bool: assert Path(args.pytorch_repo).is_dir(), f"Specified PyTorch repo dir {args.pytorch_repo} doesn't exist." commits = gitutils.get_git_commits(args.pytorch_repo, args.base, args.head) assert commits, f"Can't find git commit {args.base} or {args.head} in repo {args.pytorch_repo}" - result_a = run_commit(args.pytorch_repo, args.base, args.userbenchmark, args.skip_build) - result_b = run_commit(args.pytorch_repo, args.head, args.userbenchmark, args.skip_build) + # setup cuda environment + cuda_env = prepare_cuda_env(cuda_version=DEFAULT_CUDA_VERSION) + result_a = run_commit(args.pytorch_repo, cuda_env, args.base, args.userbenchmark, args.skip_build) + result_b = run_commit(args.pytorch_repo, cuda_env, args.head, args.userbenchmark, args.skip_build) compare_result = process_test_result(result_a, result_b, args.output_dir) print(compare_result) diff --git a/bisection.py b/bisection.py index 8e972245a8..79ca7487d2 100644 --- a/bisection.py +++ b/bisection.py @@ -22,10 +22,7 @@ from typing import Optional, List, Dict, Tuple from torchbenchmark.util import gitutils -from utils.cuda_utils import prepare_cuda_env - -# defines the default CUDA version to compile against -DEFAULT_CUDA_VERSION = "11.6" +from utils.cuda_utils import prepare_cuda_env, DEFAULT_CUDA_VERSION TORCH_GITREPO="https://github.com/pytorch/pytorch.git" TORCHBENCH_GITREPO="https://github.com/pytorch/benchmark.git" diff --git a/utils/cuda_utils.py b/utils/cuda_utils.py index 9fccebcf40..7ded084ce7 100644 --- a/utils/cuda_utils.py +++ b/utils/cuda_utils.py @@ -3,6 +3,9 @@ import subprocess from pathlib import Path +# defines the default CUDA version to compile against +DEFAULT_CUDA_VERSION = "11.6" + CUDA_VERSION_MAP = { "11.3": { "pytorch_url": "cu113", @@ -43,6 +46,7 @@ def prepare_cuda_env(cuda_version: str, dryrun=False): print(f"Checking nvcc version, command {test_nvcc}") else: output = subprocess.check_output(test_nvcc, stderr=subprocess.STDOUT, env=env).decode() + print(f"NVCC version output: {output}") assert _nvcc_output_match(output, cuda_version), f"Expected CUDA version {cuda_version}, getting nvcc test result {output}" # step 3: install the correct magma version install_magma_cmd = ["conda", "install", "-c", "pytorch", CUDA_VERSION_MAP[cuda_version]['magma_version']]