pytorch · xuzhao9 · Aug 30, 2022 · Aug 30, 2022 · Aug 30, 2022 · Aug 30, 2022
diff --git a/.github/scripts/abtest.py b/.github/scripts/abtest.py
@@ -14,6 +14,7 @@
 with add_path(REPO_ROOT):
     import torchbenchmark.util.gitutils as gitutils
     from userbenchmark import list_userbenchmarks
+    from utils.cuda_utils import prepare_cuda_env, DEFAULT_CUDA_VERSION
 
 USERBENCHMARK_OUTPUT_PATH = os.path.join(REPO_ROOT, ".userbenchmark")
 # only preserve the first 10 chars of the git hash
@@ -28,14 +29,14 @@ def cleanup():
         subprocess.check_call(command, shell=False)
     print("done")
 
-def run_commit(repo_path: str, commit: str, bm_name: str, skip_build: bool=False) -> Path:
+def run_commit(repo_path: str, env: Dict[str, str], commit: str, bm_name: str, skip_build: bool=False) -> Path:
     "Run the userbenchmark on the commit. Return the metrics output file path."
     # build the pytorch commit if required
     if not skip_build:
         cleanup()
-        build_pytorch_commit(repo_path, commit)
+        build_pytorch_commit(repo_path, commit, cuda_env=env)
     # run_benchmark
-    return run_benchmark(bm_name)
+    return run_benchmark(bm_name, cuda_env=env)
 
 def validate_benchmark_output(bm_output: Path, bm_name: str):
     with open(bm_output, "r") as bmobj:
@@ -45,7 +46,7 @@ def validate_benchmark_output(bm_output: Path, bm_name: str):
         f"Missing pytorch git version in {bm_output}."
     assert "metrics" in output, f"Missing definition of metrics in {bm_output}."
 
-def run_benchmark(bm_name: str) -> Path:
+def run_benchmark(bm_name: str, cuda_env: Dict[str, str]) -> Path:
     def find_latest_output(p: str) -> Optional[Path]:
         if not os.path.exists(p) or not os.path.isdir(p):
             return None
@@ -55,7 +56,7 @@ def find_latest_output(p: str) -> Optional[Path]:
         return json_files[-1]
     command = [sys.executable, "run_benchmark.py", bm_name]
     try:
-        subprocess.check_call(command, cwd=REPO_ROOT, shell=False)
+        subprocess.check_call(command, env=cuda_env, cwd=REPO_ROOT, shell=False)
     except subprocess.CalledProcessError as e:
         print(f"Failed to call userbenchmark {command}. Error: {e}")
         sys.exit(1)
@@ -78,7 +79,7 @@ def setup_build_env(env) -> Dict[str, str]:
     env["CMAKE_PREFIX_PATH"] = env["CONDA_PREFIX"]
     return env
 
-def build_pytorch_commit(repo_path: str, commit: str):
+def build_pytorch_commit(repo_path: str, commit: str, cuda_env: Dict[str, str]):
     # checkout pytorch commit
     print(f"Checking out pytorch commit {commit} ...", end="", flush=True)
     if not gitutils.checkout_git_commit(repo_path, commit):
@@ -95,16 +96,17 @@ def build_pytorch_commit(repo_path: str, commit: str):
         # some packages are not included in the wheel, so use `develop`, not `install`
         command = ["python", "setup.py", "develop"]
         # setup environment variables
-        build_env = setup_build_env(os.environ.copy())
+        build_env = setup_build_env(cuda_env.copy())  # copy: setup_build_env writes into the mapping it receives, and cuda_env is reused by the caller
         subprocess.check_call(command, cwd=repo_path, env=build_env, shell=False)
-        command_testbuild = ["python", "-c", "'import torch'"]
-        subprocess.check_call(command_testbuild, cwd=os.environ["HOME"], env=build_env, shell=False)
     except subprocess.CalledProcessError:
         # If failed, remove the build directory, then try again
         build_path = os.path.join(repo_path, "build")
         if os.path.exists(build_path):
             shutil.rmtree(build_path)
         subprocess.check_call(command, cwd=repo_path, env=build_env, shell=False)
+    finally:
+        command_testbuild = ["python", "-c", "import torch"]  # no inner quotes: with shell=False they would reach python and turn the import into a string literal
+        subprocess.check_call(command_testbuild, cwd=os.environ["HOME"], env=build_env, shell=False)
     print("done")
 def process_test_result(result_a: Path, result_b: Path, output_dir: str) -> str:
@@ -154,7 +156,9 @@ def validate_results(a, b) -> bool:
         assert Path(args.pytorch_repo).is_dir(), f"Specified PyTorch repo dir {args.pytorch_repo} doesn't exist."
         commits = gitutils.get_git_commits(args.pytorch_repo, args.base, args.head)
         assert commits, f"Can't find git commit {args.base} or {args.head} in repo {args.pytorch_repo}"
-    result_a = run_commit(args.pytorch_repo, args.base, args.userbenchmark, args.skip_build)
-    result_b = run_commit(args.pytorch_repo, args.head, args.userbenchmark, args.skip_build)
+    # setup cuda environment
+    cuda_env = prepare_cuda_env(cuda_version=DEFAULT_CUDA_VERSION)
+    result_a = run_commit(args.pytorch_repo, cuda_env, args.base, args.userbenchmark, args.skip_build)
+    result_b = run_commit(args.pytorch_repo, cuda_env, args.head, args.userbenchmark, args.skip_build)
     compare_result = process_test_result(result_a, result_b, args.output_dir)
     print(compare_result)
diff --git a/.github/scripts/run-config.py b/.github/scripts/run-config.py
@@ -16,25 +16,12 @@
 
 from bmutils import add_path
 from bmutils.summarize import analyze_result
+
 REPO_DIR = str(Path(__file__).parent.parent.parent.resolve())
 
 with add_path(REPO_DIR):
     from torchbenchmark import _list_model_paths
-
-CUDA_VERSION_MAP = {
-    "11.3": {
-        "pytorch_url": "cu113",
-        "magma_version": "magma-cuda113",
-    },
-    "11.6": {
-         "pytorch_url": "cu116",
-         "magma_version": "magma-cuda116",
-    },
-    "11.7": {
-         "pytorch_url": "cu117",
-         "magma_version": "magma-cuda117",
-    }
-}
+    from utils.cuda_utils import prepare_cuda_env, install_pytorch_nightly
 
 @dataclass
 class BenchmarkModelConfig:
@@ -130,43 +117,12 @@ def parse_bmconfigs(repo_path: Path, config_name: str) -> List[BenchmarkModelCon
 
 def prepare_bmconfig_env(config: BenchmarkModelConfig, repo_path: Path, dryrun=False):
     """Prepare the correct cuda version environment for the benchmarking."""
-    env = os.environ
     if not config.cuda_version:
-        return env
+        return os.environ.copy()
     cuda_version = config.cuda_version
-    # step 1: setup CUDA path and environment variables
-    cuda_path = Path("/").joinpath("usr", "local", f"cuda-{cuda_version}")
-    assert cuda_path.exists() and cuda_path.is_dir(), f"Expected CUDA Library path {cuda_path} doesn't exist."
-    env["CUDA_ROOT"] = str(cuda_path)
-    env["CUDA_HOME"] = str(cuda_path)
-    env["PATH"] = f"{str(cuda_path)}/bin:{env['PATH']}"
-    env["LD_LIBRARY_PATH"] = f"{str(cuda_path)}/lib64:{str(cuda_path)}/extras/CUPTI/lib64:{env['LD_LIBRARY_PATH']}"
-    # step 2: test call nvcc to confirm the version
-    test_nvcc = ["nvcc", "--version"]
-    if not dryrun:
-        subprocess.check_call(test_nvcc)
-    # step 1: uninstall all pytorch packages
-    uninstall_torch_cmd = ["pip", "uninstall", "-y", "torch", "torchvision", "torchtext"]
-    print(f"Uninstall pytorch: {uninstall_torch_cmd}")
-    if not dryrun:
-        for _loop in range(3):
-            subprocess.check_call(uninstall_torch_cmd)
-    # step 2: install pytorch nightly with the correct cuda version
-    install_magma_cmd = ["conda", "install", "-c", "pytorch", CUDA_VERSION_MAP[cuda_version]['magma_version']]
-    print(f"Install magma: {install_magma_cmd}")
-    if not dryrun:
-        subprocess.check_call(install_magma_cmd)
-    pytorch_nightly_url = f"https://download.pytorch.org/whl/nightly/{CUDA_VERSION_MAP[cuda_version]['pytorch_url']}/torch_nightly.html"
-    install_torch_cmd = ["pip", "install", "--pre", "torch", "torchvision", "torchtext", "-f",  pytorch_nightly_url]
-    print(f"Install pytorch nightly: {install_torch_cmd}")
-    if not dryrun:
-        subprocess.check_call(install_torch_cmd)
-    # step 3: install torchbench
-    install_torchbench_cmd = [sys.executable, "install.py"]
-    print(f"Install torchbench: {install_torchbench_cmd}")
-    if not dryrun:
-        subprocess.check_call(install_torchbench_cmd, cwd=repo_path)
-    return env
+    new_env = prepare_cuda_env(cuda_version=cuda_version, dryrun=dryrun)  # forward dryrun so the nvcc check is skipped, as the removed inline code did
+    install_pytorch_nightly(cuda_version=cuda_version, env=new_env, dryrun=dryrun)
+    return new_env
 
 def run_bmconfig(config: BenchmarkModelConfig, repo_path: Path, output_path: Path, dryrun=False):
     run_env = prepare_bmconfig_env(config, repo_path=repo_path, dryrun=dryrun)

diff --git a/bisection.py b/bisection.py
@@ -15,14 +15,14 @@
 import shutil
 import yaml
 import argparse
-import typing
 from tabulate import tabulate
 import re
 import subprocess
 from datetime import datetime
 from typing import Optional, List, Dict, Tuple
 
 from torchbenchmark.util import gitutils
+from utils.cuda_utils import prepare_cuda_env, DEFAULT_CUDA_VERSION
 
 TORCH_GITREPO="https://github.com/pytorch/pytorch.git"
 TORCHBENCH_GITREPO="https://github.com/pytorch/benchmark.git"
@@ -132,6 +132,7 @@ class TorchSource:
     srcpath: str
     build_lazy: bool
     commits: List[Commit]
+    build_env: Dict[str, str]  # environment for building pytorch; assigned in prep()
     # Map from commit SHA to index in commits
     commit_dict: Dict[str, int]
     def __init__(self, srcpath: str, build_lazy: bool):
@@ -140,13 +141,14 @@ def __init__(self, srcpath: str, build_lazy: bool):
         self.commits = []
         self.commit_dict = dict()
 
-    def prep(self) -> bool:
+    def prep(self, build_env: Dict[str, str]) -> bool:
         repo_origin_url = gitutils.get_git_origin(self.srcpath)
         if not repo_origin_url == TORCH_GITREPO:
             print(f"WARNING: Unmatched repo origin url: {repo_origin_url} with standard {TORCH_GITREPO}")
         self.update_repos()
         # Clean up the existing packages
         self.cleanup()
+        self.build_env = build_env
         return True
 
     # Update pytorch, torchtext, and torchvision repo
@@ -234,7 +236,7 @@ def build(self, commit: Commit):
         ctime = datetime.strptime(commit.ctime.split(" ")[0], "%Y-%m-%d")
         self.checkout_deps(ctime)
         # setup environment variables
-        build_env = self.setup_build_env(os.environ.copy())
+        build_env = self.setup_build_env(self.build_env.copy())  # pass a copy, as the replaced line did, in case setup_build_env writes into its argument
         # build pytorch
         print(f"Building pytorch commit {commit.sha} ...", end="", flush=True)
         # Check if version.py exists, if it does, remove it.
@@ -275,6 +277,7 @@ class TorchBench:
     models: List[str]
     first_time: bool
     torch_src: TorchSource
+    bench_env: Dict[str, str]  # environment for install.py and benchmark runs; assigned in prep()
 
     def __init__(self, srcpath: str,
                  torch_src: TorchSource,
@@ -287,7 +290,8 @@ def __init__(self, srcpath: str,
         self.first_time = True
         self.models = list()
 
-    def prep(self) -> bool:
+    def prep(self, bench_env: Dict[str, str]) -> bool:
+        self.bench_env = bench_env
         # Verify the code in srcpath is pytorch/benchmark
         repo_origin_url = gitutils.get_git_origin(self.srcpath)
         if not repo_origin_url == TORCHBENCH_GITREPO:
@@ -302,7 +306,7 @@ def prep(self) -> bool:
     def _install_benchmark(self):
         "Install and build TorchBench dependencies"
         command = ["python", "install.py"]
-        subprocess.check_call(command, cwd=self.srcpath, shell=False)
+        subprocess.check_call(command, cwd=self.srcpath, env=self.bench_env, shell=False)
 
     def run_benchmark(self, commit: Commit, targets: List[str]) -> str:
         # Return the result json file path
@@ -323,7 +327,7 @@ def run_benchmark(self, commit: Commit, targets: List[str]) -> str:
         print(f"Running TorchBench for commit: {commit.sha}, filter {bmfilter} ...", end="", flush=True)
         command = f"""bash .github/scripts/run.sh "{output_dir}" "{bmfilter}" 2>&1 | tee {output_dir}/benchmark.log"""
         try:
-            subprocess.check_call(command, cwd=self.srcpath, shell=True, timeout=self.timelimit * 60)
+            subprocess.check_call(command, cwd=self.srcpath, env=self.bench_env, shell=True, timeout=self.timelimit * 60)
         except subprocess.TimeoutExpired:
             print(f"Benchmark timeout for {commit.sha}. Result will be None.")
             return output_dir
@@ -461,11 +465,12 @@ def regression(self, left: Commit, right: Commit, targets: List[str]) -> List[st
         return out
 
     def prep(self) -> bool:
-        if not self.torch_src.prep():
+        base_build_env = prepare_cuda_env(cuda_version=DEFAULT_CUDA_VERSION)
+        if not self.torch_src.prep(base_build_env):
             return False
         if not self.torch_src.init_commits(self.start, self.end, self.abtest):
             return False
-        if not self.bench.prep():
+        if not self.bench.prep(base_build_env):
             return False
         left_commit = self.torch_src.commits[0]
         right_commit = self.torch_src.commits[-1]

diff --git a/utils/cuda_utils.py b/utils/cuda_utils.py
@@ -0,0 +1,105 @@
+"""Utilities for selecting a CUDA toolkit and installing matching PyTorch packages."""
+import os
+import re
+import subprocess
+from pathlib import Path
+from typing import Dict
+
+# defines the default CUDA version to compile against
+DEFAULT_CUDA_VERSION = "11.6"
+
+# per CUDA version: nightly wheel index directory and conda magma package name
+CUDA_VERSION_MAP = {
+    "11.3": {
+        "pytorch_url": "cu113",
+        "magma_version": "magma-cuda113",
+    },
+    "11.6": {
+         "pytorch_url": "cu116",
+         "magma_version": "magma-cuda116",
+    },
+    "11.7": {
+         "pytorch_url": "cu117",
+         "magma_version": "magma-cuda117",
+    }
+}
+
+def _nvcc_output_match(nvcc_output, target_cuda_version):
+    """Return True when the `release X,` field of nvcc_output equals target_cuda_version."""
+    # Stop at the first comma so later commas on the same line cannot widen the capture.
+    match = re.search(r"release ([^,]+),", nvcc_output)
+    # No such field means unexpected output: report a mismatch instead of raising AttributeError.
+    return match is not None and match.group(1) == target_cuda_version
+
+def _prepend_search_path(env, key, *entries):
+    """Prepend entries to the colon-separated variable env[key]; the variable may be unset or empty."""
+    existing = env.get(key)
+    # Drop an empty tail: a dangling ":" would add an empty entry to the search path.
+    parts = list(entries) + ([existing] if existing else [])
+    env[key] = ":".join(parts)
+
+def prepare_cuda_env(cuda_version: str, dryrun=False) -> Dict[str, str]:
+    """Return a copy of the process environment that points at /usr/local/cuda-<cuda_version>.
+
+    Args:
+        cuda_version: a key of CUDA_VERSION_MAP, e.g. "11.6".
+        dryrun: when True, print the nvcc check and magma install commands instead of running them.
+
+    Returns:
+        A new dict with the CUDA variables set; os.environ itself is not modified.
+
+    Raises:
+        AssertionError: unknown version, missing CUDA directory, or nvcc version mismatch.
+        subprocess.CalledProcessError: nvcc or conda exited with a non-zero status.
+    """
+    assert cuda_version in CUDA_VERSION_MAP, f"Required CUDA version {cuda_version} doesn't exist in {CUDA_VERSION_MAP.keys()}."
+    env = os.environ.copy()
+    # step 1: setup CUDA path and environment variables
+    cuda_path = Path("/").joinpath("usr", "local", f"cuda-{cuda_version}")
+    assert cuda_path.exists() and cuda_path.is_dir(), f"Expected CUDA Library path {cuda_path} doesn't exist."
+    cuda_path_str = str(cuda_path.resolve())
+    env["CUDA_ROOT"] = cuda_path_str
+    env["CUDA_HOME"] = cuda_path_str
+    _prepend_search_path(env, "PATH", f"{cuda_path_str}/bin")
+    env["CMAKE_CUDA_COMPILER"] = str(cuda_path.joinpath('bin', 'nvcc').resolve())
+    _prepend_search_path(env, "LD_LIBRARY_PATH", f"{cuda_path_str}/lib64", f"{cuda_path_str}/extras/CUPTI/lib64")
+    if dryrun:
+        print(f"CUDA_HOME is set to {env['CUDA_HOME']}")
+    # step 2: test call to nvcc to confirm the version is correct
+    test_nvcc = ["nvcc", "--version"]
+    if dryrun:
+        print(f"Checking nvcc version, command {test_nvcc}")
+    else:
+        output = subprocess.check_output(test_nvcc, stderr=subprocess.STDOUT, env=env).decode()
+        print(f"NVCC version output: {output}")
+        assert _nvcc_output_match(output, cuda_version), f"Expected CUDA version {cuda_version}, getting nvcc test result {output}"
+    # step 3: install the correct magma version
+    # "-y" pre-answers conda's confirmation prompt so unattended runs do not block on it
+    install_magma_cmd = ["conda", "install", "-y", "-c", "pytorch", CUDA_VERSION_MAP[cuda_version]['magma_version']]
+    if dryrun:
+        print(f"Installing CUDA magma: {install_magma_cmd}")
+    else:
+        subprocess.check_call(install_magma_cmd, env=env)
+    return env
+
+def install_pytorch_nightly(cuda_version: str, env, dryrun=False):
+    """Replace the installed torch/torchvision/torchtext with the nightly wheels for cuda_version.
+
+    Args:
+        cuda_version: a key of CUDA_VERSION_MAP selecting the nightly wheel index.
+        env: environment mapping passed to the pip subprocesses.
+        dryrun: when True, only print the pip commands.
+    """
+    uninstall_torch_cmd = ["pip", "uninstall", "-y", "torch", "torchvision", "torchtext"]
+    if dryrun:
+        print(f"Uninstall pytorch: {uninstall_torch_cmd}")
+    else:
+        # uninstall multiple times to make sure the env is clean
+        for _loop in range(3):
+            subprocess.check_call(uninstall_torch_cmd, env=env)
+    pytorch_nightly_url = f"https://download.pytorch.org/whl/nightly/{CUDA_VERSION_MAP[cuda_version]['pytorch_url']}/torch_nightly.html"
+    install_torch_cmd = ["pip", "install", "--pre", "torch", "torchvision", "torchtext", "-f",  pytorch_nightly_url]
+    if dryrun:
+        print(f"Install pytorch nightly: {install_torch_cmd}")
+    else:
+        subprocess.check_call(install_torch_cmd, env=env)