Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 15 additions & 11 deletions .github/scripts/abtest.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
with add_path(REPO_ROOT):
import torchbenchmark.util.gitutils as gitutils
from userbenchmark import list_userbenchmarks
from utils.cuda_utils import prepare_cuda_env, DEFAULT_CUDA_VERSION

USERBENCHMARK_OUTPUT_PATH = os.path.join(REPO_ROOT, ".userbenchmark")
# only preserve the first 10 chars of the git hash
Expand All @@ -28,14 +29,14 @@ def cleanup():
subprocess.check_call(command, shell=False)
print("done")

def run_commit(repo_path: str, commit: str, bm_name: str, skip_build: bool=False) -> Path:
def run_commit(repo_path: str, env: os._Environ, commit: str, bm_name: str, skip_build: bool=False) -> Path:
"Run the userbenchmark on the commit. Return the metrics output file path."
# build the pytorch commit if required
if not skip_build:
cleanup()
build_pytorch_commit(repo_path, commit)
build_pytorch_commit(repo_path, commit, cuda_env=env)
# run_benchmark
return run_benchmark(bm_name)
return run_benchmark(bm_name, cuda_env=env)

def validate_benchmark_output(bm_output: Path, bm_name: str):
with open(bm_output, "r") as bmobj:
Expand All @@ -45,7 +46,7 @@ def validate_benchmark_output(bm_output: Path, bm_name: str):
f"Missing pytorch git version in {bm_output}."
assert "metrics" in output, f"Missing definition of metrics in {bm_output}."

def run_benchmark(bm_name: str) -> Path:
def run_benchmark(bm_name: str, cuda_env: os._Environ) -> Path:
def find_latest_output(p: str) -> Optional[Path]:
if not os.path.exists(p) or not os.path.isdir(p):
return None
Expand All @@ -55,7 +56,7 @@ def find_latest_output(p: str) -> Optional[Path]:
return json_files[-1]
command = [sys.executable, "run_benchmark.py", bm_name]
try:
subprocess.check_call(command, cwd=REPO_ROOT, shell=False)
subprocess.check_call(command, env=cuda_env, cwd=REPO_ROOT, shell=False)
except subprocess.CalledProcessError as e:
print(f"Failed to call userbenchmark {command}. Error: {e}")
sys.exit(1)
Expand All @@ -78,7 +79,7 @@ def setup_build_env(env) -> Dict[str, str]:
env["CMAKE_PREFIX_PATH"] = env["CONDA_PREFIX"]
return env

def build_pytorch_commit(repo_path: str, commit: str):
def build_pytorch_commit(repo_path: str, commit: str, cuda_env: os._Environ):
# checkout pytorch commit
print(f"Checking out pytorch commit {commit} ...", end="", flush=True)
if not gitutils.checkout_git_commit(repo_path, commit):
Expand All @@ -95,16 +96,17 @@ def build_pytorch_commit(repo_path: str, commit: str):
# some packages are not included in the wheel, so use `develop`, not `install`
command = ["python", "setup.py", "develop"]
# setup environment variables
build_env = setup_build_env(os.environ.copy())
build_env = setup_build_env(cuda_env)
subprocess.check_call(command, cwd=repo_path, env=build_env, shell=False)
command_testbuild = ["python", "-c", "'import torch'"]
subprocess.check_call(command_testbuild, cwd=os.environ["HOME"], env=build_env, shell=False)
except subprocess.CalledProcessError:
# If failed, remove the build directory, then try again
build_path = os.path.join(repo_path, "build")
if os.path.exists(build_path):
shutil.rmtree(build_path)
subprocess.check_call(command, cwd=repo_path, env=build_env, shell=False)
finally:
command_testbuild = ["python", "-c", "'import torch'"]
subprocess.check_call(command_testbuild, cwd=os.environ["HOME"], env=build_env, shell=False)
print("done")

def process_test_result(result_a: Path, result_b: Path, output_dir: str) -> str:
Expand Down Expand Up @@ -154,7 +156,9 @@ def validate_results(a, b) -> bool:
assert Path(args.pytorch_repo).is_dir(), f"Specified PyTorch repo dir {args.pytorch_repo} doesn't exist."
commits = gitutils.get_git_commits(args.pytorch_repo, args.base, args.head)
assert commits, f"Can't find git commit {args.base} or {args.head} in repo {args.pytorch_repo}"
result_a = run_commit(args.pytorch_repo, args.base, args.userbenchmark, args.skip_build)
result_b = run_commit(args.pytorch_repo, args.head, args.userbenchmark, args.skip_build)
# setup cuda environment
cuda_env = prepare_cuda_env(cuda_version=DEFAULT_CUDA_VERSION)
result_a = run_commit(args.pytorch_repo, cuda_env, args.base, args.userbenchmark, args.skip_build)
result_b = run_commit(args.pytorch_repo, cuda_env, args.head, args.userbenchmark, args.skip_build)
compare_result = process_test_result(result_a, result_b, args.output_dir)
print(compare_result)
56 changes: 6 additions & 50 deletions .github/scripts/run-config.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,25 +16,12 @@

from bmutils import add_path
from bmutils.summarize import analyze_result

REPO_DIR = str(Path(__file__).parent.parent.parent.resolve())

with add_path(REPO_DIR):
from torchbenchmark import _list_model_paths

CUDA_VERSION_MAP = {
"11.3": {
"pytorch_url": "cu113",
"magma_version": "magma-cuda113",
},
"11.6": {
"pytorch_url": "cu116",
"magma_version": "magma-cuda116",
},
"11.7": {
"pytorch_url": "cu117",
"magma_version": "magma-cuda117",
}
}
from utils.cuda_utils import prepare_cuda_env, install_pytorch_nightly

@dataclass
class BenchmarkModelConfig:
Expand Down Expand Up @@ -130,43 +117,12 @@ def parse_bmconfigs(repo_path: Path, config_name: str) -> List[BenchmarkModelCon

def prepare_bmconfig_env(config: BenchmarkModelConfig, repo_path: Path, dryrun=False):
"""Prepare the correct cuda version environment for the benchmarking."""
env = os.environ
if not config.cuda_version:
return env
return os.environ.copy()
cuda_version = config.cuda_version
# step 1: setup CUDA path and environment variables
cuda_path = Path("/").joinpath("usr", "local", f"cuda-{cuda_version}")
assert cuda_path.exists() and cuda_path.is_dir(), f"Expected CUDA Library path {cuda_path} doesn't exist."
env["CUDA_ROOT"] = str(cuda_path)
env["CUDA_HOME"] = str(cuda_path)
env["PATH"] = f"{str(cuda_path)}/bin:{env['PATH']}"
env["LD_LIBRARY_PATH"] = f"{str(cuda_path)}/lib64:{str(cuda_path)}/extras/CUPTI/lib64:{env['LD_LIBRARY_PATH']}"
# step 2: test call nvcc to confirm the version
test_nvcc = ["nvcc", "--version"]
if not dryrun:
subprocess.check_call(test_nvcc)
# step 1: uninstall all pytorch packages
uninstall_torch_cmd = ["pip", "uninstall", "-y", "torch", "torchvision", "torchtext"]
print(f"Uninstall pytorch: {uninstall_torch_cmd}")
if not dryrun:
for _loop in range(3):
subprocess.check_call(uninstall_torch_cmd)
# step 2: install pytorch nightly with the correct cuda version
install_magma_cmd = ["conda", "install", "-c", "pytorch", CUDA_VERSION_MAP[cuda_version]['magma_version']]
print(f"Install magma: {install_magma_cmd}")
if not dryrun:
subprocess.check_call(install_magma_cmd)
pytorch_nightly_url = f"https://download.pytorch.org/whl/nightly/{CUDA_VERSION_MAP[cuda_version]['pytorch_url']}/torch_nightly.html"
install_torch_cmd = ["pip", "install", "--pre", "torch", "torchvision", "torchtext", "-f", pytorch_nightly_url]
print(f"Install pytorch nightly: {install_torch_cmd}")
if not dryrun:
subprocess.check_call(install_torch_cmd)
# step 3: install torchbench
install_torchbench_cmd = [sys.executable, "install.py"]
print(f"Install torchbench: {install_torchbench_cmd}")
if not dryrun:
subprocess.check_call(install_torchbench_cmd, cwd=repo_path)
return env
new_env = prepare_cuda_env(cuda_version=cuda_version)
install_pytorch_nightly(cuda_version=cuda_version, env=new_env, dryrun=dryrun)
return new_env

def run_bmconfig(config: BenchmarkModelConfig, repo_path: Path, output_path: Path, dryrun=False):
run_env = prepare_bmconfig_env(config, repo_path=repo_path, dryrun=dryrun)
Expand Down
21 changes: 13 additions & 8 deletions bisection.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,14 +15,14 @@
import shutil
import yaml
import argparse
import typing
from tabulate import tabulate
import re
import subprocess
from datetime import datetime
from typing import Optional, List, Dict, Tuple

from torchbenchmark.util import gitutils
from utils.cuda_utils import prepare_cuda_env, DEFAULT_CUDA_VERSION

TORCH_GITREPO="https://github.com/pytorch/pytorch.git"
TORCHBENCH_GITREPO="https://github.com/pytorch/benchmark.git"
Expand Down Expand Up @@ -132,6 +132,7 @@ class TorchSource:
srcpath: str
build_lazy: bool
commits: List[Commit]
build_env: os._Environ
# Map from commit SHA to index in commits
commit_dict: Dict[str, int]
def __init__(self, srcpath: str, build_lazy: bool):
Expand All @@ -140,13 +141,14 @@ def __init__(self, srcpath: str, build_lazy: bool):
self.commits = []
self.commit_dict = dict()

def prep(self) -> bool:
def prep(self, build_env: os._Environ) -> bool:
repo_origin_url = gitutils.get_git_origin(self.srcpath)
if not repo_origin_url == TORCH_GITREPO:
print(f"WARNING: Unmatched repo origin url: {repo_origin_url} with standard {TORCH_GITREPO}")
self.update_repos()
# Clean up the existing packages
self.cleanup()
self.build_env = build_env
return True

# Update pytorch, torchtext, and torchvision repo
Expand Down Expand Up @@ -234,7 +236,7 @@ def build(self, commit: Commit):
ctime = datetime.strptime(commit.ctime.split(" ")[0], "%Y-%m-%d")
self.checkout_deps(ctime)
# setup environment variables
build_env = self.setup_build_env(os.environ.copy())
build_env = self.setup_build_env(self.build_env)
# build pytorch
print(f"Building pytorch commit {commit.sha} ...", end="", flush=True)
# Check if version.py exists, if it does, remove it.
Expand Down Expand Up @@ -275,6 +277,7 @@ class TorchBench:
models: List[str]
first_time: bool
torch_src: TorchSource
bench_env: os._Environ

def __init__(self, srcpath: str,
torch_src: TorchSource,
Expand All @@ -287,7 +290,8 @@ def __init__(self, srcpath: str,
self.first_time = True
self.models = list()

def prep(self) -> bool:
def prep(self, bench_env) -> bool:
self.bench_env = bench_env
# Verify the code in srcpath is pytorch/benchmark
repo_origin_url = gitutils.get_git_origin(self.srcpath)
if not repo_origin_url == TORCHBENCH_GITREPO:
Expand All @@ -302,7 +306,7 @@ def prep(self) -> bool:
def _install_benchmark(self):
"Install and build TorchBench dependencies"
command = ["python", "install.py"]
subprocess.check_call(command, cwd=self.srcpath, shell=False)
subprocess.check_call(command, cwd=self.srcpath, env=self.bench_env, shell=False)

def run_benchmark(self, commit: Commit, targets: List[str]) -> str:
# Return the result json file path
Expand All @@ -323,7 +327,7 @@ def run_benchmark(self, commit: Commit, targets: List[str]) -> str:
print(f"Running TorchBench for commit: {commit.sha}, filter {bmfilter} ...", end="", flush=True)
command = f"""bash .github/scripts/run.sh "{output_dir}" "{bmfilter}" 2>&1 | tee {output_dir}/benchmark.log"""
try:
subprocess.check_call(command, cwd=self.srcpath, shell=True, timeout=self.timelimit * 60)
subprocess.check_call(command, cwd=self.srcpath, env=self.bench_env, shell=True, timeout=self.timelimit * 60)
except subprocess.TimeoutExpired:
print(f"Benchmark timeout for {commit.sha}. Result will be None.")
return output_dir
Expand Down Expand Up @@ -461,11 +465,12 @@ def regression(self, left: Commit, right: Commit, targets: List[str]) -> List[st
return out

def prep(self) -> bool:
if not self.torch_src.prep():
base_build_env = prepare_cuda_env(cuda_version=DEFAULT_CUDA_VERSION)
if not self.torch_src.prep(base_build_env):
return False
if not self.torch_src.init_commits(self.start, self.end, self.abtest):
return False
if not self.bench.prep():
if not self.bench.prep(base_build_env):
return False
left_commit = self.torch_src.commits[0]
right_commit = self.torch_src.commits[-1]
Expand Down
71 changes: 71 additions & 0 deletions utils/cuda_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
import os
import re
import subprocess
from pathlib import Path

# CUDA toolkit version that builds target when the caller does not pick one.
DEFAULT_CUDA_VERSION = "11.6"

# Per-CUDA-version package identifiers, keyed by the dotted version string:
#   "pytorch_url"   - directory component of the PyTorch nightly wheel index
#   "magma_version" - name of the matching magma conda package
# Both identifiers embed the version with its dot removed ("11.6" -> "116").
CUDA_VERSION_MAP = {
    version: {
        "pytorch_url": f"cu{version.replace('.', '')}",
        "magma_version": f"magma-cuda{version.replace('.', '')}",
    }
    for version in ("11.3", "11.6", "11.7")
}

def _nvcc_output_match(nvcc_output, target_cuda_version):
regex = 'release (.*),'
version = re.search(regex, nvcc_output).groups()[0]
return version == target_cuda_version

def prepare_cuda_env(cuda_version: str, dryrun=False):
    """Build a copy of the process environment that targets ``cuda_version``.

    The returned mapping is a copy of ``os.environ`` with ``CUDA_ROOT``,
    ``CUDA_HOME`` and ``CMAKE_CUDA_COMPILER`` pointing into
    ``/usr/local/cuda-<cuda_version>``, and with that toolkit's ``bin`` and
    library directories prepended to ``PATH`` and ``LD_LIBRARY_PATH``.
    ``os.environ`` itself is not modified.

    Args:
        cuda_version: Dotted CUDA version; must be a key of CUDA_VERSION_MAP.
        dryrun: When true, only print the nvcc check and the magma install
            command instead of executing them.

    Returns:
        The prepared environment mapping, suitable for ``subprocess`` ``env=``.

    Raises:
        AssertionError: If the version is unknown, the toolkit directory is
            missing, or nvcc reports a different release.
        subprocess.CalledProcessError: If nvcc or the conda install fails.
    """
    assert cuda_version in CUDA_VERSION_MAP, f"Required CUDA version {cuda_version} doesn't exist in {CUDA_VERSION_MAP.keys()}."
    env = os.environ.copy()
    # step 1: setup CUDA path and environment variables
    cuda_path = Path("/").joinpath("usr", "local", f"cuda-{cuda_version}")
    assert cuda_path.exists() and cuda_path.is_dir(), f"Expected CUDA Library path {cuda_path} doesn't exist."
    cuda_path_str = str(cuda_path.resolve())
    env["CUDA_ROOT"] = cuda_path_str
    env["CUDA_HOME"] = cuda_path_str
    env["CMAKE_CUDA_COMPILER"] = str(cuda_path.joinpath('bin', 'nvcc').resolve())
    # Prepend the toolkit directories to the search paths. The variables may
    # be unset (LD_LIBRARY_PATH commonly is); in that case use only the CUDA
    # entries rather than raising KeyError or leaving a trailing ":" behind,
    # which the loader would treat as an extra empty search entry.
    prepended = {
        "PATH": [f"{cuda_path_str}/bin"],
        "LD_LIBRARY_PATH": [f"{cuda_path_str}/lib64", f"{cuda_path_str}/extras/CUPTI/lib64"],
    }
    for key, entries in prepended.items():
        current = env.get(key)
        if current:
            entries.append(current)
        env[key] = ":".join(entries)
    if dryrun:
        print(f"CUDA_HOME is set to {env['CUDA_HOME']}")
    # step 2: test call to nvcc to confirm the version is correct
    test_nvcc = ["nvcc", "--version"]
    if dryrun:
        print(f"Checking nvcc version, command {test_nvcc}")
    else:
        output = subprocess.check_output(test_nvcc, stderr=subprocess.STDOUT, env=env).decode()
        print(f"NVCC version output: {output}")
        assert _nvcc_output_match(output, cuda_version), f"Expected CUDA version {cuda_version}, getting nvcc test result {output}"
    # step 3: install the correct magma version
    install_magma_cmd = ["conda", "install", "-c", "pytorch", CUDA_VERSION_MAP[cuda_version]['magma_version']]
    if dryrun:
        print(f"Installing CUDA magma: {install_magma_cmd}")
    else:
        # A dry run must not modify the conda environment, so the install is
        # executed only on the real path, like the nvcc check above.
        subprocess.check_call(install_magma_cmd, env=env)
    return env

def install_pytorch_nightly(cuda_version: str, env, dryrun=False):
    """Replace the installed torch packages with the nightly build for ``cuda_version``.

    Uninstalls ``torch``, ``torchvision`` and ``torchtext`` with pip, then
    installs their pre-release builds from the nightly wheel index selected
    by ``CUDA_VERSION_MAP[cuda_version]["pytorch_url"]``.

    Args:
        cuda_version: Dotted CUDA version; must be a key of CUDA_VERSION_MAP.
        env: Environment mapping passed to every pip subprocess.
        dryrun: When true, only print the commands instead of running them.

    Raises:
        KeyError: If ``cuda_version`` is not in CUDA_VERSION_MAP.
        subprocess.CalledProcessError: If a pip command fails.
    """
    uninstall_torch_cmd = ["pip", "uninstall", "-y", "torch", "torchvision", "torchtext"]
    if dryrun:
        print(f"Uninstall pytorch: {uninstall_torch_cmd}")
    else:
        # uninstall multiple times to make sure the env is clean
        for _ in range(3):
            # Run under the same environment as the install below so both
            # pip invocations resolve against the same PATH and settings.
            subprocess.check_call(uninstall_torch_cmd, env=env)
    pytorch_nightly_url = f"https://download.pytorch.org/whl/nightly/{CUDA_VERSION_MAP[cuda_version]['pytorch_url']}/torch_nightly.html"
    install_torch_cmd = ["pip", "install", "--pre", "torch", "torchvision", "torchtext", "-f", pytorch_nightly_url]
    if dryrun:
        print(f"Install pytorch nightly: {install_torch_cmd}")
    else:
        subprocess.check_call(install_torch_cmd, env=env)