diff --git a/.github/workflows/benchmark-config.yml b/.github/workflows/benchmark-config.yml index d4cb0c5ac2..ebdae70696 100644 --- a/.github/workflows/benchmark-config.yml +++ b/.github/workflows/benchmark-config.yml @@ -67,7 +67,7 @@ jobs: exit 1 fi # Install PyTorch nightly from pip - pip install --pre torch torchtext torchvision \ + pip install --pre torch torchtext torchvision torchaudio \ -f https://download.pytorch.org/whl/nightly/${CUDA_VERSION}/torch_nightly.html # make sure pytorch+cuda works python -c "import torch; torch.cuda.init()" diff --git a/.github/workflows/pr-gha-runner.yml b/.github/workflows/pr-gha-runner.yml index 995f876555..0e0e0a48fd 100644 --- a/.github/workflows/pr-gha-runner.yml +++ b/.github/workflows/pr-gha-runner.yml @@ -4,8 +4,9 @@ on: workflow_dispatch: env: - PYTHON_VERSION: "3.8" + PYTHON_VERSION: "3.10" CUDA_VERSION: "cu116" + CONDA_ENV: "pr-test" MAGMA_VERSION: "magma-cuda116" SETUP_INSTANCE_SCRIPT: "/workspace/setup_instance.sh" @@ -22,20 +23,37 @@ jobs: sudo LD_LIBRARY_PATH=/usr/local/nvidia/lib64:$LD_LIBRARY_PATH nvidia-smi -pm 1 sudo LD_LIBRARY_PATH=/usr/local/nvidia/lib64:$LD_LIBRARY_PATH nvidia-smi -ac 1215,1410 nvidia-smi + - name: Setup Conda Env + run: | + . "${SETUP_INSTANCE_SCRIPT}" + conda create -n "${CONDA_ENV}" python="${PYTHON_VERSION}" + conda activate "${CONDA_ENV}" + conda install -y "${MAGMA_VERSION}" -c pytorch + conda install -y numpy requests ninja pyyaml setuptools gitpython beautifulsoup4 regex + conda install -y expecttest -c conda-forge + pip install unittest-xml-reporting - name: Install PyTorch nightly run: | . "${SETUP_INSTANCE_SCRIPT}" - bash ./scripts/install_nightlies.sh + conda activate "${CONDA_ENV}" + pip install --pre torch torchvision torchtext torchaudio -f https://download.pytorch.org/whl/nightly/cu116/torch_nightly.html - name: Install TorchBench run: | . "${SETUP_INSTANCE_SCRIPT}" + conda activate "${CONDA_ENV}" python install.py - name: Validate benchmark components (Worker) run: | . 
"${SETUP_INSTANCE_SCRIPT}" + conda activate "${CONDA_ENV}" python -m components.test.test_subprocess python -m components.test.test_worker - name: Validate benchmark components (Model) run: | . "${SETUP_INSTANCE_SCRIPT}" + conda activate "${CONDA_ENV}" python test.py + +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} + cancel-in-progress: true diff --git a/.github/workflows/pr-gpu-stability-ci.yml b/.github/workflows/pr-gpu-stability-ci.yml index b638f4981a..62e6328d39 100644 --- a/.github/workflows/pr-gpu-stability-ci.yml +++ b/.github/workflows/pr-gpu-stability-ci.yml @@ -33,7 +33,7 @@ jobs: conda install -y numpy requests=2.22 ninja pyyaml mkl mkl-include setuptools \ cmake cffi typing_extensions future six dataclasses tabulate gitpython git-lfs # Install pytorch nightly - pip install --pre torch torchtext torchvision \ + pip install --pre torch torchtext torchvision torchaudio \ -f https://download.pytorch.org/whl/nightly/${CUDA_VERSION}/torch_nightly.html # Install torchbench dependencies python install.py diff --git a/.github/workflows/userbenchmark-t4-metal.yml b/.github/workflows/userbenchmark-t4-metal.yml index 36284f18ad..ed0a3370a8 100644 --- a/.github/workflows/userbenchmark-t4-metal.yml +++ b/.github/workflows/userbenchmark-t4-metal.yml @@ -47,7 +47,7 @@ jobs: exit 1 fi # Install PyTorch and torchvision nightly from pip - pip install --pre torch torchvision torchtext \ + pip install --pre torch torchvision torchtext torchaudio \ -f https://download.pytorch.org/whl/nightly/${CUDA_VERSION}/torch_nightly.html # make sure pytorch+cuda works python -c "import torch; torch.cuda.init()" diff --git a/.github/workflows/v1-nightly.yml b/.github/workflows/v1-nightly.yml index 8acd08ae89..ffbc1166c2 100644 --- a/.github/workflows/v1-nightly.yml +++ b/.github/workflows/v1-nightly.yml @@ -46,7 +46,7 @@ jobs: exit 1 fi # Install PyTorch nightly from pip - pip install 
--pre torch torchtext torchvision \ + pip install --pre torch torchtext torchvision torchaudio \ -f https://download.pytorch.org/whl/nightly/${CUDA_VERSION}/torch_nightly.html - name: Install other TorchBench dependencies run: | diff --git a/.github/workflows/v2-nightly.yml b/.github/workflows/v2-nightly.yml index 88e8f73cbb..6adc5eed86 100644 --- a/.github/workflows/v2-nightly.yml +++ b/.github/workflows/v2-nightly.yml @@ -48,7 +48,7 @@ jobs: # Install magma conda install -y -c pytorch "${MAGMA_VERSION}" # Install PyTorch nightly from pip - pip install --pre torch torchtext torchvision \ + pip install --pre torch torchtext torchvision torchaudio \ -f https://download.pytorch.org/whl/nightly/${CUDA_VERSION}/torch_nightly.html - name: Install other TorchBench dependencies run: | diff --git a/bisection.py b/bisection.py index 79ca7487d2..583f951759 100644 --- a/bisection.py +++ b/bisection.py @@ -1,7 +1,7 @@ """bisection.py Runs bisection to determine PRs that cause performance change. -It assumes that the pytorch, torchbench, torchtext and torchvision repositories provided are all clean with the latest code. -By default, the torchvision and torchtext package version will be fixed to the latest commit on the pytorch commit date. +It assumes that the pytorch, torchbench, torchtext, torchvision, and torchaudio repositories provided are all clean with the latest code. +By default, the torchaudio, torchvision and torchtext packages will be fixed to the latest commit on the same pytorch commit date. 
Usage: python bisection.py --work-dir \ @@ -29,6 +29,7 @@ TORCHBENCH_DEPS = { "torchtext": (os.path.expandvars("${HOME}/text"), "main"), "torchvision": (os.path.expandvars("${HOME}/vision"), "main"), + "torchaudio": (os.path.expandvars("${HOME}/audio"), "main"), } def exist_dir_path(string): @@ -151,7 +152,7 @@ def prep(self, build_env: os._Environ) -> bool: self.build_env = build_env return True - # Update pytorch, torchtext, and torchvision repo + # Update pytorch, torchtext, torchvision, and torchaudio repo def update_repos(self): repos = [(self.srcpath, "master")] repos.extend(TORCHBENCH_DEPS.values()) @@ -215,6 +216,10 @@ def build_install_deps(self, build_env): print(f"Building torchtext ...", end="", flush=True) command = "python setup.py clean install" subprocess.check_call(command, cwd=TORCHBENCH_DEPS["torchtext"][0], env=build_env, shell=True) + # Build torchaudio + print(f"Building torchaudio ...", end="", flush=True) + command = "python setup.py clean install" + subprocess.check_call(command, cwd=TORCHBENCH_DEPS["torchaudio"][0], env=build_env, shell=True) print("done") def _build_lazy_tensor(self, commit: Commit, build_env: Dict[str, str]): @@ -261,7 +266,7 @@ def build(self, commit: Commit): self.build_install_deps(build_env) def cleanup(self): - packages = ["torch", "torchtext", "torchvision"] + packages = ["torch", "torchtext", "torchvision", "torchaudio"] CLEANUP_ROUND = 5 # Clean up multiple times to make sure the packages are all uninstalled for _ in range(CLEANUP_ROUND): diff --git a/components/_impl/workers/subprocess_rpc.py b/components/_impl/workers/subprocess_rpc.py index aa7de23cc6..f3e0a15963 100644 --- a/components/_impl/workers/subprocess_rpc.py +++ b/components/_impl/workers/subprocess_rpc.py @@ -358,16 +358,28 @@ def from_exception(e: Exception, tb: types.TracebackType) -> "SerializedExceptio """ try: print_file = io.StringIO() - traceback.print_exception( - etype=type(e), - value=e, - tb=tb, - file=print_file, - ) + python_vinfo = 
sys.version_info + if python_vinfo.major == 3 and python_vinfo.minor < 10: + # Starting from Python 3.10, traceback renames the `etype` parameter to `exc` + # and makes it positional-only. + # doc: https://docs.python.org/3/library/traceback.html#traceback.print_exception + traceback.print_exception( + etype=type(e), + value=e, + tb=tb, + file=print_file, + ) + else: + traceback.print_exception( + type(e), + value=e, + tb=tb, + file=print_file, + ) print_file.seek(0) traceback_print: str = print_file.read() - except Exception: + except Exception as e: traceback_print = textwrap.dedent(""" Traceback Failed to extract traceback from worker. This is not expected. diff --git a/docker/gcp-a100-runner-dind.dockerfile b/docker/gcp-a100-runner-dind.dockerfile index 0d717b3aca..0119611c48 100644 --- a/docker/gcp-a100-runner-dind.dockerfile +++ b/docker/gcp-a100-runner-dind.dockerfile @@ -45,11 +45,11 @@ RUN wget -q https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.s chmod +x Miniconda3-latest-Linux-x86_64.sh && \ bash ./Miniconda3-latest-Linux-x86_64.sh -b -u -# Use Python 3.8 as default +# Use Python 3.10 as default RUN . ${HOME}/miniconda3/etc/profile.d/conda.sh && \ conda activate base && \ conda init && \ - conda install -y python=3.8 && \ + conda install -y python=3.10 && \ pip install unittest-xml-reporting pyyaml RUN echo "\ diff --git a/scripts/install_nightlies.sh b/scripts/install_nightlies.sh index 75b767d4e4..526d7581c4 100755 --- a/scripts/install_nightlies.sh +++ b/scripts/install_nightlies.sh @@ -1,14 +1,20 @@ #!/bin/bash -set -e +set -ex .
~/miniconda3/etc/profile.d/conda.sh -conda activate base + +if [[ -z "${CONDA_ENV}" ]]; then + conda activate base +else + conda activate "${CONDA_ENV}" +fi conda install -y numpy requests ninja pyyaml setuptools gitpython beautifulsoup4 regex conda install -y -c pytorch magma-cuda116 # install the most recent successfully built pytorch packages -python torchbenchmark/util/torch_nightly.py --install-nightlies --packages torch torchvision torchtext +# torchaudio is required by fairseq/fambench_xlmr +pip install --pre torch torchvision torchtext torchaudio -f https://download.pytorch.org/whl/nightly/cu116/torch_nightly.html conda install -y expecttest -c conda-forge diff --git a/torchbenchmark/models/attention_is_all_you_need_pytorch/install.py b/torchbenchmark/models/attention_is_all_you_need_pytorch/install.py index f9fe0058ef..c0b1f179f4 100644 --- a/torchbenchmark/models/attention_is_all_you_need_pytorch/install.py +++ b/torchbenchmark/models/attention_is_all_you_need_pytorch/install.py @@ -14,12 +14,12 @@ def preprocess(): multi30k_data_dir = os.path.join(current_dir.parent.parent, "data", ".data", "multi30k") root = os.path.join(str(Path(__file__).parent), ".data") os.makedirs(root, exist_ok=True) - subprocess.check_call([sys.executable, 'preprocess.py', '-lang_src', 'de', '-lang_trg', 'en', '-share_vocab', + subprocess.check_call([sys.executable, 'preprocess.py', '-lang_src', 'de_core_news_sm', '-lang_trg', 'en_core_web_sm', '-share_vocab', '-save_data', os.path.join(root, 'm30k_deen_shr.pkl'), '-data_path', multi30k_data_dir]) if __name__ == '__main__': pip_install_requirements() - spacy_download('en') - spacy_download('de') + spacy_download('en_core_web_sm') + spacy_download('de_core_news_sm') # Preprocessed pkl is larger than 100MB so we cannot skip preprocess preprocess() diff --git a/torchbenchmark/models/attention_is_all_you_need_pytorch/preprocess.py b/torchbenchmark/models/attention_is_all_you_need_pytorch/preprocess.py index f06eb51764..d2681ed370 
100644 --- a/torchbenchmark/models/attention_is_all_you_need_pytorch/preprocess.py +++ b/torchbenchmark/models/attention_is_all_you_need_pytorch/preprocess.py @@ -266,7 +266,7 @@ def main_wo_bpe(): Usage: python preprocess.py -lang_src de -lang_trg en -save_data multi30k_de_en.pkl -share_vocab ''' - spacy_support_langs = ['de', 'el', 'en', 'es', 'fr', 'it', 'lt', 'nb', 'nl', 'pt'] + spacy_support_langs = ['de_core_news_sm', 'el_core_news_sm', 'en_core_web_sm', 'es_core_news_sm', 'fr_core_news_sm', 'it_core_news_sm', 'lt_core_news_sm', 'nb_core_news_sm', 'nl_core_news_sm', 'pt_core_news_sm'] parser = argparse.ArgumentParser() parser.add_argument('-lang_src', required=True, choices=spacy_support_langs) @@ -309,7 +309,7 @@ def tokenize_trg(text): MIN_FREQ = opt.min_word_count if not all([opt.data_src, opt.data_trg]): - assert {opt.lang_src, opt.lang_trg} == {'de', 'en'} + assert {opt.lang_src, opt.lang_trg} == {'de_core_news_sm', 'en_core_web_sm'} else: # Pack custom txt file into example datasets raise NotImplementedError @@ -317,8 +317,11 @@ def tokenize_trg(text): def filter_examples_with_length(x): return len(vars(x)['src']) <= MAX_LEN and len(vars(x)['trg']) <= MAX_LEN + def get_short_lang(full_lang): + return full_lang.split('_')[0] + train, val, test = Multi30k.splits( - exts = ('.' + opt.lang_src, '.' + opt.lang_trg), + exts = ('.' + get_short_lang(opt.lang_src), '.' 
+ get_short_lang(opt.lang_trg)), fields = (SRC, TRG), filter_pred=filter_examples_with_length, path=opt.data_path) diff --git a/torchbenchmark/models/attention_is_all_you_need_pytorch/requirements.txt b/torchbenchmark/models/attention_is_all_you_need_pytorch/requirements.txt index be0172ac3b..44cb5b682a 100644 --- a/torchbenchmark/models/attention_is_all_you_need_pytorch/requirements.txt +++ b/torchbenchmark/models/attention_is_all_you_need_pytorch/requirements.txt @@ -1,5 +1,5 @@ -dill==0.3.4 +dill==0.3.5.1 tqdm iopath numpy -spacy==2.3.5 +spacy diff --git a/torchbenchmark/models/fambench_xlmr/install.py b/torchbenchmark/models/fambench_xlmr/install.py index a7e4df7bf3..c2593ecf3e 100644 --- a/torchbenchmark/models/fambench_xlmr/install.py +++ b/torchbenchmark/models/fambench_xlmr/install.py @@ -11,7 +11,15 @@ def update_fambench_submodule(): subprocess.check_call(update_command, cwd=REPO_PATH) def pip_install_requirements(): - subprocess.check_call([sys.executable, '-m', 'pip', 'install', '-q', '-r', 'requirements.txt']) + try: + subprocess.check_call([sys.executable, '-m', 'pip', 'install', '-q', '-r', 'requirements.txt']) + # pin fairseq version to 0.12.2 + # ignore deps specified in requirements.txt + subprocess.check_call([sys.executable, '-m', 'pip', 'install', '--no-deps', 'fairseq==0.12.2']) + except subprocess.CalledProcessError: + # We ignore the ResolutionImpossible error because fairseq requires omegaconf < 2.1 + # but detectron2 requires omegaconf >= 2.1 + pass if __name__ == "__main__": update_fambench_submodule() diff --git a/torchbenchmark/models/fambench_xlmr/metadata.yaml b/torchbenchmark/models/fambench_xlmr/metadata.yaml index 3ff263e6b7..fcf96d50db 100644 --- a/torchbenchmark/models/fambench_xlmr/metadata.yaml +++ b/torchbenchmark/models/fambench_xlmr/metadata.yaml @@ -1,6 +1,8 @@ devices: NVIDIA A100-SXM4-40GB: eval_batch_size: 64 + cpu: + eval_batch_size: 4 eval_benchmark: false eval_deterministic: false eval_nograd: true diff --git 
a/torchbenchmark/models/fambench_xlmr/requirements.txt b/torchbenchmark/models/fambench_xlmr/requirements.txt index da75cfcd12..4436495f06 100644 --- a/torchbenchmark/models/fambench_xlmr/requirements.txt +++ b/torchbenchmark/models/fambench_xlmr/requirements.txt @@ -1,8 +1,6 @@ sacrebleu bitarray -# pin fairseq version -fairseq==0.10.2 -omegaconf==2.1.1 -hydra-core==1.1.2 +cffi +omegaconf +hydra-core sentencepiece -xformers diff --git a/torchbenchmark/models/yolov3/requirements.txt b/torchbenchmark/models/yolov3/requirements.txt index e1f7622346..25f43971ee 100755 --- a/torchbenchmark/models/yolov3/requirements.txt +++ b/torchbenchmark/models/yolov3/requirements.txt @@ -1,12 +1,12 @@ # pip install -U -r requirements.txt numpy # opencv-python 4.5 requires numpy 1.8 -opencv-python >= 4.1, < 4.5 +opencv-python matplotlib pycocotools tqdm pillow -tensorboard >= 1.14 +tensorboard # Nvidia Apex (optional) for mixed precision training -------------------------- # git clone https://github.com/NVIDIA/apex && cd apex && pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" . --user && cd .. && rm -rf apex diff --git a/torchbenchmark/util/model.py b/torchbenchmark/util/model.py index 157483ce89..bdf919b90d 100644 --- a/torchbenchmark/util/model.py +++ b/torchbenchmark/util/model.py @@ -169,9 +169,12 @@ def determine_batch_size(self, batch_size=None): if not batch_size: self.batch_size = self.DEFAULT_TRAIN_BSIZE if self.test == "train" else self.DEFAULT_EVAL_BSIZE # use the device suggestion on CUDA inference tests - if self.test == "eval" and self.device == "cuda": - current_device_name = torch.cuda.get_device_name() - assert current_device_name, f"torch.cuda.get_device_name() returns None when device is set to cuda, please double check." 
+ if self.test == "eval": + if self.device == "cuda": + current_device_name = torch.cuda.get_device_name() + assert current_device_name, f"torch.cuda.get_device_name() returns None when device is set to cuda, please double check." + elif self.device == "cpu": + current_device_name = "cpu" if self.metadata and "devices" in self.metadata and current_device_name in self.metadata["devices"]: self.batch_size = self.metadata["devices"][current_device_name]["eval_batch_size"] # If the model doesn't implement test or eval test diff --git a/utils/__init__.py b/utils/__init__.py index 1f02075b18..6842af637a 100644 --- a/utils/__init__.py +++ b/utils/__init__.py @@ -2,7 +2,8 @@ from urllib import request from typing import List, Dict -TORCH_DEPS = ['torch', 'torchvision', 'torchtext'] +TORCH_DEPS = ['torch', 'torchvision', 'torchtext', 'torchaudio'] + proxy_suggestion = "Unable to verify https connectivity, " \ "required for setup.\n" \ "Do you need to use a proxy?" diff --git a/utils/cuda_utils.py b/utils/cuda_utils.py index 7ded084ce7..8901a9f0bf 100644 --- a/utils/cuda_utils.py +++ b/utils/cuda_utils.py @@ -56,7 +56,7 @@ def prepare_cuda_env(cuda_version: str, dryrun=False): return env def install_pytorch_nightly(cuda_version: str, env, dryrun=False): - uninstall_torch_cmd = ["pip", "uninstall", "-y", "torch", "torchvision", "torchtext"] + uninstall_torch_cmd = ["pip", "uninstall", "-y", "torch", "torchvision", "torchtext", "torchaudio"] if dryrun: print(f"Uninstall pytorch: {uninstall_torch_cmd}") else: @@ -64,7 +64,7 @@ def install_pytorch_nightly(cuda_version: str, env, dryrun=False): for _loop in range(3): subprocess.check_call(uninstall_torch_cmd) pytorch_nightly_url = f"https://download.pytorch.org/whl/nightly/{CUDA_VERSION_MAP[cuda_version]['pytorch_url']}/torch_nightly.html" - install_torch_cmd = ["pip", "install", "--pre", "torch", "torchvision", "torchtext", "-f", pytorch_nightly_url] + install_torch_cmd = ["pip", "install", "--pre", "torch", "torchvision", 
"torchtext", "torchaudio", "-f", pytorch_nightly_url] if dryrun: print(f"Install pytorch nightly: {install_torch_cmd}") else: