Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/benchmark-config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@ jobs:
exit 1
fi
# Install PyTorch nightly from pip
pip install --pre torch torchtext torchvision \
pip install --pre torch torchtext torchvision torchaudio \
-f https://download.pytorch.org/whl/nightly/${CUDA_VERSION}/torch_nightly.html
# make sure pytorch+cuda works
python -c "import torch; torch.cuda.init()"
Expand Down
22 changes: 20 additions & 2 deletions .github/workflows/pr-gha-runner.yml
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,9 @@ on:
workflow_dispatch:

env:
PYTHON_VERSION: "3.8"
PYTHON_VERSION: "3.10"
CUDA_VERSION: "cu116"
CONDA_ENV: "pr-test"
MAGMA_VERSION: "magma-cuda116"
SETUP_INSTANCE_SCRIPT: "/workspace/setup_instance.sh"

Expand All @@ -22,20 +23,37 @@ jobs:
sudo LD_LIBRARY_PATH=/usr/local/nvidia/lib64:$LD_LIBRARY_PATH nvidia-smi -pm 1
sudo LD_LIBRARY_PATH=/usr/local/nvidia/lib64:$LD_LIBRARY_PATH nvidia-smi -ac 1215,1410
nvidia-smi
- name: Setup Conda Env
run: |
. "${SETUP_INSTANCE_SCRIPT}"
conda create -n "${CONDA_ENV}" python="${PYTHON_VERSION}"
conda activate "${CONDA_ENV}"
conda install -y "${MAGMA_VERSION}" -c pytorch
conda install -y numpy requests ninja pyyaml setuptools gitpython beautifulsoup4 regex
conda install -y expecttest -c conda-forge
pip install unittest-xml-reporting
- name: Install PyTorch nightly
run: |
. "${SETUP_INSTANCE_SCRIPT}"
bash ./scripts/install_nightlies.sh
conda activate "${CONDA_ENV}"
pip install --pre torch torchvision torchtext torchaudio -f https://download.pytorch.org/whl/nightly/cu116/torch_nightly.html
- name: Install TorchBench
run: |
. "${SETUP_INSTANCE_SCRIPT}"
conda activate "${CONDA_ENV}"
python install.py
- name: Validate benchmark components (Worker)
run: |
. "${SETUP_INSTANCE_SCRIPT}"
conda activate "${CONDA_ENV}"
python -m components.test.test_subprocess
python -m components.test.test_worker
- name: Validate benchmark components (Model)
run: |
. "${SETUP_INSTANCE_SCRIPT}"
conda activate "${CONDA_ENV}"
python test.py

concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
cancel-in-progress: true
2 changes: 1 addition & 1 deletion .github/workflows/pr-gpu-stability-ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ jobs:
conda install -y numpy requests=2.22 ninja pyyaml mkl mkl-include setuptools \
cmake cffi typing_extensions future six dataclasses tabulate gitpython git-lfs
# Install pytorch nightly
pip install --pre torch torchtext torchvision \
pip install --pre torch torchtext torchvision torchaudio \
-f https://download.pytorch.org/whl/nightly/${CUDA_VERSION}/torch_nightly.html
# Install torchbench dependencies
python install.py
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/userbenchmark-t4-metal.yml
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ jobs:
exit 1
fi
# Install PyTorch and torchvision nightly from pip
pip install --pre torch torchvision torchtext \
pip install --pre torch torchvision torchtext torchaudio \
-f https://download.pytorch.org/whl/nightly/${CUDA_VERSION}/torch_nightly.html
# make sure pytorch+cuda works
python -c "import torch; torch.cuda.init()"
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/v1-nightly.yml
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ jobs:
exit 1
fi
# Install PyTorch nightly from pip
pip install --pre torch torchtext torchvision \
pip install --pre torch torchtext torchvision torchaudio \
-f https://download.pytorch.org/whl/nightly/${CUDA_VERSION}/torch_nightly.html
- name: Install other TorchBench dependencies
run: |
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/v2-nightly.yml
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ jobs:
# Install magma
conda install -y -c pytorch "${MAGMA_VERSION}"
# Install PyTorch nightly from pip
pip install --pre torch torchtext torchvision \
pip install --pre torch torchtext torchvision torchaudio \
-f https://download.pytorch.org/whl/nightly/${CUDA_VERSION}/torch_nightly.html
- name: Install other TorchBench dependencies
run: |
Expand Down
13 changes: 9 additions & 4 deletions bisection.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
"""bisection.py
Runs bisection to determine PRs that cause performance change.
It assumes that the pytorch, torchbench, torchtext and torchvision repositories provided are all clean with the latest code.
By default, the torchvision and torchtext package version will be fixed to the latest commit on the pytorch commit date.
It assumes that the pytorch, torchbench, torchtext, torchvision, and torchaudio repositories provided are all clean with the latest code.
By default, the torchaudio, torchvision and torchtext packages will be pinned to their latest commits on the date of the pytorch commit.

Usage:
python bisection.py --work-dir <WORK-DIR> \
Expand Down Expand Up @@ -29,6 +29,7 @@
TORCHBENCH_DEPS = {
"torchtext": (os.path.expandvars("${HOME}/text"), "main"),
"torchvision": (os.path.expandvars("${HOME}/vision"), "main"),
"torchaudio": (os.path.expandvars("${HOME}/audio"), "main"),
}

def exist_dir_path(string):
Expand Down Expand Up @@ -151,7 +152,7 @@ def prep(self, build_env: os._Environ) -> bool:
self.build_env = build_env
return True

# Update pytorch, torchtext, and torchvision repo
# Update the pytorch, torchtext, torchvision, and torchaudio repos
def update_repos(self):
repos = [(self.srcpath, "master")]
repos.extend(TORCHBENCH_DEPS.values())
Expand Down Expand Up @@ -215,6 +216,10 @@ def build_install_deps(self, build_env):
print(f"Building torchtext ...", end="", flush=True)
command = "python setup.py clean install"
subprocess.check_call(command, cwd=TORCHBENCH_DEPS["torchtext"][0], env=build_env, shell=True)
# Build torchaudio
print(f"Building torchaudio ...", end="", flush=True)
command = "python setup.py clean install"
subprocess.check_call(command, cwd=TORCHBENCH_DEPS["torchaudio"][0], env=build_env, shell=True)
print("done")

def _build_lazy_tensor(self, commit: Commit, build_env: Dict[str, str]):
Expand Down Expand Up @@ -261,7 +266,7 @@ def build(self, commit: Commit):
self.build_install_deps(build_env)

def cleanup(self):
packages = ["torch", "torchtext", "torchvision"]
packages = ["torch", "torchtext", "torchvision", "torchaudio"]
CLEANUP_ROUND = 5
# Clean up multiple times to make sure the packages are all uninstalled
for _ in range(CLEANUP_ROUND):
Expand Down
26 changes: 19 additions & 7 deletions components/_impl/workers/subprocess_rpc.py
Original file line number Diff line number Diff line change
Expand Up @@ -358,16 +358,28 @@ def from_exception(e: Exception, tb: types.TracebackType) -> "SerializedExceptio
"""
try:
print_file = io.StringIO()
traceback.print_exception(
etype=type(e),
value=e,
tb=tb,
file=print_file,
)
python_vinfo = sys.version_info
if python_vinfo.major == 3 and python_vinfo.minor < 10:
# Starting from Python 3.10, traceback renames the `etype` parameter to `exc`
# and makes it positional-only.
# doc: https://docs.python.org/3/library/traceback.html#traceback.print_exception
traceback.print_exception(
etype=type(e),
value=e,
tb=tb,
file=print_file,
)
else:
traceback.print_exception(
type(e),
value=e,
tb=tb,
file=print_file,
)
print_file.seek(0)
traceback_print: str = print_file.read()

except Exception:
except Exception as e:
traceback_print = textwrap.dedent("""
Traceback
Failed to extract traceback from worker. This is not expected.
Expand Down
4 changes: 2 additions & 2 deletions docker/gcp-a100-runner-dind.dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -45,11 +45,11 @@ RUN wget -q https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.s
chmod +x Miniconda3-latest-Linux-x86_64.sh && \
bash ./Miniconda3-latest-Linux-x86_64.sh -b -u

# Use Python 3.8 as default
# Use Python 3.10 as default
RUN . ${HOME}/miniconda3/etc/profile.d/conda.sh && \
conda activate base && \
conda init && \
conda install -y python=3.8 && \
conda install -y python=3.10 && \
pip install unittest-xml-reporting pyyaml

RUN echo "\
Expand Down
12 changes: 9 additions & 3 deletions scripts/install_nightlies.sh
Original file line number Diff line number Diff line change
@@ -1,14 +1,20 @@
#!/bin/bash
set -e
set -ex

. ~/miniconda3/etc/profile.d/conda.sh
conda activate base

if [[ -z "${CONDA_ENV}" ]]; then
conda activate base
else
conda activate "${CONDA_ENV}"
fi

conda install -y numpy requests ninja pyyaml setuptools gitpython beautifulsoup4 regex
conda install -y -c pytorch magma-cuda116

# install the most recent successfully built pytorch packages
python torchbenchmark/util/torch_nightly.py --install-nightlies --packages torch torchvision torchtext
# torchaudio is required by fairseq/fambench_xlmr
pip install --pre torch torchvision torchtext torchaudio -f https://download.pytorch.org/whl/nightly/cu116/torch_nightly.html

conda install -y expecttest -c conda-forge

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,12 +14,12 @@ def preprocess():
multi30k_data_dir = os.path.join(current_dir.parent.parent, "data", ".data", "multi30k")
root = os.path.join(str(Path(__file__).parent), ".data")
os.makedirs(root, exist_ok=True)
subprocess.check_call([sys.executable, 'preprocess.py', '-lang_src', 'de', '-lang_trg', 'en', '-share_vocab',
subprocess.check_call([sys.executable, 'preprocess.py', '-lang_src', 'de_core_news_sm', '-lang_trg', 'en_core_web_sm', '-share_vocab',
'-save_data', os.path.join(root, 'm30k_deen_shr.pkl'), '-data_path', multi30k_data_dir])

if __name__ == '__main__':
pip_install_requirements()
spacy_download('en')
spacy_download('de')
spacy_download('en_core_web_sm')
spacy_download('de_core_news_sm')
# Preprocessed pkl is larger than 100MB so we cannot skip preprocess
preprocess()
Original file line number Diff line number Diff line change
Expand Up @@ -266,7 +266,7 @@ def main_wo_bpe():
Usage: python preprocess.py -lang_src de -lang_trg en -save_data multi30k_de_en.pkl -share_vocab
'''

spacy_support_langs = ['de', 'el', 'en', 'es', 'fr', 'it', 'lt', 'nb', 'nl', 'pt']
spacy_support_langs = ['de_core_news_sm', 'el_core_news_sm', 'en_core_web_sm', 'es_core_news_sm', 'fr_core_news_sm', 'it_core_news_sm', 'lt_core_news_sm', 'nb_core_news_sm', 'nl_core_news_sm', 'pt_core_news_sm']

parser = argparse.ArgumentParser()
parser.add_argument('-lang_src', required=True, choices=spacy_support_langs)
Expand Down Expand Up @@ -309,16 +309,19 @@ def tokenize_trg(text):
MIN_FREQ = opt.min_word_count

if not all([opt.data_src, opt.data_trg]):
assert {opt.lang_src, opt.lang_trg} == {'de', 'en'}
assert {opt.lang_src, opt.lang_trg} == {'de_core_news_sm', 'en_core_web_sm'}
else:
# Pack custom txt file into example datasets
raise NotImplementedError

def filter_examples_with_length(x):
# Predicate passed as `filter_pred` to Multi30k.splits: accept an example only
# when both its 'src' and 'trg' attributes have length at most MAX_LEN.
return len(vars(x)['src']) <= MAX_LEN and len(vars(x)['trg']) <= MAX_LEN

def get_short_lang(full_lang):
# Return the text before the first underscore, e.g. 'de_core_news_sm' -> 'de'.
# Used to build the dataset file extensions from the full spaCy model names.
return full_lang.split('_')[0]

train, val, test = Multi30k.splits(
exts = ('.' + opt.lang_src, '.' + opt.lang_trg),
exts = ('.' + get_short_lang(opt.lang_src), '.' + get_short_lang(opt.lang_trg)),
fields = (SRC, TRG),
filter_pred=filter_examples_with_length,
path=opt.data_path)
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
dill==0.3.4
dill==0.3.5.1
tqdm
iopath
numpy
spacy==2.3.5
spacy
10 changes: 9 additions & 1 deletion torchbenchmark/models/fambench_xlmr/install.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,15 @@ def update_fambench_submodule():
subprocess.check_call(update_command, cwd=REPO_PATH)

def pip_install_requirements():
# Install this model's Python dependencies with pip, then fairseq 0.12.2.
subprocess.check_call([sys.executable, '-m', 'pip', 'install', '-q', '-r', 'requirements.txt'])
try:
subprocess.check_call([sys.executable, '-m', 'pip', 'install', '-q', '-r', 'requirements.txt'])
# pin fairseq version to 0.12.2
# --no-deps: install fairseq itself without installing its own dependencies
subprocess.check_call([sys.executable, '-m', 'pip', 'install', '--no-deps', 'fairseq==0.12.2'])
except subprocess.CalledProcessError:
# We ignore the ResolutionImpossible error because fairseq requires omegaconf < 2.1
# but detectron2 requires omegaconf >= 2.1
# NOTE(review): this handler swallows any pip failure from the two calls in the
# try block, and a failure of the first call skips the fairseq install entirely.
pass

if __name__ == "__main__":
update_fambench_submodule()
Expand Down
2 changes: 2 additions & 0 deletions torchbenchmark/models/fambench_xlmr/metadata.yaml
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
devices:
NVIDIA A100-SXM4-40GB:
eval_batch_size: 64
cpu:
eval_batch_size: 4
eval_benchmark: false
eval_deterministic: false
eval_nograd: true
Expand Down
8 changes: 3 additions & 5 deletions torchbenchmark/models/fambench_xlmr/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,8 +1,6 @@
sacrebleu
bitarray
# pin fairseq version
fairseq==0.10.2
omegaconf==2.1.1
hydra-core==1.1.2
cffi
omegaconf
hydra-core
sentencepiece
xformers
4 changes: 2 additions & 2 deletions torchbenchmark/models/yolov3/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,12 +1,12 @@
# pip install -U -r requirements.txt
numpy
# opencv-python 4.5 requires numpy 1.8
opencv-python >= 4.1, < 4.5
opencv-python
matplotlib
pycocotools
tqdm
pillow
tensorboard >= 1.14
tensorboard

# Nvidia Apex (optional) for mixed precision training --------------------------
# git clone https://github.com/NVIDIA/apex && cd apex && pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" . --user && cd .. && rm -rf apex
Expand Down
9 changes: 6 additions & 3 deletions torchbenchmark/util/model.py
Original file line number Diff line number Diff line change
Expand Up @@ -169,9 +169,12 @@ def determine_batch_size(self, batch_size=None):
if not batch_size:
self.batch_size = self.DEFAULT_TRAIN_BSIZE if self.test == "train" else self.DEFAULT_EVAL_BSIZE
# use the per-device batch size suggestion from the metadata on CUDA and CPU inference tests
if self.test == "eval" and self.device == "cuda":
current_device_name = torch.cuda.get_device_name()
assert current_device_name, f"torch.cuda.get_device_name() returns None when device is set to cuda, please double check."
if self.test == "eval":
if self.device == "cuda":
current_device_name = torch.cuda.get_device_name()
assert current_device_name, f"torch.cuda.get_device_name() returns None when device is set to cuda, please double check."
elif self.device == "cpu":
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Curious what this change is for?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We now support specifying a smaller batch size for the CPU device. In the code, we try to use the same batch size as upstream. However, upstream batch sizes are often optimized for GPU, so for CPU inference tests we want to use a smaller batch size by default to save test time.

current_device_name = "cpu"
if self.metadata and "devices" in self.metadata and current_device_name in self.metadata["devices"]:
self.batch_size = self.metadata["devices"][current_device_name]["eval_batch_size"]
# If the model doesn't implement test or eval test
Expand Down
3 changes: 2 additions & 1 deletion utils/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,8 @@
from urllib import request
from typing import List, Dict

TORCH_DEPS = ['torch', 'torchvision', 'torchtext']
TORCH_DEPS = ['torch', 'torchvision', 'torchtext', 'torchaudio']

proxy_suggestion = "Unable to verify https connectivity, " \
"required for setup.\n" \
"Do you need to use a proxy?"
Expand Down
4 changes: 2 additions & 2 deletions utils/cuda_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,15 +56,15 @@ def prepare_cuda_env(cuda_version: str, dryrun=False):
return env

def install_pytorch_nightly(cuda_version: str, env, dryrun=False):
uninstall_torch_cmd = ["pip", "uninstall", "-y", "torch", "torchvision", "torchtext"]
uninstall_torch_cmd = ["pip", "uninstall", "-y", "torch", "torchvision", "torchtext", "torchaudio"]
if dryrun:
print(f"Uninstall pytorch: {uninstall_torch_cmd}")
else:
# uninstall multiple times to make sure the env is clean
for _loop in range(3):
subprocess.check_call(uninstall_torch_cmd)
pytorch_nightly_url = f"https://download.pytorch.org/whl/nightly/{CUDA_VERSION_MAP[cuda_version]['pytorch_url']}/torch_nightly.html"
install_torch_cmd = ["pip", "install", "--pre", "torch", "torchvision", "torchtext", "-f", pytorch_nightly_url]
install_torch_cmd = ["pip", "install", "--pre", "torch", "torchvision", "torchtext", "torchaudio", "-f", pytorch_nightly_url]
if dryrun:
print(f"Install pytorch nightly: {install_torch_cmd}")
else:
Expand Down