diff --git a/benchmarks/README.md b/benchmarks/README.md index 9476ecbcb50..fe4bdc309b4 100644 --- a/benchmarks/README.md +++ b/benchmarks/README.md @@ -77,7 +77,7 @@ Disable autoboost selecting clock rate based on thermal, and power budget effect Run the `experiment_runner.py` from the `pytorch` directory, which should be the parent of the `xla` directory. -The following example runs the alexnet benchmark on GPU through the +The following example runs the alexnet benchmark on CPU through the Pytorch/XLA-dynamo path and through the Inductor-dynamo with 5 repetitions each. The results will be stored in a json file (eg results.jsonl) in `experiment_results`. @@ -88,7 +88,7 @@ python xla/benchmarks/experiment_runner.py \ --xla=PJRT --xla=None \ --test=eval --test=train \ --suite-name=torchbench \ - --accelerator=cuda \ + --accelerator=cpu \ --output-dirname=experiment_results \ --repeat=5 \ --print-subprocess \ @@ -118,7 +118,7 @@ python xla/benchmarks/experiment_runner.py \ --suite-name=torchbench \ --progress-bar \ --model-config='{"model_name":"BERT_pytorch"}' \ - --experiment-config='{"accelerator":"cuda","xla":"PJRT","xla_flags":null,"dynamo":"openxla","torch_xla2":null,"test":"train","keep_model_data_on_cuda":false,"enable_functionalization":false}' \ + --experiment-config='{"accelerator":"cpu","xla":"PJRT","xla_flags":null,"dynamo":"openxla","torch_xla2":null,"test":"train","enable_functionalization":false}' \ --repeat 1 ``` @@ -135,13 +135,13 @@ works only for inference now. ``` cd pytorch -PJRT_DEVICE=CUDA python3 new_xla/benchmarks/experiment_runner.py \ +PJRT_DEVICE=CPU python3 new_xla/benchmarks/experiment_runner.py \ --xla=PJRT \ --dynamo=openxla \ --test=eval \ --filter=BERT_pytorch$ \ --suite-name=torchbench \ - --accelerator=cuda \ + --accelerator=cpu \ --progress-bar \ --output-dirname=/tmp/output \ --repeat=2 \ diff --git a/benchmarks/benchmark_experiment.py b/benchmarks/benchmark_experiment.py index e1fab48334a..daffdce2f7f 100644 --- a/benchmarks/benchmark_experiment.py +++ b/benchmarks/benchmark_experiment.py @@ -20,13 +20,12 @@ def list_experiment_configs(self): # Start with default config. config_choices = { - "accelerator": ["cpu", "cuda", "tpu"], + "accelerator": ["cpu", "tpu"], "xla": [None, "PJRT", "XRT"], "xla_flags": [None], "dynamo": [None, "inductor", "openxla"], "torch_xla2": [None], # options only apply to torch_xla2 "test": ["eval", "train"], - "keep_model_data_on_cuda": [False], "enable_functionalization": [False], } @@ -46,10 +45,6 @@ def list_experiment_configs(self): if self._args.xla_flags: config_choices["xla_flags"] = list( map(parse_none_str, set(self._args.xla_flags))) - if self._args.keep_model_data_on_cuda: - config_choices["keep_model_data_on_cuda"] = [ - self._args.keep_model_data_on_cuda - ] if self._args.enable_functionalization: config_choices["enable_functionalization"] = [ self._args.enable_functionalization @@ -85,7 +80,6 @@ def _is_available(self, cfg_xla = experiment_config["xla"] cfg_test = experiment_config["test"] cfg_torch_xla2 = experiment_config["torch_xla2"] - cfg_keep_model_data_on_cuda = experiment_config["keep_model_data_on_cuda"] # Check that dynamo refers to an existing backend. 
if cfg_dynamo is not None and cfg_dynamo not in dynamo.list_backends( @@ -118,16 +112,16 @@ def _is_available(self, if cfg_accelerator == "tpu": if cfg_xla is None: return False - elif cfg_accelerator in ("cpu", "cuda"): + elif cfg_accelerator == "cpu": if cfg_xla == "XRT": return False + elif cfg_accelerator == "cuda": + if cfg_xla is not None: + # PyTorch/XLA with CUDA backend is no longer supported. + return False else: raise NotImplementedError - # cfg_keep_model_data_on_cuda is only avaible when using dynamo - if cfg_keep_model_data_on_cuda and cfg_dynamo != "openxla": - return False - return True def load_experiment(self, @@ -140,7 +134,6 @@ def load_experiment(self, test = experiment_config["test"] batch_size = experiment_config.get("batch_size", self._args.batch_size) torch_xla2 = experiment_config["torch_xla2"] - keep_model_data_on_cuda = experiment_config["keep_model_data_on_cuda"] enable_functionalization = experiment_config["enable_functionalization"] return BenchmarkExperiment( accelerator=accelerator, @@ -148,7 +141,6 @@ def load_experiment(self, xla_flags=xla_flags, dynamo=dynamo, torch_xla2=torch_xla2, - keep_model_data_on_cuda=keep_model_data_on_cuda, test=test, batch_size=batch_size, enable_functionalization=enable_functionalization, @@ -159,14 +151,12 @@ class BenchmarkExperiment: def __init__(self, accelerator: str, xla: Optional[str], xla_flags: Optional[str], dynamo: str, torch_xla2: bool, - keep_model_data_on_cuda: bool, test: str, batch_size: str, - enable_functionalization: bool): + test: str, batch_size: str, enable_functionalization: bool): self.accelerator = accelerator self.xla = xla self.xla_flags = xla_flags self.dynamo = dynamo self.torch_xla2 = torch_xla2 - self.keep_model_data_on_cuda = keep_model_data_on_cuda self.test = test self.batch_size = batch_size self.accelerator_model = get_accelerator_model(self.accelerator) @@ -191,8 +181,6 @@ def update_process_env(self, process_env: Dict[str, str]): if is_xla_device_available("TPU"): process_env["TPU_NUM_DEVICES"] = "1" process_env["XRT_TPU_CONFIG"] = "localservice;0;localhost:51011" - elif is_xla_device_available("CUDA"): - process_env["GPU_NUM_DEVICES"] = "1" elif self.xla is None: # In non-xla CPU training experiments, an env var is still needed if an # xla device exists, or there will be "Missing XLA configuration" error. 
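
For reference, the accelerator/XLA compatibility rules that `_is_available` enforces after this change reduce to roughly the following sketch (simplified: the real method also validates the dynamo backend, torch_xla2, and the test mode; the function name here is illustrative only):

```python
# Simplified sketch of the accelerator/XLA compatibility check after this
# change; the actual _is_available() performs additional validation.
from typing import Optional

def accelerator_xla_compatible(accelerator: str, xla: Optional[str]) -> bool:
  if accelerator == "tpu":
    # TPU experiments must go through an XLA runtime.
    return xla is not None
  if accelerator == "cpu":
    # XRT is not supported on CPU; PJRT and plain (non-XLA) runs are fine.
    return xla != "XRT"
  if accelerator == "cuda":
    # PyTorch/XLA with the CUDA backend is no longer supported, so only
    # plain (non-XLA) CUDA runs (e.g. Inductor) remain valid.
    return xla is None
  raise NotImplementedError(accelerator)
```
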
@@ -246,7 +234,6 @@ def to_dict(self): d["xla_flags"] = self.xla_flags d["dynamo"] = self.dynamo d["torch_xla2"] = self.torch_xla2 - d["keep_model_data_on_cuda"] = self.keep_model_data_on_cuda d["test"] = self.test d["batch_size"] = self.batch_size d["enable_functionalization"] = self.enable_functionalization diff --git a/benchmarks/benchmark_model.py b/benchmarks/benchmark_model.py index 2b2f6c1957b..008a4539c7a 100644 --- a/benchmarks/benchmark_model.py +++ b/benchmarks/benchmark_model.py @@ -103,7 +103,6 @@ def prepare_for_experiment( else: raise NotImplementedError - keep_model_data_on_cuda = self.benchmark_experiment.keep_model_data_on_cuda if self.benchmark_experiment.torch_xla2: import torch_xla2.export import torch_xla2 @@ -125,7 +124,7 @@ def prepare_for_experiment( self.module = lambda *x: jax_func(weights, x) self.example_inputs = move_to_device( self.example_inputs, device, torch_xla2=True) - elif not keep_model_data_on_cuda: + else: self.module = self.module.to(self.device) self.example_inputs = move_to_device( self.example_inputs, self.device, torch_xla2=False) @@ -137,14 +136,6 @@ def prepare_for_experiment( logger.info(f"Running torch.compile with opts {compilation_opts}") self.model_iter_fn = torch.compile(self.model_iter_fn, **compilation_opts) - if keep_model_data_on_cuda: - - def assert_func(t): - assert t.device.type.lower( - ) == 'cuda', 'When keep_model_data_on_cuda is set, the input data should remain on the CUDA device.' - - pytree.tree_map_only(torch.Tensor, assert_func, self.example_inputs) - def pick_grad(self): if self.benchmark_experiment.test == "eval": return torch.no_grad() diff --git a/benchmarks/experiment_runner.py b/benchmarks/experiment_runner.py index b784af68e47..04a5524ad38 100644 --- a/benchmarks/experiment_runner.py +++ b/benchmarks/experiment_runner.py @@ -936,11 +936,6 @@ def __str__(self): help="""Collect CUDA and CPU times per operation. This will also gather CPU fallbacks.""", ) - parser.add_argument( - "--keep-model-data-on-cuda", - action="store_true", - help="""Whether to keep the model and data on CUDA and not to move to an XLA device. This is to be used with PyTorch/XLA dynamo. When set, PyTorch/XLA dynamo bridge move the model and data to the XLA device.""", - ) parser.add_argument( "--xla-flags", type=str, diff --git a/benchmarks/llama.py b/benchmarks/llama.py deleted file mode 100644 index 53e88ddb0d2..00000000000 --- a/benchmarks/llama.py +++ /dev/null @@ -1,269 +0,0 @@ -import argparse -import datetime -import logging -import json -import os -import re -import subprocess -import sys - -from enum import Enum - -logger = logging.getLogger(__name__) - - -def get_info_from_result_file(results_dir: str) -> tuple[str, str, float]: - results_file = os.path.join(results_dir, 'results.jsonl') - if not os.path.exists(results_file): - sys.exit(f"Results file {results_file} not found. 
" - "Please run experiment_runner.py first.") - accelerator_model = None - with open(results_file, 'r') as f: - first_line = f.readline() - acc_match = re.search(r'"accelerator_model": "([^"]+)"', first_line) - time_match = re.search(r'"timestamp": ([0-9.]+)', first_line) - if acc_match and time_match: - accelerator_model = acc_match.group(1) - timestamp = float(time_match.group(1)) - else: - sys.exit(f"Cannot find a timestamp and a matching accelerator " - "in {results_file}.") - logger.debug(f"Found accelerator_model='{accelerator_model}' and " - f"timestamp={timestamp} in {results_file}.") - return accelerator_model, timestamp - - -def set_up_llama_repo(workspace_dir: str) -> str: - llama_dir = os.path.join(workspace_dir, 'llama-inference') - if os.path.exists(llama_dir): - logger.debug(f'llama_dir={llama_dir} already exists; no setting up to do.') - return llama_dir - - logger.debug(f'Setting up llama repo at {llama_dir}.') - subprocess.check_call([ - 'git', 'clone', 'https://github.com/pytorch-tpu/llama.git', '--branch', - 'llama2-google-next-inference', llama_dir - ]) - subprocess.check_call( - ['pip', 'install', '-r', - os.path.join(llama_dir, 'requirements.txt')]) - subprocess.check_call(['pip', 'install', '-e', llama_dir]) - - # Create model JSON files - model_configs = { - '7b.json': { - "dim": 4096, - "multiple_of": 256, - "n_heads": 32, - "n_layers": 32, - "norm_eps": 1e-05, - "vocab_size": -1 - }, - '13b.json': { - "dim": 5120, - "multiple_of": 256, - "n_heads": 40, - "n_layers": 40, - "norm_eps": 1e-05, - "vocab_size": -1 - }, - '70b.json': { - "dim": 8192, - "multiple_of": 4096, - "ffn_dim_multiplier": 1.3, - "n_heads": 64, - "n_kv_heads": 8, - "n_layers": 80, - "norm_eps": 1e-05, - "vocab_size": -1 - } - } - for filename, config in model_configs.items(): - filepath = os.path.join(llama_dir, filename) - with open(filepath, 'w') as f: - json.dump(config, f) - f.write("\n") - return llama_dir - - -def parse_log_file(log_file: str): - latencies = [] - with open(log_file, 'r') as f: - for line in f: - if ('Totally decoded ' not in line or 'tokens in' not in line or - ' seconds' not in line): - continue - parts = line.strip().split() - tokens = float(parts[2]) - seconds = float(parts[5]) - latency_per_token = seconds / tokens - latencies.append(latency_per_token) - logger.debug(f'{log_file}: Found latencies={latencies}') - return latencies - - -def benchmark_has_already_run(results_file: str, model_name: str, xla: str, - dynamo: str, batch_size: int): - with open(results_file, 'r') as f: - for line in f: - # Grep for relevant lines to avoid parsing the entire JSONL file. 
- if f'"model_name": "{model_name}"' not in line: - continue - r = json.loads(line.rstrip('\n|\r')) - # yapf: disable - if all( - r.get(k1, {}).get(k2) == v - for (k1, k2, v) in [ - ('experiment', 'accelerator', 'cuda'), - ('experiment', 'batch_size', batch_size), - ('experiment', 'dynamo', dynamo), - ('experiment', 'test', 'eval'), - ('experiment', 'xla', xla), - ('experiment', 'xla_flags', None), - ('model', 'model_name', model_name), - ]): - return True - # yapf: enable - return False - - -def run_benchmarks(args, llama_dir: str, results_dir: str, - accelerator_model: str, timestamp: float): - os.chdir(llama_dir) - for size in ['7b', '13b', '70b']: - params_json = 'params.json' - if os.path.exists(params_json): - os.remove(params_json) - os.symlink(f'{size}.json', params_json) - model_name = f"llama2.{size}" - for dynamo in [None, 'inductor', 'openxla']: - backend = dynamo if dynamo else 'lazytensor' - xla = None if dynamo == 'inductor' else 'PJRT' - summary = f"{model_name} eval {backend} batch {args.batch_size}" - - results_file = os.path.join(results_dir, 'results.jsonl') - if benchmark_has_already_run(results_file, model_name, xla, dynamo, - args.batch_size): - logger.info(f"SKIP already completed benchmark -- {summary}") - continue - - logger.info(f"RUN {summary}") - log_file = os.path.join(results_dir, - f'llama-inference.{backend}.{size}.log') - - cmd = [ - 'python', 'example_text_completion.py', '1', '--ckpt_dir', '.', - '--tokenizer_path', - os.path.join(llama_dir, 't5_tokenizer/spiece.model'), '--max_seq_len', - '2048', '--max_gen_len', '1000', f'--max_batch_size', - f'{args.batch_size}', '--mp', 'True', f'--repeat', f'{args.repeat}', - f'--dynamo', f'"{dynamo}"' if dynamo else "''" - ] - - run_env = os.environ.copy() - if dynamo == 'inductor': - run_env['CUDA_VISIBLE_DEVICES'] = '0' - run_env['USE_CUDA'] = '1' - else: - run_env['PJRT_DEVICE'] = 'CUDA' - run_env['GPU_NUM_DEVICES'] = '1' - - run_ok = True - with open(log_file, 'w') as f: - try: - subprocess.check_call(cmd, stdout=f, stderr=f, env=run_env) - except subprocess.CalledProcessError: - logger.warning(f"Run failed -- see {log_file}.") - run_ok = False - - result = { - 'model': { - 'suite_name': 'llama2', - 'model_name': model_name, - }, - 'experiment': { - 'accelerator': 'cuda', - 'accelerator_model': accelerator_model, - 'xla': xla, - 'xla_flags': None, - 'dynamo': dynamo, - 'test': 'eval', - 'batch_size': args.batch_size, - }, - 'repeat': args.repeat, - 'iterations_per_run': 1, - 'metrics': { - # Filled in below. - }, - 'timestamp': timestamp, - } - if run_ok: - latencies = parse_log_file(log_file) - result['metrics']['total_time'] = latencies - else: - result['metrics']['error'] = f"Run failed -- see {log_file}." - - with open(results_file, mode="a", encoding="utf-8") as f: - json.dump(result, f, ensure_ascii=False) - f.write("\n") - - -def parse_args(): - # Helper class for --log-level flag. 
- class LogLevel(Enum): - critical = logging.CRITICAL - error = logging.ERROR - warning = logging.WARNING - info = logging.INFO - debug = logging.DEBUG - - @staticmethod - def parse(s: str): - try: - return LogLevel[s] - except KeyError: - raise ValueError() - - def __str__(self): - return self.name - - parser = argparse.ArgumentParser(description='Run Llama inference benchmarks') - parser.add_argument('--batch_size', type=int, default=1, help='Batch size.') - parser.add_argument( - '--log-level', - default=LogLevel.info, - choices=list(LogLevel), - type=LogLevel.parse, - help='Log level') - parser.add_argument( - '--repeat', type=int, default=8, help='Number of repetitions') - parser.add_argument( - '--workspace_dir', type=str, required=True, help='Workspace directory.') - args = parser.parse_args() - - return args - - -def main(): - args = parse_args() - logging.basicConfig(level=args.log_level.value, force=True) - args.workspace_dir = os.path.expanduser(args.workspace_dir) - if not os.path.exists(args.workspace_dir): - sys.exit(f"Workspace directory {args.workspace_dir} not found.") - - # Sanity check: we should already be inside the appropriate venv. - workspace_dir = os.path.realpath(args.workspace_dir) - logger.debug(f'workspace_dir realpath: {workspace_dir}') - if sys.prefix != os.path.join(workspace_dir, 'env'): - sys.exit( - "Error: must run under the Python venv from the given --workspace_dir.") - - results_dir = os.path.join(workspace_dir, 'experiment_results') - accelerator_model, timestamp = get_info_from_result_file(results_dir) - llama_dir = set_up_llama_repo(workspace_dir) - - run_benchmarks(args, llama_dir, results_dir, accelerator_model, timestamp) - - -if __name__ == "__main__": - main() diff --git a/benchmarks/nightly.sh b/benchmarks/nightly.sh deleted file mode 100755 index 64b34055cbf..00000000000 --- a/benchmarks/nightly.sh +++ /dev/null @@ -1,258 +0,0 @@ -#!/bin/bash -# Pytorch/XLA Nightly Benchmark Runner. - -set -ex - -ACCELERATOR=a100 -OUTPUT_DIR=${HOME:?} -WORKSPACE=$(date --utc +%Y-%m-%d) -REPEAT=8 -ENABLE_PROFILING= - -while getopts 'A:O:PR:T:W:' OPTION -do - case ${OPTION?} in - A) - ACCELERATOR=${OPTARG:?} - ;; - O) - OUTPUT_DIR=${OPTARG:?} - ;; - P) - ENABLE_PROFILING=1 - ;; - R) - REPEAT=${OPTARG:?} - ;; - T) - # Avoid printing the token; re-enable printing later. - { set +x; } 2>/dev/null - export HUGGING_FACE_HUB_TOKEN=${OPTARG:?} - set -x - ;; - W) - WORKSPACE=${OPTARG:?} - ;; - esac -done - -NIGHTLY_RUNS=nightly_runs -NIGHTLY_RESULTS=nightly_results -if [[ ${ENABLE_PROFILING?} ]]; then - NIGHTLY_RUNS=nightly_profiling_runs - NIGHTLY_RESULTS=nightly_profiling_results -fi -WORKSPACE_DIR=${OUTPUT_DIR:?}/${NIGHTLY_RUNS:?}/${WORKSPACE:?} -BM_DIR=${WORKSPACE_DIR:?}/pytorch/xla/benchmarks - -# Intermediate results, which are processed to generate reports. -WORKSPACE_RESULTS_DIR=${WORKSPACE_DIR:?}/experiment_results - -# Final data files and reports go here. -NIGHTLY_RESULTS_DIR=${OUTPUT_DIR:?}/${NIGHTLY_RESULTS:?} - -# Init workspace -# -# Sometimes a run fails halfway. Typically this is because -# experiment_runner crashes. We then fix the problem and -# run the script again, which skips the build phase. -IS_FRESH_RUN=1 # Set to null below; read with ${IS_FRESH_RUN?}. 
-if [ -d ${WORKSPACE_DIR:?} ]; then - IS_FRESH_RUN= -fi - -if [[ ${IS_FRESH_RUN?} ]]; then - rm -rf ${HOME:?}/.cache/bazel -fi - -mkdir -p ${WORKSPACE_DIR:?} -cd ${WORKSPACE_DIR:?} - -ENV_DIR=env -if [[ ${IS_FRESH_RUN?} ]]; then - python3 -m venv ${ENV_DIR:?} -fi -source ${ENV_DIR:?}/bin/activate - -# Download and build everything -if [[ ${IS_FRESH_RUN?} ]]; then - # Install deps - pip install --upgrade pip - - TIMESTAMP=$(date +%s) - # Clone repos first so that their HEAD is as close as possible to $TIMESTAMP. - git clone https://github.com/pytorch/pytorch.git - git clone https://github.com/pytorch/xla.git pytorch/xla - git clone https://github.com/pytorch/vision.git - git clone https://github.com/pytorch/audio.git - git clone https://github.com/pytorch/benchmark.git - - # Set up pytorch - cd pytorch - pip install -r requirements.txt - make triton - USE_CUDA=1 python setup.py develop - cd .. - - # Set up pytorch/xla - cd pytorch/xla - # Query local compute capability. If that fails, assign a sane default. - LOCAL_CAP=compute_$(nvidia-smi --query-gpu=compute_cap --format=csv | \ - tail -1 | sed 's/\.//g' | grep -E '^[0-9]{2}$' || echo '80') - python setup.py develop - cd ../.. - - # Set up torchbench deps. - cd vision - python setup.py develop - cd .. - cd audio - python setup.py develop - cd .. - - # Set up torchbench - cd benchmark - USE_CUDA=1 python install.py - cd .. - - # Apply local patches - cd benchmark - git apply ../pytorch/xla/benchmarks/patches/mismatched_batch_size.patch - cd .. -else - # Grab the timestamp from the first result, if it exists. - # Otherwise take the current timestamp. - TIMESTAMP=$(head -1 ${WORKSPACE_RESULTS_DIR:?}/results.jsonl | \ - sed -E 's|.*\"timestamp\": ([0-9.]+).*|\1|' | \ - grep -E '^[0-9.]+$' || date +%s) -fi - -# Stabilize clock freqs -sudo nvidia-smi --lock-gpu-clocks=1200,1200 - -# Note: this doesn't work on GCP because it's a VM. -# Moreover, we should look into disabling turbo boost if possible. -# sudo cpupower frequency-set --governor performance - -PROFILING_FLAGS= -if [[ ${ENABLE_PROFILING?} ]]; then - PROFILING_FLAGS="--dump-dynamo-counters \ - --collect-dynamo-counters \ - --dump-pytorch-profiles \ - --dump-pytorch-xla-metrics \ - --profile-cuda-cpu \ - --profile-cuda-cpu-individual-ops" -fi - -# Run the experiments -cd pytorch -# Note: to avoid running in Eager mode (i.e. --xla=None --dynamo=None), -# we split experiment_runner.py's invocation in two. -# -# Inference + Training: XLA Lazy tensors, XLA+XLA_Eval Dynamo. -python xla/benchmarks/experiment_runner.py \ - --test=eval --test=train \ - --xla=PJRT \ - --dynamo=None --dynamo=openxla \ - --suite-name=torchbench --accelerator=cuda \ - --output-dirname=${WORKSPACE_RESULTS_DIR:?} \ - --repeat=${REPEAT:?} --print-subprocess \ - --timestamp=${TIMESTAMP:?} ${PROFILING_FLAGS?} -# Inference + Training: Inductor Dynamo. -python xla/benchmarks/experiment_runner.py \ - --test=eval --test=train \ - --xla=None \ - --dynamo=inductor \ - --suite-name=torchbench --accelerator=cuda \ - --output-dirname=${WORKSPACE_RESULTS_DIR:?} \ - --repeat=${REPEAT:?} --print-subprocess \ - --timestamp=${TIMESTAMP:?} ${PROFILING_FLAGS?} -cd .. - -# Run Llama2 benchmarks. 
-python ${BM_DIR:?}/llama.py --workspace_dir=${WORKSPACE_DIR:?} - -# Gather results and generate reports -REPORTS_DIR=${NIGHTLY_RESULTS_DIR:?}/reports/${WORKSPACE:?} -mkdir -p ${REPORTS_DIR:?} -cp ${WORKSPACE_RESULTS_DIR:?}/results.jsonl \ - ${NIGHTLY_RESULTS_DIR:?}/${WORKSPACE:?}.jsonl - -PYTORCH_GIT_REV=$(git -C pytorch rev-parse --short HEAD) -XLA_GIT_TAG=$(git -C pytorch/xla describe --tags --always) -GIT_TAGS="PT: ${PYTORCH_GIT_REV:?} XLA: ${XLA_GIT_TAG:?}" - -COMMON_TITLE_PREFIX= -if [[ ${ENABLE_PROFILING?} ]]; then - COMMON_TITLE_PREFIX="[Profiling ON] " -fi - -INFERENCE_BACKENDS_CMD='--backends inductor openxla+dynamo openxla+lazytensor' -TRAINING_BACKENDS_CMD='--backends inductor openxla+dynamo openxla+lazytensor' - -# Skip result files coming from one-off runs. -INPUT_JSONL_FILES=$(ls ${NIGHTLY_RESULTS_DIR:?}/*.jsonl | \ - grep '[0-9]\+-[0-9]\+-[0-9]\+\.jsonl') - -for testname in inference training; do - for report in latest histogram speedup; do - for format in csv svg; do - for tier in '' 1; do - TITLE_PREFIX= - TIER_CMD= - TIER_FILE_SUFFIX= - if [[ ${tier?} ]]; then - TITLE_PREFIX="${COMMON_TITLE_PREFIX?}Tier${tier?} " - TIER_CMD=--filter-by-tier=${tier:?} - TIER_FILE_SUFFIX=-tier${tier:?} - fi - - TITLE="(${testname:?})" - WIDTH=9 - HEIGHT=7 - if [ "${report:?}" == "latest" ]; then - TITLE="${WORKSPACE:?} (${testname:?}) ${GIT_TAGS:?}" - if [[ -z ${tier?} ]]; then - WIDTH=15 - HEIGHT=8 - fi - fi - BACKENDS_CMD= - if [ "${testname:?}" = 'inference' ]; then - BACKENDS_CMD="${INFERENCE_BACKENDS_CMD:?}" - else - BACKENDS_CMD="${TRAINING_BACKENDS_CMD:?}" - fi - python ${BM_DIR:?}/aggregate.py --accelerator=${ACCELERATOR:?} \ - --report=${report:?} --test=${testname:?} --format=${format:?} \ - --title="${TITLE_PREFIX?}${TITLE:?}" \ - --fig-height=${HEIGHT:?} --fig-width=${WIDTH:?} \ - ${TIER_CMD?} \ - ${BACKENDS_CMD:?} -- \ - ${INPUT_JSONL_FILES:?} \ - > ${REPORTS_DIR:?}/${ACCELERATOR:?}-${testname:?}-${report:?}${TIER_FILE_SUFFIX?}.${format:?} - done - done - done -done - -# Generate Llama2 output. -for testname in inference; do - for report in latest_grouped; do - for format in csv svg tab; do - BACKENDS_CMD= - if [ "${testname:?}" = 'inference' ]; then - BACKENDS_CMD="${INFERENCE_BACKENDS_CMD:?}" - else - BACKENDS_CMD="${TRAINING_BACKENDS_CMD:?}" - fi - python ${BM_DIR:?}/aggregate.py --accelerator=${ACCELERATOR:?} \ - --report=${report:?} --test=${testname:?} --format=${format:?} \ - --title="${COMMON_TITLE_PREFIX?}Llama2 (${testname:?})" \ - --filter='^llama2\.' 
\ - ${BACKENDS_CMD:?} -- \ - ${INPUT_JSONL_FILES:?} \ - > ${REPORTS_DIR:?}/${ACCELERATOR:?}-${testname:?}-${report:?}-llama2.${format:?} - done - done -done diff --git a/benchmarks/result_analyzer.py b/benchmarks/result_analyzer.py index 69f6b323206..3da67fb7067 100644 --- a/benchmarks/result_analyzer.py +++ b/benchmarks/result_analyzer.py @@ -57,7 +57,6 @@ def run_csv(self): "xla_flags": pd.Series(dtype="str"), "dynamo": pd.Series(dtype="str"), "torch_xla2": pd.Series(dtype="str"), - "keep_model_data_on_cuda": pd.Series(dtype="bool"), "test": pd.Series(dtype="str"), "batch_size": pd.Series(dtype="int"), "repeat": pd.Series(dtype="int"), @@ -122,10 +121,6 @@ def extract_metrics_jsonl(self, file: str): dynamo_value = "None" if dynamo is None else dynamo torch_xla2 = dataline["experiment"]["torch_xla2"] torch_xla2_value = "None" if torch_xla2 is None else torch_xla2 - keep_model_data_on_cuda = dataline["experiment"][ - "keep_model_data_on_cuda"] - keep_model_data_on_cuda_value = "None" if keep_model_data_on_cuda is None else str( - keep_model_data_on_cuda) test = dataline["experiment"]["test"] test_value = "None" if test is None else test outputs_file = dataline["experiment"].get("outputs_file", None) @@ -146,7 +141,6 @@ def extract_metrics_jsonl(self, file: str): "xla": xla_value, "dynamo": dynamo_value, "torch_xla2": torch_xla2_value, - "keep_model_data_on_cuda": keep_model_data_on_cuda_value, "test": test_value, "outputs_file": outputs_file_value } @@ -180,38 +174,21 @@ def extract_metrics_csv(self, file: str, metric_df: Optional[pd.DataFrame]): timestamp = dataline[ "timestamp"] if "timestamp" in dataline else self.timestamp d = { - "timestamp": - timestamp, - "suite_name": - dataline["model"]["suite_name"], - "model_name": - dataline["model"]["model_name"], - "accelerator": - dataline["experiment"]["accelerator"], - "accelerator_model": - dataline["experiment"]["accelerator_model"], - "xla": - dataline["experiment"]["xla"], - "xla_flags": - dataline["experiment"]["xla_flags"], - "dynamo": - dataline["experiment"]["dynamo"], - "torch_xla2": - dataline["experiment"]["torch_xla2"], - "keep_model_data_on_cuda": - dataline["experiment"]["keep_model_data_on_cuda"], - "test": - dataline["experiment"]["test"], - "batch_size": - dataline["experiment"]["batch_size"], - "repeat": - dataline["repeat"], - "iterations_per_run": - dataline["iterations_per_run"], - "error_message": - None, - "outputs_file": - dataline["experiment"].get("outputs_file", ""), + "timestamp": timestamp, + "suite_name": dataline["model"]["suite_name"], + "model_name": dataline["model"]["model_name"], + "accelerator": dataline["experiment"]["accelerator"], + "accelerator_model": dataline["experiment"]["accelerator_model"], + "xla": dataline["experiment"]["xla"], + "xla_flags": dataline["experiment"]["xla_flags"], + "dynamo": dataline["experiment"]["dynamo"], + "torch_xla2": dataline["experiment"]["torch_xla2"], + "test": dataline["experiment"]["test"], + "batch_size": dataline["experiment"]["batch_size"], + "repeat": dataline["repeat"], + "iterations_per_run": dataline["iterations_per_run"], + "error_message": None, + "outputs_file": dataline["experiment"].get("outputs_file", ""), } if "error" in dataline["metrics"] and not self._args.hide_errors: diff --git a/benchmarks/run_benchmark.sh b/benchmarks/run_benchmark.sh deleted file mode 100644 index 79b746c10ad..00000000000 --- a/benchmarks/run_benchmark.sh +++ /dev/null @@ -1,79 +0,0 @@ -#!/bin/bash -set -exo pipefail -CDIR="$(cd "$(dirname "$0")" ; pwd -P)" 
-LOGFILE=/tmp/benchmark_test.log - -# Note [Keep Going] -# -# Set the `CONTINUE_ON_ERROR` flag to `1` to make the CI tests continue on error. -# This will allow you to see all the failures on your PR, not stopping with the first -# test failure like the default behavior. -CONTINUE_ON_ERROR="${CONTINUE_ON_ERROR:-0}" -if [[ "$CONTINUE_ON_ERROR" == "1" ]]; then - set +e -fi - -TESTGPUVM=None -TESTTPUVM=None -# NUMBER=0 - -while getopts 'G:T:' OPTION # N: -do - case $OPTION in - G) - TESTGPUVM=$OPTARG - ;; - T) - TESTTPUVM=$OPTARG - ;; - # N) - # NUMBER=$OPTARG - # ;; - esac -done -shift $(($OPTIND - 1)) - -# func for test after ssh to VM, create container and execute in container -function benchmarking_in_container { - sudo docker pull gcr.io/tpu-pytorch/xla:nightly_3.8_cuda_11.8 - sudo apt-get install -y apt-transport-https ca-certificates curl gnupg-agent software-properties-common - nvidia-smi - distribution=$(. /etc/os-release;echo $ID$VERSION_ID) - curl -s -L https://nvidia.github.io/nvidia-docker/gpgkey | sudo apt-key add - - curl -s -L https://nvidia.github.io/nvidia-docker/$distribution/nvidia-docker.list | sudo tee /etc/apt/sources.list.d/nvidia-docker.list - sudo apt-get update && sudo apt-get install -y nvidia-container-toolkit - sudo systemctl restart docker - sudo docker run --gpus all -it -d gcr.io/tpu-pytorch/xla:nightly_3.8_cuda_11.8 bin/bash - sudo docker exec -it $(sudo docker ps | awk 'NR==2 { print $1 }') /bin/bash - # install torchbench - cd ~ - git clone -b xla_benchmark https://github.com/pytorch/benchmark.git - cd benchmark - # install deps - pip install --pre torchvision torchaudio -i https://download.pytorch.org/whl/nightly/cu118 - # git clone xla - cd ~ - git clone -b benchmark https://github.com/pytorch/xla.git xla - cd ~/xla/benchmarks - # dry run - python3 experiment_runner.py --suite-name=torchbench --accelerator=gpu --progress-bar --dry-run - # run bechmark - python3 experiment_runner.py --suite-name=torchbench --accelerator=gpu --progress-bar - # analyze result to csv - python3 result_analyzer.py -} - - - -if TESTGPUVM='1A100': - # ssh to 1-A100 GPUVM and test in container - gcloud compute ssh a100-manfei-1 --zone us-central1-c --project tpu-prod-env-one-vm -- -o ProxyCommand='corp-ssh-helper %h %p' --command=benchmarking_in_container -elif TESTGPUVM='8A100': - # SSH TO 8-A100 GPUVM and test in container - gcloud compute ssh manfei-a100-8-new --zone us-central1-c --project tpu-prod-env-one-vm -- -o ProxyCommand='corp-ssh-helper %h %p' --command=benchmarking_in_container -elif TESTGPUVM='4H100': - # ssh to 4-H100 GPUVM and test in container -elif TESTTPUVM='v5e8': - # ssh to v5e-8 TPUVM and test in container -elif TESTTPUVM='v5p8': - # ssh to v5p-8 TPUVM and test in container diff --git a/benchmarks/run_single_graph_bm.sh b/benchmarks/run_single_graph_bm.sh deleted file mode 100755 index 98e10a06d05..00000000000 --- a/benchmarks/run_single_graph_bm.sh +++ /dev/null @@ -1,26 +0,0 @@ -#!/usr/bin/env bash - -set -ex - -DATE=$(date +"%Y_%m_%d_%H_%M") - -OUT_PATH=xla/benchmarks/bm_results/single_graph/$DATE -mkdir -p $OUT_PATH - -python new_xla/benchmarks/experiment_runner.py \ - --dynamo=inductor --dynamo=openxla \ - --xla=None --xla=PJRT \ - --test=eval \ - --filter-by-single-graph \ - --pure-wall-time \ - --suite-name=torchbench \ - --accelerator=cuda \ - --output-dirname=$OUT_PATH \ - --repeat=5 \ - --print-subprocess \ - --no-resume \ - > $OUT_PATH/stdout.txt 2> $OUT_PATH/stderr.txt - -python3 xla/benchmarks/result_analyzer.py \ - --output-dirname=$OUT_PATH \ - 
--database=$OUT_PATH/$DATE.csv diff --git a/benchmarks/run_top_tier_bm.sh b/benchmarks/run_top_tier_bm.sh deleted file mode 100755 index 9b8e8eb8eb6..00000000000 --- a/benchmarks/run_top_tier_bm.sh +++ /dev/null @@ -1,25 +0,0 @@ -#!/usr/bin/env bash - -set -ex - -DATE=$(date +"%Y_%m_%d_%H_%M") - -OUT_PATH=xla/benchmarks/bm_results/$DATE -mkdir -p $OUT_PATH - -python xla/benchmarks/experiment_runner.py \ - --dynamo=inductor --dynamo=openxla \ - --xla=None --xla=PJRT \ - --test=eval --test=train \ - --filter-by-tier=1 --filter-by-tier=2 --filter-by-tier=3 \ - --suite-name=torchbench \ - --accelerator=cuda \ - --output-dirname=$OUT_PATH \ - --repeat=5 \ - --print-subprocess \ - --no-resume \ - > $OUT_PATH/stdout.txt 2> $OUT_PATH/stderr.txt - -python3 xla/benchmarks/result_analyzer.py \ - --output-dirname=$OUT_PATH \ - --database=$OUT_PATH/$DATE.csv diff --git a/benchmarks/torchbench_model.py b/benchmarks/torchbench_model.py index 55b7f555276..75a64fa86fd 100644 --- a/benchmarks/torchbench_model.py +++ b/benchmarks/torchbench_model.py @@ -273,13 +273,10 @@ def set_up(self): # Move the initialized model to XLA device if it's not there already. if self.benchmark_experiment.xla and not self.should_initialize_on_xla(): - # First, move the model and the inputs to CPU. - # This avoids having dupplicated data on CUDA. - keep_model_data_on_cuda = self.benchmark_experiment.keep_model_data_on_cuda - if self.is_accelerator_cuda() and not keep_model_data_on_cuda: - self.module = self.module.to("cpu") - self.example_inputs = move_to_device(self.example_inputs, "cpu") - cleanup(self.is_accelerator_cuda()) + assert not self.is_accelerator_cuda() + self.module = self.module.to("cpu") + self.example_inputs = move_to_device(self.example_inputs, "cpu") + cleanup() # Torchbench has quite different setup for yolov3, so directly passing # the right example_inputs diff --git a/benchmarks/util.py b/benchmarks/util.py index bdd965a46a9..3c13232af2f 100644 --- a/benchmarks/util.py +++ b/benchmarks/util.py @@ -51,18 +51,9 @@ def deterministic_torch_manual_seed(*args, **kwargs): @functools.lru_cache(maxsize=3) def is_xla_device_available(devkind, use_xla2: bool = False): - if devkind not in ["CPU", "CUDA", "TPU"]: + if devkind not in ["CPU", "TPU"]: raise ValueError(devkind) # Checking the availability of a given device kind. - # - # We intentionally use subprocess instead of multiprocessing library. The - # reason being that we might initialize CUDA in the parent process and use - # CUDA in the child process. This is a known limitation of using CUDA and - # forking the process. - # - # In this case, subprocess works because it replaces the forked memory with - # the execution of the new program (fresh memory), avoiding the error. - # # For more information: https://github.com/pytorch/xla/pull/5960 CHECK_XLA_DEVICE_PY = "check_xla_device.py" python_file = os.path.join(os.path.dirname(__file__), CHECK_XLA_DEVICE_PY) @@ -80,7 +71,7 @@ def move_to_device(item, device, torch_xla2: bool = False): def move_to_device_func(tensor: torch.Tensor) -> torch.Tensor: # If `tensor` is an XLA tensor, first move it to CPU. We need to do - # that if we want to move the tensor to, say, CUDA. + # that if we want to move the tensor to TPU. 
if tensor.device.type == "xla": return tensor.cpu().to(device) return tensor.to(device) diff --git a/benchmarks/verifier.py b/benchmarks/verifier.py index d2e940711dd..4fefc509cc5 100644 --- a/benchmarks/verifier.py +++ b/benchmarks/verifier.py @@ -152,7 +152,7 @@ def maybe_synchronize(): # Delete the model for saving up memory. del model # Clean-up CUDA as well. - cleanup(cuda=True) + cleanup(cuda=experiment_config["accelerator"] == "cuda") def _apply_eager_config(experiment): diff --git a/test/benchmarks/test_benchmark_experiment.py b/test/benchmarks/test_benchmark_experiment.py index 2c5efcd0583..841beb519e0 100644 --- a/test/benchmarks/test_benchmark_experiment.py +++ b/test/benchmarks/test_benchmark_experiment.py @@ -7,16 +7,15 @@ class BenchmarkExperimentTest(unittest.TestCase): def test_to_dict(self): be = BenchmarkExperiment("cpu", "PJRT", "some xla_flags", "openxla", None, - False, "train", "123", False) + "train", "123", False) actual = be.to_dict() - self.assertEqual(10, len(actual)) + self.assertEqual(9, len(actual)) self.assertEqual("cpu", actual["accelerator"]) self.assertTrue("accelerator_model" in actual) self.assertEqual("PJRT", actual["xla"]) self.assertEqual("some xla_flags", actual["xla_flags"]) self.assertEqual("openxla", actual["dynamo"]) self.assertEqual(None, actual["torch_xla2"]) - self.assertEqual(False, actual["keep_model_data_on_cuda"]) self.assertEqual("train", actual["test"]) self.assertEqual("123", actual["batch_size"]) self.assertEqual(False, actual["enable_functionalization"]) diff --git a/test/benchmarks/test_experiment_runner.py b/test/benchmarks/test_experiment_runner.py index 4ce4167d0e4..e1c572e402f 100644 --- a/test/benchmarks/test_experiment_runner.py +++ b/test/benchmarks/test_experiment_runner.py @@ -29,44 +29,15 @@ def test_dummy_dry_run(self): expected_in_stderr = [ "Number of selected experiment configs: 4", "Number of selected model configs: 1", - "--model-config={\"model_name\": \"dummy\"} --experiment-config={\"accelerator\": \"cpu\", \"xla\": \"PJRT\", \"xla_flags\": null, \"dynamo\": \"openxla\", \"torch_xla2\": null, \"test\": \"eval\", \"keep_model_data_on_cuda\": false, \"enable_functionalization\": false}", - "--model-config={\"model_name\": \"dummy\"} --experiment-config={\"accelerator\": \"cpu\", \"xla\": \"PJRT\", \"xla_flags\": null, \"dynamo\": \"openxla\", \"torch_xla2\": null, \"test\": \"train\", \"keep_model_data_on_cuda\": false, \"enable_functionalization\": false}", - "--model-config={\"model_name\": \"dummy\"} --experiment-config={\"accelerator\": \"cpu\", \"xla\": null, \"xla_flags\": null, \"dynamo\": \"inductor\", \"torch_xla2\": null, \"test\": \"eval\", \"keep_model_data_on_cuda\": false, \"enable_functionalization\": false}", - "--model-config={\"model_name\": \"dummy\"} --experiment-config={\"accelerator\": \"cpu\", \"xla\": null, \"xla_flags\": null, \"dynamo\": \"inductor\", \"torch_xla2\": null, \"test\": \"train\", \"keep_model_data_on_cuda\": false, \"enable_functionalization\": false}", + "--model-config={\"model_name\": \"dummy\"} --experiment-config={\"accelerator\": \"cpu\", \"xla\": \"PJRT\", \"xla_flags\": null, \"dynamo\": \"openxla\", \"torch_xla2\": null, \"test\": \"eval\", \"enable_functionalization\": false}", + "--model-config={\"model_name\": \"dummy\"} --experiment-config={\"accelerator\": \"cpu\", \"xla\": \"PJRT\", \"xla_flags\": null, \"dynamo\": \"openxla\", \"torch_xla2\": null, \"test\": \"train\", \"enable_functionalization\": false}", + "--model-config={\"model_name\": \"dummy\"} 
--experiment-config={\"accelerator\": \"cpu\", \"xla\": null, \"xla_flags\": null, \"dynamo\": \"inductor\", \"torch_xla2\": null, \"test\": \"eval\", \"enable_functionalization\": false}", + "--model-config={\"model_name\": \"dummy\"} --experiment-config={\"accelerator\": \"cpu\", \"xla\": null, \"xla_flags\": null, \"dynamo\": \"inductor\", \"torch_xla2\": null, \"test\": \"train\", \"enable_functionalization\": false}", ] for expected in expected_in_stderr: self.assertIn(expected, child.stderr) - @absltest.skipUnless(xr.device_type() in {'CUDA'}, 'Needs CUDA accelerator') - def test_dummy_dry_run_cuda(self): - child = subprocess.run([ - "python", - EXPERIMENT_RUNNER_PY, - "--dynamo=openxla", - "--dynamo=inductor", - "--xla=PJRT", - "--xla=None", - "--test=eval", - "--test=train", - "--suite-name=dummy", - "--accelerator=cuda", - "--dry-run", - ], - capture_output=True, - text=True) - expected_in_stderr = [ - "Number of selected experiment configs: 4", - "Number of selected model configs: 1", - "--model-config={\"model_name\": \"dummy\"} --experiment-config={\"accelerator\": \"cuda\", \"xla\": \"PJRT\", \"xla_flags\": null, \"dynamo\": \"openxla\", \"torch_xla2\": null, \"test\": \"eval\", \"keep_model_data_on_cuda\": false, \"enable_functionalization\": false}", - "--model-config={\"model_name\": \"dummy\"} --experiment-config={\"accelerator\": \"cuda\", \"xla\": \"PJRT\", \"xla_flags\": null, \"dynamo\": \"openxla\", \"torch_xla2\": null, \"test\": \"train\", \"keep_model_data_on_cuda\": false, \"enable_functionalization\": false}", - "--model-config={\"model_name\": \"dummy\"} --experiment-config={\"accelerator\": \"cuda\", \"xla\": null, \"xla_flags\": null, \"dynamo\": \"inductor\", \"torch_xla2\": null, \"test\": \"eval\", \"keep_model_data_on_cuda\": false, \"enable_functionalization\": false}", - "--model-config={\"model_name\": \"dummy\"} --experiment-config={\"accelerator\": \"cuda\", \"xla\": null, \"xla_flags\": null, \"dynamo\": \"inductor\", \"torch_xla2\": null, \"test\": \"train\", \"keep_model_data_on_cuda\": false, \"enable_functionalization\": false}", - ] - for expected in expected_in_stderr: - self.assertIn(expected, child.stderr) - - @absltest.skipUnless(xr.device_type() in {'CUDA'}, 'Needs CUDA accelerator') - def test_dummy_dry_run_inductor_cuda(self): + def test_dummy_dry_run_inductor_cpu(self): child = subprocess.run([ "python", EXPERIMENT_RUNNER_PY, @@ -85,14 +56,13 @@ def test_dummy_dry_run_inductor_cuda(self): expected_in_stderr = [ "Number of selected experiment configs: 2", "Number of selected model configs: 1", - "--model-config={\"model_name\": \"dummy\"} --experiment-config={\"accelerator\": \"cuda\", \"xla\": null, \"xla_flags\": null, \"dynamo\": \"inductor\", \"torch_xla2\": null, \"test\": \"eval\", \"keep_model_data_on_cuda\": false, \"enable_functionalization\": false}", - "--model-config={\"model_name\": \"dummy\"} --experiment-config={\"accelerator\": \"cuda\", \"xla\": null, \"xla_flags\": null, \"dynamo\": \"inductor\", \"torch_xla2\": null, \"test\": \"train\", \"keep_model_data_on_cuda\": false, \"enable_functionalization\": false}", + "--model-config={\"model_name\": \"dummy\"} --experiment-config={\"accelerator\": \"cuda\", \"xla\": null, \"xla_flags\": null, \"dynamo\": \"inductor\", \"torch_xla2\": null, \"test\": \"eval\", \"enable_functionalization\": false}", + "--model-config={\"model_name\": \"dummy\"} --experiment-config={\"accelerator\": \"cuda\", \"xla\": null, \"xla_flags\": null, \"dynamo\": \"inductor\", \"torch_xla2\": null, 
\"test\": \"train\", \"enable_functionalization\": false}", ] for expected in expected_in_stderr: self.assertIn(expected, child.stderr) - @absltest.skipUnless(xr.device_type() in {'CUDA'}, 'Needs CUDA accelerator') - def test_dummy_openxla_train_cuda(self): + def test_dummy_openxla_train_cpu(self): child = subprocess.run([ "python", EXPERIMENT_RUNNER_PY, @@ -103,7 +73,7 @@ def test_dummy_openxla_train_cuda(self): "--test=eval", "--test=train", "--suite-name=dummy", - "--accelerator=cuda", + "--accelerator=cpu", "--filter=^dummy$", "--dry-run", ], @@ -112,21 +82,20 @@ def test_dummy_openxla_train_cuda(self): expected_in_stderr = [ "Number of selected experiment configs: 4", "Number of selected model configs: 1", - "--model-config={\"model_name\": \"dummy\"} --experiment-config={\"accelerator\": \"cuda\", \"xla\": \"PJRT\", \"xla_flags\": null, \"dynamo\": \"openxla\", \"torch_xla2\": null, \"test\": \"train\", \"keep_model_data_on_cuda\": false, \"enable_functionalization\": false}", - "--model-config={\"model_name\": \"dummy\"} --experiment-config={\"accelerator\": \"cuda\", \"xla\": \"PJRT\", \"xla_flags\": null, \"dynamo\": \"openxla\", \"torch_xla2\": null, \"test\": \"eval\", \"keep_model_data_on_cuda\": false, \"enable_functionalization\": false}", - "--model-config={\"model_name\": \"dummy\"} --experiment-config={\"accelerator\": \"cuda\", \"xla\": null, \"xla_flags\": null, \"dynamo\": \"inductor\", \"torch_xla2\": null, \"test\": \"eval\", \"keep_model_data_on_cuda\": false, \"enable_functionalization\": false}", - "--model-config={\"model_name\": \"dummy\"} --experiment-config={\"accelerator\": \"cuda\", \"xla\": null, \"xla_flags\": null, \"dynamo\": \"inductor\", \"torch_xla2\": null, \"test\": \"train\", \"keep_model_data_on_cuda\": false, \"enable_functionalization\": false}", + "--model-config={\"model_name\": \"dummy\"} --experiment-config={\"accelerator\": \"cpu\", \"xla\": \"PJRT\", \"xla_flags\": null, \"dynamo\": \"openxla\", \"torch_xla2\": null, \"test\": \"train\", \"enable_functionalization\": false}", + "--model-config={\"model_name\": \"dummy\"} --experiment-config={\"accelerator\": \"cpu\", \"xla\": \"PJRT\", \"xla_flags\": null, \"dynamo\": \"openxla\", \"torch_xla2\": null, \"test\": \"eval\", \"enable_functionalization\": false}", + "--model-config={\"model_name\": \"dummy\"} --experiment-config={\"accelerator\": \"cpu\", \"xla\": null, \"xla_flags\": null, \"dynamo\": \"inductor\", \"torch_xla2\": null, \"test\": \"eval\", \"enable_functionalization\": false}", + "--model-config={\"model_name\": \"dummy\"} --experiment-config={\"accelerator\": \"cpu\", \"xla\": null, \"xla_flags\": null, \"dynamo\": \"inductor\", \"torch_xla2\": null, \"test\": \"train\", \"enable_functionalization\": false}", ] for expected in expected_in_stderr: self.assertIn(expected, child.stderr) - @absltest.skipUnless(xr.device_type() in {'CUDA'}, 'Needs CUDA accelerator') - def test_dummy_dynamo_none_cuda(self): + def test_dummy_dynamo_none_cpu(self): child = subprocess.run([ "python", EXPERIMENT_RUNNER_PY, "--suite-name=dummy", - "--accelerator=cuda", + "--accelerator=cpu", "--xla=PJRT", "--xla=None", "--filter=^dummy$", @@ -137,39 +106,14 @@ def test_dummy_dynamo_none_cuda(self): expected_in_stderr = [ "Number of selected experiment configs: 8", "Number of selected model configs: 1", - "--model-config={\"model_name\": \"dummy\"} --experiment-config={\"accelerator\": \"cuda\", \"xla\": \"PJRT\", \"xla_flags\": null, \"dynamo\": null, \"torch_xla2\": null, \"test\": \"eval\", 
\"keep_model_data_on_cuda\": false, \"enable_functionalization\": false}", - "--model-config={\"model_name\": \"dummy\"} --experiment-config={\"accelerator\": \"cuda\", \"xla\": \"PJRT\", \"xla_flags\": null, \"dynamo\": null, \"torch_xla2\": null, \"test\": \"train\", \"keep_model_data_on_cuda\": false, \"enable_functionalization\": false}", - "--model-config={\"model_name\": \"dummy\"} --experiment-config={\"accelerator\": \"cuda\", \"xla\": \"PJRT\", \"xla_flags\": null, \"dynamo\": \"openxla\", \"torch_xla2\": null, \"test\": \"eval\", \"keep_model_data_on_cuda\": false, \"enable_functionalization\": false}", - "--model-config={\"model_name\": \"dummy\"} --experiment-config={\"accelerator\": \"cuda\", \"xla\": \"PJRT\", \"xla_flags\": null, \"dynamo\": \"openxla\", \"torch_xla2\": null, \"test\": \"train\", \"keep_model_data_on_cuda\": false, \"enable_functionalization\": false}", - "--model-config={\"model_name\": \"dummy\"} --experiment-config={\"accelerator\": \"cuda\", \"xla\": null, \"xla_flags\": null, \"dynamo\": null, \"torch_xla2\": null, \"test\": \"eval\", \"keep_model_data_on_cuda\": false, \"enable_functionalization\": false}", - "--model-config={\"model_name\": \"dummy\"} --experiment-config={\"accelerator\": \"cuda\", \"xla\": null, \"xla_flags\": null, \"dynamo\": null, \"torch_xla2\": null, \"test\": \"train\", \"keep_model_data_on_cuda\": false, \"enable_functionalization\": false}", - "--model-config={\"model_name\": \"dummy\"} --experiment-config={\"accelerator\": \"cuda\", \"xla\": null, \"xla_flags\": null, \"dynamo\": \"inductor\", \"torch_xla2\": null, \"test\": \"eval\", \"keep_model_data_on_cuda\": false, \"enable_functionalization\": false}", - "--model-config={\"model_name\": \"dummy\"} --experiment-config={\"accelerator\": \"cuda\", \"xla\": null, \"xla_flags\": null, \"dynamo\": \"inductor\", \"torch_xla2\": null, \"test\": \"train\", \"keep_model_data_on_cuda\": false, \"enable_functionalization\": false}", - ] - for expected in expected_in_stderr: - self.assertIn(expected, child.stderr) - - @absltest.skipUnless(xr.device_type() in {'CUDA'}, 'Needs CUDA accelerator') - def test_dummy_dry_run_cuda_with_keep_model_data_on_cuda(self): - child = subprocess.run([ - "python", - EXPERIMENT_RUNNER_PY, - "--dynamo=openxla", - "--xla=PJRT", - "--test=eval", - "--test=train", - "--suite-name=dummy", - "--accelerator=cuda", - "--keep-model-data-on-cuda", - "--dry-run", - ], - capture_output=True, - text=True) - expected_in_stderr = [ - "Number of selected experiment configs: 2", - "Number of selected model configs: 1", - "--model-config={\"model_name\": \"dummy\"} --experiment-config={\"accelerator\": \"cuda\", \"xla\": \"PJRT\", \"xla_flags\": null, \"dynamo\": \"openxla\", \"torch_xla2\": null, \"test\": \"eval\", \"keep_model_data_on_cuda\": true, \"enable_functionalization\": false}", - "--model-config={\"model_name\": \"dummy\"} --experiment-config={\"accelerator\": \"cuda\", \"xla\": \"PJRT\", \"xla_flags\": null, \"dynamo\": \"openxla\", \"torch_xla2\": null, \"test\": \"train\", \"keep_model_data_on_cuda\": true, \"enable_functionalization\": false}", + "--model-config={\"model_name\": \"dummy\"} --experiment-config={\"accelerator\": \"cpu\", \"xla\": \"PJRT\", \"xla_flags\": null, \"dynamo\": null, \"torch_xla2\": null, \"test\": \"eval\", \"enable_functionalization\": false}", + "--model-config={\"model_name\": \"dummy\"} --experiment-config={\"accelerator\": \"cpu\", \"xla\": \"PJRT\", \"xla_flags\": null, \"dynamo\": null, \"torch_xla2\": null, \"test\": 
\"train\", \"enable_functionalization\": false}", + "--model-config={\"model_name\": \"dummy\"} --experiment-config={\"accelerator\": \"cpu\", \"xla\": \"PJRT\", \"xla_flags\": null, \"dynamo\": \"openxla\", \"torch_xla2\": null, \"test\": \"eval\", \"enable_functionalization\": false}", + "--model-config={\"model_name\": \"dummy\"} --experiment-config={\"accelerator\": \"cpu\", \"xla\": \"PJRT\", \"xla_flags\": null, \"dynamo\": \"openxla\", \"torch_xla2\": null, \"test\": \"train\", \"enable_functionalization\": false}", + "--model-config={\"model_name\": \"dummy\"} --experiment-config={\"accelerator\": \"cpu\", \"xla\": null, \"xla_flags\": null, \"dynamo\": null, \"torch_xla2\": null, \"test\": \"eval\", \"enable_functionalization\": false}", + "--model-config={\"model_name\": \"dummy\"} --experiment-config={\"accelerator\": \"cpu\", \"xla\": null, \"xla_flags\": null, \"dynamo\": null, \"torch_xla2\": null, \"test\": \"train\", \"enable_functionalization\": false}", + "--model-config={\"model_name\": \"dummy\"} --experiment-config={\"accelerator\": \"cpu\", \"xla\": null, \"xla_flags\": null, \"dynamo\": \"inductor\", \"torch_xla2\": null, \"test\": \"eval\", \"enable_functionalization\": false}", + "--model-config={\"model_name\": \"dummy\"} --experiment-config={\"accelerator\": \"cpu\", \"xla\": null, \"xla_flags\": null, \"dynamo\": \"inductor\", \"torch_xla2\": null, \"test\": \"train\", \"enable_functionalization\": false}", ] for expected in expected_in_stderr: self.assertIn(expected, child.stderr) @@ -192,8 +136,8 @@ def test_dummy_dry_run_with_functionalization(self): expected_in_stderr = [ "Number of selected experiment configs: 2", "Number of selected model configs: 1", - "--model-config={\"model_name\": \"dummy\"} --experiment-config={\"accelerator\": \"cpu\", \"xla\": \"PJRT\", \"xla_flags\": null, \"dynamo\": \"openxla\", \"torch_xla2\": null, \"test\": \"eval\", \"keep_model_data_on_cuda\": false, \"enable_functionalization\": true}", - "--model-config={\"model_name\": \"dummy\"} --experiment-config={\"accelerator\": \"cpu\", \"xla\": \"PJRT\", \"xla_flags\": null, \"dynamo\": \"openxla\", \"torch_xla2\": null, \"test\": \"train\", \"keep_model_data_on_cuda\": false, \"enable_functionalization\": true}", + "--model-config={\"model_name\": \"dummy\"} --experiment-config={\"accelerator\": \"cpu\", \"xla\": \"PJRT\", \"xla_flags\": null, \"dynamo\": \"openxla\", \"torch_xla2\": null, \"test\": \"eval\", \"enable_functionalization\": true}", + "--model-config={\"model_name\": \"dummy\"} --experiment-config={\"accelerator\": \"cpu\", \"xla\": \"PJRT\", \"xla_flags\": null, \"dynamo\": \"openxla\", \"torch_xla2\": null, \"test\": \"train\", \"enable_functionalization\": true}", ] for expected in expected_in_stderr: self.assertIn(expected, child.stderr)