From 795dc5748f750a7d633a9c57712de942be4365f8 Mon Sep 17 00:00:00 2001
From: Emilio Cota
Date: Tue, 19 Dec 2023 23:20:35 -0500
Subject: [PATCH] benchmark/experiment_runner: save accelerator_model on error

This commit can be seen as a partial revert of "63455e0cd Unify the way
in which result files are dumped (#6162)".

In that commit we missed that `experiment_cfg` does not have the
`accelerator_model` record. Thus, when a benchmark fails we do not
include that record in the JSONL file, and therefore resuming a run
doesn't work because the failing entry is not recognized (note that when
checking whether to resume we compare the JSONL entry against
`benchmark_experiment`, which does have `accelerator_model`).

We could fix this two ways: (1) always save `benchmark_experiment`, not
only on success, or (2) add `accelerator_model` to experiment_config.
I've chosen to go with (1) since that's what we were doing before
63455e0cd.
---
 benchmarks/benchmark_experiment.py |  2 +-
 benchmarks/experiment_runner.py    | 17 ++++++++++-------
 2 files changed, 11 insertions(+), 8 deletions(-)

diff --git a/benchmarks/benchmark_experiment.py b/benchmarks/benchmark_experiment.py
index e32fc4d633c4..2b48c7e03de4 100644
--- a/benchmarks/benchmark_experiment.py
+++ b/benchmarks/benchmark_experiment.py
@@ -25,7 +25,7 @@ def list_experiment_configs(self):
         "test": ["eval", "train"],
     }
 
-    # Apply command line chocies.
+    # Apply command line choices.
     if self._args.accelerator:
       config_choices["accelerator"] = list(set(self._args.accelerator))
     if self._args.xla:
diff --git a/benchmarks/experiment_runner.py b/benchmarks/experiment_runner.py
index 2fb618b72d50..54ed8a5bc130 100644
--- a/benchmarks/experiment_runner.py
+++ b/benchmarks/experiment_runner.py
@@ -113,7 +113,8 @@ def generate_and_run_all_configs(self):
         if not self.model_loader.is_compatible(benchmark_model,
                                                benchmark_experiment):
           logger.warning("SKIP incompatible model and experiment configs.")
-          self._save_results(experiment_cfg, model_cfg, {"error": "SKIP"})
+          self._save_results(benchmark_experiment.to_dict(),
+                             benchmark_model.to_dict(), {"error": "SKIP"})
           continue
 
         # Compose child process environment.
@@ -157,17 +158,21 @@ def generate_and_run_all_configs(self):
         except subprocess.TimeoutExpired as e:
           self._fwd_captured_stdout_stderr(e.stdout, e.stderr)
           logger.error("TIMEOUT")
-          self._save_results(experiment_cfg, model_cfg, {"error": str(e)})
+          self._save_results(benchmark_experiment.to_dict(),
+                             benchmark_model.to_dict(), {"error": str(e)})
         except subprocess.CalledProcessError as e:
           self._fwd_captured_stdout_stderr(e.stdout, e.stderr)
           logger.error("ERROR in subprocess")
-          self._save_results(experiment_cfg, model_cfg, {"error": e.stderr})
+          self._save_results(benchmark_experiment.to_dict(),
+                             benchmark_model.to_dict(), {"error": e.stderr})
         except subprocess.SubprocessError as e:
           logger.error("ERROR when launching child process")
-          self._save_results(experiment_cfg, model_cfg, {"error": str(e)})
+          self._save_results(benchmark_experiment.to_dict(),
+                             benchmark_model.to_dict(), {"error": str(e)})
         except ValueError as e:
           logger.error(f"ERROR {e}")
-          self._save_results(experiment_cfg, model_cfg, {"error": str(e)})
+          self._save_results(benchmark_experiment.to_dict(),
+                             benchmark_model.to_dict(), {"error": str(e)})
 
   # TODO: Use `_unique_basename` instead.
   def _get_config_fingerprint(self, experiment_config: OrderedDict,
@@ -210,8 +215,6 @@ def run_single_config(self):
         accumulated_metrics[k] = []
       accumulated_metrics[k].append(v)
 
-    # TODO: Use `experiment_config` and `model_config` when env vars are no
-    # longer included.
     self._save_results(benchmark_experiment.to_dict(),
                        benchmark_model.to_dict(), accumulated_metrics)
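
Note appended for clarity (not part of the patch): the resume failure described in the
commit message reduces to comparing the current experiment dict against a saved JSONL
entry that lacks `accelerator_model`. The sketch below illustrates that mismatch under
assumed names; `entry_matches` and the concrete field values are hypothetical stand-ins
for the runner's resume check, while `benchmark_experiment.to_dict()`, `experiment_cfg`,
and `accelerator_model` are the names used in the patch.

# Illustrative sketch only; entry_matches and the field values below are
# hypothetical, not the runner's actual API.

# What the current run builds in memory: benchmark_experiment.to_dict()
# includes accelerator_model.
current_experiment = {
    "accelerator": "cuda",
    "accelerator_model": "NVIDIA A100-SXM4-40GB",
    "xla": "PJRT",
    "test": "train",
}

# Entry saved from experiment_cfg on error: accelerator_model is missing.
saved_on_error = {
    k: v for k, v in current_experiment.items() if k != "accelerator_model"
}


def entry_matches(saved: dict, current: dict) -> bool:
  # Hypothetical resume check: every field of the current experiment must
  # appear with the same value in the saved JSONL entry.
  return all(saved.get(k) == v for k, v in current.items())


print(entry_matches(dict(current_experiment), current_experiment))  # True
print(entry_matches(saved_on_error, current_experiment))  # False: entry not recognized

With the change above, error results are also saved from `benchmark_experiment.to_dict()`,
so the failed entry carries `accelerator_model` and the resume check can recognize it.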