
Commit 86636ba

benchmark/experiment_runner: save accelerator_model on error (#6218)
This commit can be seen as a partial revert of "63455e0cd Unify the way in which result files are dumped (#6162)". In that commit we missed that `experiment_cfg` does not have the `accelerator_model` record. Thus, when a benchmark fails we do not include that record in the JSONL file, and therefore resuming a run doesn't work because the failing entry is not recognized (note that when checking whether to resume we compare the JSONL entry against `benchmark_experiment`, which does have `accelerator_model`). We could fix this in two ways: (1) always save `benchmark_experiment`, not only on success, or (2) add `accelerator_model` to `experiment_config`. I've chosen (1) since that is what we were doing before 63455e0.
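As a rough illustration (not the actual runner code; every field other than `accelerator_model` is invented for the example), the resume check can be thought of as comparing the saved JSONL entry against the full `benchmark_experiment` dictionary, which is why a record saved from `experiment_cfg` is never recognized:

# Hypothetical illustration of the resume mismatch; field names other than
# accelerator_model are made up for this example.
saved_from_experiment_cfg = {"accelerator": "cuda", "test": "train"}
benchmark_experiment_dict = {
    "accelerator": "cuda",
    "accelerator_model": "A100",
    "test": "train",
}

def already_ran(jsonl_entry, experiment):
  # Simplified stand-in for the resume check: an entry counts as "already run"
  # only if every field of the experiment matches the saved entry.
  return all(jsonl_entry.get(k) == v for k, v in experiment.items())

# The record written on error from experiment_cfg lacks accelerator_model, so
# it never matches and the failed config gets re-run on resume.
assert not already_ran(saved_from_experiment_cfg, benchmark_experiment_dict)

# Saving benchmark_experiment.to_dict() on error keeps the record complete.
assert already_ran(benchmark_experiment_dict, benchmark_experiment_dict)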
1 parent dc8948d commit 86636ba

File tree

2 files changed: +11 −8 lines


benchmarks/benchmark_experiment.py

Lines changed: 1 addition & 1 deletion
@@ -25,7 +25,7 @@ def list_experiment_configs(self):
         "test": ["eval", "train"],
     }
 
-    # Apply command line chocies.
+    # Apply command line choices.
     if self._args.accelerator:
       config_choices["accelerator"] = list(set(self._args.accelerator))
     if self._args.xla:

benchmarks/experiment_runner.py

Lines changed: 10 additions & 7 deletions
@@ -115,7 +115,8 @@ def generate_and_run_all_configs(self):
       if not self.model_loader.is_compatible(benchmark_model,
                                              benchmark_experiment):
         logger.warning("SKIP incompatible model and experiment configs.")
-        self._save_results(experiment_cfg, model_cfg, {"error": "SKIP"})
+        self._save_results(benchmark_experiment.to_dict(),
+                           benchmark_model.to_dict(), {"error": "SKIP"})
         continue
 
       # Compose child process environment.
@@ -159,17 +160,21 @@ def generate_and_run_all_configs(self):
       except subprocess.TimeoutExpired as e:
         self._fwd_captured_stdout_stderr(e.stdout, e.stderr)
         logger.error("TIMEOUT")
-        self._save_results(experiment_cfg, model_cfg, {"error": str(e)})
+        self._save_results(benchmark_experiment.to_dict(),
+                           benchmark_model.to_dict(), {"error": str(e)})
       except subprocess.CalledProcessError as e:
         self._fwd_captured_stdout_stderr(e.stdout, e.stderr)
         logger.error("ERROR in subprocess")
-        self._save_results(experiment_cfg, model_cfg, {"error": e.stderr})
+        self._save_results(benchmark_experiment.to_dict(),
+                           benchmark_model.to_dict(), {"error": e.stderr})
       except subprocess.SubprocessError as e:
         logger.error("ERROR when launching child process")
-        self._save_results(experiment_cfg, model_cfg, {"error": str(e)})
+        self._save_results(benchmark_experiment.to_dict(),
+                           benchmark_model.to_dict(), {"error": str(e)})
       except ValueError as e:
         logger.error(f"ERROR {e}")
-        self._save_results(experiment_cfg, model_cfg, {"error": str(e)})
+        self._save_results(benchmark_experiment.to_dict(),
+                           benchmark_model.to_dict(), {"error": str(e)})
 
   # TODO: Use `_unique_basename` instead.
   def _get_config_fingerprint(self, experiment_config: OrderedDict,
@@ -212,8 +217,6 @@ def run_single_config(self):
         accumulated_metrics[k] = []
       accumulated_metrics[k].append(v)
 
-    # TODO: Use `experiment_config` and `model_config` when env vars are no
-    # longer included.
     self._save_results(benchmark_experiment.to_dict(),
                        benchmark_model.to_dict(), accumulated_metrics)
 
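With this change the error paths write the same kind of record as the success path in `run_single_config`, so a resumed run recognizes failed entries the same way it recognizes completed ones. A purely illustrative sketch of such a record follows; the actual layout produced by `_save_results` may nest fields differently, and all keys and values shown here are hypothetical:

import json

# Hypothetical JSONL record written on an error path after this change;
# the real _save_results output may use a different structure.
record = {
    "experiment": {"accelerator": "cuda", "accelerator_model": "A100"},
    "model": {"model_name": "resnet50"},
    "metrics": {"error": "SKIP"},
}
print(json.dumps(record))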
