From 1fac9155ab5861f2bb5aae79c7f3dd9cc88c1ddf Mon Sep 17 00:00:00 2001 From: Matthew Beech Date: Fri, 21 Nov 2025 14:20:16 -0600 Subject: [PATCH] Removed temp model dir, reworked save-checkpoints --- silnlp/common/environment.py | 17 +++++++---------- silnlp/nmt/clearml_connection.py | 4 ---- silnlp/nmt/config_utils.py | 3 +-- silnlp/nmt/experiment.py | 11 +++-------- silnlp/nmt/hugging_face_config.py | 3 +-- silnlp/nmt/test.py | 4 +--- silnlp/nmt/translate.py | 2 -- 7 files changed, 13 insertions(+), 31 deletions(-) diff --git a/silnlp/common/environment.py b/silnlp/common/environment.py index 455ad64d..3384beb8 100644 --- a/silnlp/common/environment.py +++ b/silnlp/common/environment.py @@ -25,11 +25,11 @@ class SilNlpEnv: def __init__(self): + atexit.register(self.delete_path) atexit.register(check_transfers) self.root_dir = Path.home() / ".silnlp" self.assets_dir = Path(__file__).parent.parent / "assets" - self.temp_model_dir: Optional[Path] = None - + self.path_to_delete: Optional[Path] = None self.set_data_dir() def set_data_dir(self, data_dir: Optional[Path] = None): @@ -109,15 +109,12 @@ def resolve_data_dir(self) -> Path: raise FileExistsError("No valid path exists") - def get_temp_model_dir(self) -> Path: - if not self.temp_model_dir: - self.temp_model_dir = Path(tempfile.mkdtemp(prefix="silnlp_model_")) - atexit.register(self.delete_temp_model_dir) - return self.temp_model_dir + def delete_path_on_exit(self, path: Union[str, Path]) -> None: + self.path_to_delete = pathify(path) - def delete_temp_model_dir(self) -> None: - if self.temp_model_dir and self.temp_model_dir.is_dir(): - shutil.rmtree(self.temp_model_dir) + def delete_path(self) -> None: + if self.path_to_delete and self.path_to_delete.is_dir(): + shutil.rmtree(self.path_to_delete) def check_transfers() -> None: diff --git a/silnlp/nmt/clearml_connection.py b/silnlp/nmt/clearml_connection.py index 97188a0f..5d0e5bff 100644 --- a/silnlp/nmt/clearml_connection.py +++ b/silnlp/nmt/clearml_connection.py @@ -23,7 +23,6 @@ class SILClearML: experiment_suffix: str = "" clearml_project_folder: str = "" commit: Optional[str] = None - use_default_model_dir: bool = True tag: Optional[str] = None skip_config: bool = False @@ -89,7 +88,6 @@ def __post_init__(self) -> None: if self.commit: self.task.set_script(commit=self.commit) if self.queue_name.lower() not in ("local", "locally"): - SIL_NLP_ENV.delete_temp_model_dir() self.task.execute_remotely(queue_name=self.queue_name) except LoginError as e: if self.queue_name is None: @@ -131,7 +129,6 @@ def _load_config(self) -> None: config = yaml.safe_load(file) if config is None or len(config.keys()) == 0: raise RuntimeError("Config file has no contents.") - config["use_default_model_dir"] = self.use_default_model_dir self.config = create_config(exp_dir, config) return # There is a ClearML task - lets' do more complex importing. @@ -158,5 +155,4 @@ def _load_config(self) -> None: exp_dir.mkdir(parents=True, exist_ok=True) with (exp_dir / "config.yml").open("w+", encoding="utf-8") as file: yaml.safe_dump(data=config, stream=file) - config["use_default_model_dir"] = self.use_default_model_dir self.config = create_config(exp_dir, config) diff --git a/silnlp/nmt/config_utils.py b/silnlp/nmt/config_utils.py index f29bb3ea..994373fc 100644 --- a/silnlp/nmt/config_utils.py +++ b/silnlp/nmt/config_utils.py @@ -7,13 +7,12 @@ from .hugging_face_config import HuggingFaceConfig -def load_config(exp_name: str, use_default_model_dir: bool = True) -> Config: +def load_config(exp_name: str) -> Config: exp_dir = get_mt_exp_dir(exp_name) config_path = exp_dir / "config.yml" with config_path.open("r", encoding="utf-8") as file: config: dict = yaml.safe_load(file) - config["use_default_model_dir"] = use_default_model_dir return create_config(exp_dir, config) diff --git a/silnlp/nmt/experiment.py b/silnlp/nmt/experiment.py index 1c91e351..ead90ad5 100644 --- a/silnlp/nmt/experiment.py +++ b/silnlp/nmt/experiment.py @@ -23,7 +23,6 @@ class SILExperiment: mixed_precision: bool = True num_devices: int = 1 clearml_queue: Optional[str] = None - save_checkpoints: bool = False run_prep: bool = False run_train: bool = False run_test: bool = False @@ -40,7 +39,6 @@ def __post_init__(self): self.name, self.clearml_queue, commit=self.commit, - use_default_model_dir=self.save_checkpoints, tag=self.clearml_tag, ) self.name: str = self.clearml.name @@ -89,7 +87,6 @@ def test(self): scorers=self.scorers, produce_multiple_translations=self.produce_multiple_translations, save_confidences=self.save_confidences, - use_default_model_dir=self.save_checkpoints, ) def translate(self): @@ -104,7 +101,6 @@ def translate(self): translator = TranslationTask( name=self.name, checkpoint=checkpoint, - use_default_model_dir=self.save_checkpoints, commit=self.commit, ) @@ -253,9 +249,6 @@ def main() -> None: args.train = True args.test = True - if not args.train: - args.save_checkpoints = True - exp = SILExperiment( name=args.experiment, make_stats=args.stats, @@ -265,7 +258,6 @@ def main() -> None: clearml_queue=args.clearml_queue, clearml_tag=args.clearml_tag, commit=args.commit, - save_checkpoints=args.save_checkpoints, run_prep=args.preprocess, run_train=args.train, run_test=args.test, @@ -275,6 +267,9 @@ def main() -> None: scorers=set(s.lower() for s in args.scorers), score_by_book=args.score_by_book, ) + + if not args.save_checkpoints: + SIL_NLP_ENV.delete_path_on_exit(get_mt_exp_dir(args.experiment) / "run") exp.run() diff --git a/silnlp/nmt/hugging_face_config.py b/silnlp/nmt/hugging_face_config.py index 59225275..e5e358e9 100644 --- a/silnlp/nmt/hugging_face_config.py +++ b/silnlp/nmt/hugging_face_config.py @@ -314,7 +314,6 @@ def get_parent_model_name(parent_exp: str) -> str: class HuggingFaceConfig(Config): def __init__(self, exp_dir: Path, config: dict) -> None: - ckpt_dir = str(exp_dir / "run") if config["use_default_model_dir"] else SIL_NLP_ENV.get_temp_model_dir() config = merge_dict( { "data": { @@ -340,7 +339,7 @@ def __init__(self, exp_dir: Path, config: dict) -> None: "auto_grad_acc": False, "max_steps": 5000, "group_by_length": True, - "output_dir": ckpt_dir, + "output_dir": str(exp_dir / "run"), "delete_checkpoint_optimizer_state": True, "delete_checkpoint_tokenizer": True, "log_level": "info", diff --git a/silnlp/nmt/test.py b/silnlp/nmt/test.py index b6c1405b..74a66037 100644 --- a/silnlp/nmt/test.py +++ b/silnlp/nmt/test.py @@ -664,10 +664,9 @@ def test( by_book: bool = False, produce_multiple_translations: bool = False, save_confidences: bool = False, - use_default_model_dir: bool = True, ): exp_name = experiment - config = load_config(exp_name, use_default_model_dir) + config = load_config(exp_name) if not any(config.exp_dir.glob("test*.src.txt")): LOGGER.info("No test dataset.") @@ -885,7 +884,6 @@ def main() -> None: by_book=args.by_book, produce_multiple_translations=args.multiple_translations, save_confidences=args.save_confidences, - use_default_model_dir=True, ) diff --git a/silnlp/nmt/translate.py b/silnlp/nmt/translate.py index a9e73d33..2d936616 100644 --- a/silnlp/nmt/translate.py +++ b/silnlp/nmt/translate.py @@ -47,7 +47,6 @@ def __exit__( class TranslationTask: name: str checkpoint: Union[str, int] = "last" - use_default_model_dir: bool = True clearml_queue: Optional[str] = None commit: Optional[str] = None clearml_tag: Optional[str] = None @@ -284,7 +283,6 @@ def _init_translation_task(self, experiment_suffix: str) -> Tuple[Translator, Co project_suffix="_infer", experiment_suffix=experiment_suffix, commit=self.commit, - use_default_model_dir=self.use_default_model_dir, tag=self.clearml_tag, ) self.name = clearml.name