Add checkpoint saving and loading functionality to training loop #123

Merged
merged 74 commits into master from allow_training_checkpoints on Dec 7, 2020
Changes from 9 commits

Commits (74)
dc55230
Add checkpoint saving and loading functionality to training loop
lvermue Nov 1, 2020
9452a01
Flake8 and parameters defaults
lvermue Nov 1, 2020
21ab40e
Flake8
lvermue Nov 1, 2020
c05b38b
Refactor internal completed epoch tracking
lvermue Nov 1, 2020
3a0f420
Correct epoch number handling in training loop
lvermue Nov 2, 2020
326cf57
Add automatic loading of early stoppers from training loop checkpoints
lvermue Nov 2, 2020
12f5d9a
Add training loop checkpoint support for pipelines
lvermue Nov 2, 2020
0af8dbe
Fix indentation
lvermue Nov 2, 2020
aaa7eb2
Change training loop checksum creation from adler32 to md5
lvermue Nov 2, 2020
7ec0d44
Fix missing random_seed failure, if no training config file is provided
lvermue Nov 2, 2020
7b84f70
Flake8, refactor code and add CheckpointMismatchError
lvermue Nov 2, 2020
21a982c
Fix flake8
lvermue Nov 2, 2020
11e7c4e
Update exception
cthoyt Nov 2, 2020
5f11c15
Add outline for checkpoint tutorial
cthoyt Nov 2, 2020
fc9a5a9
Correct random state recovery
lvermue Nov 2, 2020
2bf5b98
Merge branch 'allow_training_checkpoints' of https://github.com/pykee…
lvermue Nov 2, 2020
256beaa
Remove pipeline checkpoint helper file help function
lvermue Nov 22, 2020
1e170f3
Remove torch save helper file
lvermue Nov 22, 2020
3ba5288
Fix function argument for torch save
lvermue Nov 22, 2020
cfd0598
Add units tests for training loop checkpoint
lvermue Nov 22, 2020
8d2740f
Merge branch 'master' into allow_training_checkpoints
cthoyt Nov 23, 2020
07c3d42
Add unit tests for checkpoints
lvermue Nov 23, 2020
37b777b
Fix flake8
lvermue Nov 23, 2020
8af806f
Add failure fallback checkpoints to training loop
lvermue Nov 23, 2020
bc20d35
Correct checkpoint root dir handling
lvermue Nov 23, 2020
17cdfcc
Add implicit random seed handling from checkpoints in pipeline
lvermue Nov 24, 2020
b036b9d
Code cleanup
cthoyt Nov 24, 2020
5d2e47b
More refactoring
cthoyt Nov 24, 2020
583d494
Add CPU/GPU random state differentiation
lvermue Nov 24, 2020
604bc2f
Merge branch 'allow_training_checkpoints' of https://github.com/pykee…
lvermue Nov 24, 2020
26e0b39
Workaround for CUDA rng state
cthoyt Nov 24, 2020
a564fef
Refactor tests
cthoyt Nov 24, 2020
67776c8
Remove unnecessary stuff
cthoyt Nov 24, 2020
578fd1e
Unnest logic
cthoyt Nov 24, 2020
09d03e3
Improve typing and safety
cthoyt Nov 24, 2020
9ca40dc
Fix testing
lvermue Nov 24, 2020
5d64d6c
Fix pipeline checkpoint unit tests
lvermue Nov 24, 2020
bf82e79
Merge branch 'master' into allow_training_checkpoints
lvermue Nov 24, 2020
661eed3
Fix usage of forbidden characters for Windows in filepaths
lvermue Nov 24, 2020
675bd59
Refactor loading of states for the training loop and stoppers
lvermue Nov 25, 2020
f618a90
Fix flake8
lvermue Nov 25, 2020
14e14b7
Fix handling of stopper state dictionaries
lvermue Nov 25, 2020
cd64e7e
Merge branch 'master' into allow_training_checkpoints
lvermue Nov 25, 2020
d9abffd
Fix flake8
lvermue Nov 25, 2020
f226b0a
Merge branch 'master' into allow_training_checkpoints
lvermue Nov 29, 2020
240140b
Fix unit tests
lvermue Nov 29, 2020
3cf1f69
Refactor pipeline unit tests
lvermue Nov 29, 2020
29780d0
Refactor training loop unit tests
lvermue Nov 29, 2020
bf6190b
Fix flake8
lvermue Nov 29, 2020
04b31c9
Merge branch 'master' into allow_training_checkpoints
lvermue Dec 1, 2020
1cabf5f
Add saving of checkpoints after successful training
lvermue Dec 1, 2020
1fa0f18
Add usage of temporary directories for unit tests
lvermue Dec 1, 2020
46bb380
Add checkpoint documentation and correct failure checkpoint handling
lvermue Dec 1, 2020
2b32fbb
Trigger CI
PyKEEN-bot Dec 1, 2020
566e9ab
Add missing variable default value
cthoyt Dec 1, 2020
778ce48
Get rid of tqdms
cthoyt Dec 1, 2020
317f639
Pass flake8
cthoyt Dec 1, 2020
563e5a4
Use class teardown for handling temporary directory
cthoyt Dec 1, 2020
7fba750
Update docs
cthoyt Dec 2, 2020
65cefa7
Update argument names and type hints
cthoyt Dec 2, 2020
52656ec
Trigger CI
PyKEEN-bot Dec 2, 2020
a745853
Add datetime formatting
lvermue Dec 2, 2020
12e3fed
Add docs for checkpoint_on_failure_file_path
lvermue Dec 2, 2020
86749bf
Merge branch 'master' into allow_training_checkpoints
cthoyt Dec 4, 2020
37069be
Update constants
cthoyt Dec 4, 2020
ffd8e55
Change temp dir creation and teardown during unit tests
lvermue Dec 5, 2020
d285704
Merge branch 'allow_training_checkpoints' of https://github.com/pykee…
lvermue Dec 5, 2020
b5fb84f
Update the checkpoint tutorial
lvermue Dec 5, 2020
92ed682
Trigger CI
PyKEEN-bot Dec 5, 2020
33f0776
Fix temp dir name handling
lvermue Dec 5, 2020
cc36bde
Trigger CI
PyKEEN-bot Dec 5, 2020
ed6b98e
Small fixes in docs
cthoyt Dec 7, 2020
6231d23
Merge branch 'master' into allow_training_checkpoints
lvermue Dec 7, 2020
605b76e
Trigger CI
PyKEEN-bot Dec 7, 2020
47 changes: 44 additions & 3 deletions src/pykeen/pipeline.py
@@ -699,6 +699,34 @@ def pipeline_from_config(
)


def save_pipeline_checkpoint_helper_file(path: str, random_seed: int) -> None:
"""Save the pipeline checkpoint helper file.

:param path:
The path of the file in which to store the pipeline checkpoint helper state.
:param random_seed:
The random_seed that was used for the pipeline.
"""
torch.save(
{
'random_seed': random_seed,
},
path,
)


def load_pipeline_checkpoint_helper_file(path: str) -> Mapping[str, Any]:
"""Load the pipeline checkpoint helper file.

:param path:
The path of the file from which to load the pipeline checkpoint helper state.

:return:
The dictionary loaded from the pipeline checkpoint helper file.
"""
return torch.load(path)


def pipeline( # noqa: C901
*,
# 1. Dataset
@@ -823,9 +851,22 @@ def pipeline(  # noqa: C901
:param use_testing_data:
If true, use the testing triples. Otherwise, use the validation triples. Defaults to true - use testing triples.
"""
if random_seed is None:
random_seed = random_non_negative_int()
logger.warning(f'No random seed is specified. Setting to {random_seed}.')
# To allow resuming training from a checkpoint when using a pipeline, the pipeline needs to store a helper file
# containing the used random_seed to ensure reproducible results
if training_kwargs.get('checkpoint_file'):
checkpoint_file = training_kwargs.get('checkpoint_file')
pipeline_checkpoint_helper_file = f"{checkpoint_file}_pipeline_helper_file"
Member: let's make a better naming convention for this

Member Author: Any ideas?

if os.path.isfile(pipeline_checkpoint_helper_file):
pipeline_checkpoint_helper_dict = load_pipeline_checkpoint_helper_file(pipeline_checkpoint_helper_file)
random_seed = pipeline_checkpoint_helper_dict['random_seed']
Member: since the other function sets the states directly, is saving it in a file even necessary now?

Member Author: It would at least still be relevant for the pipeline to correctly return the used random seed. Aside from that, it gives us the guarantee that whatever random method is involved before we resume the training loop also uses the correct random seed.

logger.info(f'Loaded random seed {random_seed} from checkpoint.')
else:
logger.info(f"=> no pipeline checkpoint helper file found at '{checkpoint_file}'. Creating a new file.")
if random_seed is None:
random_seed = random_non_negative_int()
logger.warning(f'No random seed is specified. Setting to {random_seed}.')
save_pipeline_checkpoint_helper_file(path=pipeline_checkpoint_helper_file, random_seed=random_seed)

set_random_seed(random_seed)

result_tracker_cls: Type[ResultTracker] = get_result_tracker_cls(result_tracker)
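For orientation, a minimal usage sketch that is not part of this diff (the dataset, model, and file path are placeholder assumptions): resuming a pipeline run works by passing the same checkpoint_file through training_kwargs, so the '<checkpoint_file>_pipeline_helper_file' written above can restore the stored random seed on the next call.

from pykeen.pipeline import pipeline

# The first call creates 'my_checkpoint.pt' plus 'my_checkpoint.pt_pipeline_helper_file'.
# Re-running the identical call after an interruption reloads the stored random seed
# and resumes training from the last checkpointed epoch.
result = pipeline(
    dataset='Nations',  # placeholder dataset
    model='TransE',  # placeholder model
    training_kwargs=dict(
        num_epochs=1000,
        checkpoint_file='my_checkpoint.pt',  # placeholder path
    ),
)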
23 changes: 23 additions & 0 deletions src/pykeen/stoppers/early_stopping.py
@@ -191,3 +191,26 @@ def get_summary_dict(self) -> Mapping[str, Any]:
best_epoch=self.best_epoch,
best_metric=self.best_metric,
)

def _write_from_summary_dict(
self,
frequency: int,
patience: int,
relative_delta: float,
metric: str,
larger_is_better: bool,
results: List[float],
stopped: bool,
best_epoch: int,
best_metric: float,
) -> None:
"""Write attributes to stopper from a summary dict."""
self.frequency = frequency
self.patience = patience
self.relative_delta = relative_delta
self.metric = metric
self.larger_is_better = larger_is_better
self.results = results
self.stopped = stopped
self.best_epoch = best_epoch
self.best_metric = best_metric
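A minimal sketch of the intended round trip (assuming an existing EarlyStopper instance named stopper): the summary dict that the training loop stores in a checkpoint is written back onto the stopper when training resumes, mirroring the call to _write_from_summary_dict in TrainingLoop.train() further below.

summary = stopper.get_summary_dict()  # serialized into the checkpoint under 'stopper_dict'
# ... checkpoint is written, training is interrupted, a new run starts ...
stopper._write_from_summary_dict(**summary)  # restores frequency, patience, results, stopped, ...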
10 changes: 10 additions & 0 deletions src/pykeen/stoppers/stopper.py
@@ -3,6 +3,7 @@
"""Basic stoppers."""

from abc import ABC, abstractmethod
from typing import Any, Mapping

__all__ = [
'Stopper',
@@ -25,6 +26,11 @@ def should_stop(self, epoch: int) -> bool:
"""Validate on validation set and check for termination condition."""
raise NotImplementedError

@abstractmethod
def get_summary_dict(self) -> Mapping[str, Any]:
"""Get a summary dict."""
raise NotImplementedError


class NopStopper(Stopper):
"""A stopper that does nothing."""
@@ -36,3 +42,7 @@ def should_evaluate(self, epoch: int) -> bool:
def should_stop(self, epoch: int) -> bool:
"""Return false; should never stop."""
return False

def get_summary_dict(self) -> Mapping[str, Any]:
"""Return empty mapping, doesn't have any attributes."""
return dict()
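Since get_summary_dict is now abstract on Stopper, any custom stopper defined outside this PR must implement it as well; a hypothetical minimal subclass could look like this (class name and import path are assumptions).

from typing import Any, Mapping

from pykeen.stoppers import Stopper  # assumed import path


class EveryTenEpochsStopper(Stopper):
    """Hypothetical stopper that evaluates every ten epochs and never stops."""

    def should_evaluate(self, epoch: int) -> bool:
        return epoch % 10 == 0

    def should_stop(self, epoch: int) -> bool:
        return False

    def get_summary_dict(self) -> Mapping[str, Any]:
        # Nothing needs to be persisted in a checkpoint for this stopper.
        return dict()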
162 changes: 144 additions & 18 deletions src/pykeen/training/training_loop.py
@@ -4,7 +4,10 @@

import gc
import logging
import os
import time
from abc import ABC, abstractmethod
from hashlib import md5
from typing import Any, List, Mapping, Optional, Tuple, Type, Union

import torch
@@ -103,6 +106,10 @@ def __init__(
else:
self._loss_helper = self._label_loss_helper

# The internal epoch state tracks the last finished epoch of the training loop to allow for
# seamless loading and saving of training checkpoints
self._epoch = 0

@classmethod
def get_normalized_name(cls) -> str:
"""Get the normalized name of the training loop."""
@@ -118,6 +125,14 @@ def device(self):  # noqa: D401
"""The device used by the model."""
return self.model.device

@property
def checksum(self) -> str: # noqa: D401
"""The checksum of the model and optimizer the training loop was configured with."""
h = md5()
h.update(str(self.model).encode('utf-8'))
h.update(str(self.optimizer).encode('utf-8'))
return h.hexdigest()

def train(
self,
num_epochs: int = 1,
@@ -135,6 +150,8 @@ def train(
sub_batch_size: Optional[int] = None,
num_workers: Optional[int] = None,
clear_optimizer: bool = False,
checkpoint_file: Optional[str] = None,
checkpoint_frequency: Optional[int] = None,
) -> List[float]:
"""Train the KGE model.

@@ -168,6 +185,11 @@
:param clear_optimizer:
Whether to delete the optimizer instance after training (as the optimizer might have additional memory
consumption due to e.g. moments in Adam).
:param checkpoint_file:
The filename for saving checkpoints. If the given filename exists already, that file will be loaded and used
to continue training.
:param checkpoint_frequency:
The frequency of saving checkpoints in minutes.

:return:
The losses per epoch.
@@ -179,22 +201,39 @@
# In some cases, e.g. using Optuna for HPO, the cuda cache from a previous run is not cleared
torch.cuda.empty_cache()

result = self._train(
num_epochs=num_epochs,
batch_size=batch_size,
slice_size=slice_size,
label_smoothing=label_smoothing,
sampler=sampler,
continue_training=continue_training,
only_size_probing=only_size_probing,
use_tqdm=use_tqdm,
use_tqdm_batch=use_tqdm_batch,
tqdm_kwargs=tqdm_kwargs,
stopper=stopper,
result_tracker=result_tracker,
sub_batch_size=sub_batch_size,
num_workers=num_workers,
)
# If a checkpoint file is given, check whether it already exists and, if so, load it
if checkpoint_file:
if os.path.isfile(checkpoint_file):
stopper_dict = self._load_state(path=checkpoint_file)
# If the stopper dict has any keys, those are written back to the stopper
if stopper_dict:
stopper._write_from_summary_dict(**stopper_dict)
continue_training = True
else:
logger.info(f"=> no checkpoint found at '{checkpoint_file}'. Creating a new file.")

# If the stopper loaded from the training loop checkpoint stopped the training, we return those results
if getattr(stopper, 'stopped', False):
result = self.losses_per_epochs
else:
result = self._train(
num_epochs=num_epochs,
batch_size=batch_size,
slice_size=slice_size,
label_smoothing=label_smoothing,
sampler=sampler,
continue_training=continue_training,
only_size_probing=only_size_probing,
use_tqdm=use_tqdm,
use_tqdm_batch=use_tqdm_batch,
tqdm_kwargs=tqdm_kwargs,
stopper=stopper,
result_tracker=result_tracker,
sub_batch_size=sub_batch_size,
num_workers=num_workers,
checkpoint_file=checkpoint_file,
checkpoint_frequency=checkpoint_frequency,
)

# Ensure the release of memory
torch.cuda.empty_cache()
@@ -221,6 +260,8 @@ def _train(  # noqa: C901
result_tracker: Optional[ResultTracker] = None,
sub_batch_size: Optional[int] = None,
num_workers: Optional[int] = None,
checkpoint_file: Optional[str] = None,
checkpoint_frequency: Optional[int] = None,
) -> List[float]:
"""Train the KGE model.

@@ -255,6 +296,10 @@
If provided split each batch into sub-batches to avoid memory issues for large models / small GPUs.
:param num_workers:
The number of child CPU workers used for loading data. If None, data are loaded in the main process.
:param checkpoint_file:
The filename for saving checkpoints.
:param checkpoint_frequency:
The frequency of saving checkpoints in minutes. Setting it to 0 will save a checkpoint after every epoch.

:return:
The losses per epoch.
@@ -322,9 +367,11 @@ def _train(  # noqa: C901
_tqdm_kwargs = dict(desc=f'Training epochs on {self.device}', unit='epoch')
if tqdm_kwargs is not None:
_tqdm_kwargs.update(tqdm_kwargs)
epochs = trange(1, 1 + num_epochs, **_tqdm_kwargs)
else:
epochs = trange(self._epoch + 1, 1 + num_epochs, **_tqdm_kwargs, initial=self._epoch, total=num_epochs)
elif only_size_probing:
epochs = range(1, 1 + num_epochs)
else:
epochs = range(self._epoch + 1, 1 + num_epochs)

logger.debug(f'using stopper: {stopper}')

@@ -336,6 +383,11 @@
num_workers=num_workers,
)

# Save the time to track when the saved point was available
last_checkpoint = time.time()
if checkpoint_frequency is None:
checkpoint_frequency = 30

# Training Loop
for epoch in epochs:
# Enforce training mode
@@ -412,8 +464,23 @@
'prev_loss': self.losses_per_epochs[-2] if epoch > 2 else float('nan'),
})

# Track the last successfully finished epoch
self._epoch = epoch

if stopper is not None and stopper.should_evaluate(epoch) and stopper.should_stop(epoch):
# If a checkpoint file is given, we check whether it is time to save a checkpoint
if checkpoint_file:
minutes_since_last_checkpoint = (time.time() - last_checkpoint) // 60
if minutes_since_last_checkpoint >= checkpoint_frequency:
self._save_state(path=checkpoint_file, stopper=stopper)
return self.losses_per_epochs
else:
# If a checkpoint file is given, we check whether it is time to save a checkpoint
if checkpoint_file:
minutes_since_last_checkpoint = (time.time() - last_checkpoint) // 60
if minutes_since_last_checkpoint >= checkpoint_frequency:
self._save_state(path=checkpoint_file, stopper=stopper)
last_checkpoint = time.time()

return self.losses_per_epochs

@@ -673,3 +740,62 @@ def _free_graph_and_cache(self):
self.model.regularizer.reset()
# The cache of the previous run has to be freed to allow accurate memory availability estimates
torch.cuda.empty_cache()

def _save_state(self, path: str, stopper: Optional[Stopper] = None) -> None:
"""Save the state of the training loop.

:param path:
Path of the file in which to store the state.
:param stopper:
An instance of :class:`pykeen.stoppers.EarlyStopper` with settings for checking
if training should stop early.
"""
logger.debug("=> Saving checkpoint.")

if stopper is None:
stopper_dict = dict()
else:
stopper_dict = stopper.get_summary_dict()

torch.save(
{
'epoch': self._epoch,
'loss': self.losses_per_epochs,
'model_state_dict': self.model.state_dict(),
'optimizer_state_dict': self.optimizer.state_dict(),
'checksum': self.checksum,
'stopper_dict': stopper_dict,
},
path,
)
logger.info(f"=> Saved checkpoint after having finished epoch {self._epoch}.")

def _load_state(self, path: str) -> Mapping[str, Any]:
"""Load the state of the training loop from a checkpoint.

:param path:
Path of the file from which to load the state.

:return:
The summary dict of the stopper at the time of saving the checkpoint.

:raises FileExistsError:
If the given checkpoint file has a non-matching checksum, i.e. it was saved with a different configuration.
"""
logger.info(f"=> loading checkpoint '{path}'")
checkpoint = torch.load(path)
loaded_checksum = checkpoint['checksum']
if loaded_checksum == self.checksum:
self._epoch = checkpoint['epoch']
self.losses_per_epochs = checkpoint['loss']
self.model.load_state_dict(checkpoint['model_state_dict'])
self.optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
stopper_dict = checkpoint['stopper_dict']
logger.info(f"=> loaded checkpoint '{path}' stopped after having finished epoch {checkpoint['epoch']}")
else:
raise FileExistsError(
f"The checkpoint file '{path}' that was provided already exists, but seems to be "
"from a different training loop setup.",
)

return stopper_dict
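Finally, a hedged sketch of how the new arguments are meant to be used directly on a training loop instance (the training_loop and early_stopper objects and the file path are assumptions; this mirrors what the pipeline does internally): if checkpoint_file already exists, _load_state restores the model, optimizer, loss history, and stopper state, and training continues from self._epoch + 1.

# Assumed pre-existing objects: `training_loop` is an instantiated TrainingLoop
# (e.g. an sLCWA training loop) and `early_stopper` is an optional EarlyStopper.
losses = training_loop.train(
    num_epochs=1000,
    stopper=early_stopper,
    checkpoint_file='checkpoint.pt',  # loaded automatically if the file already exists
    checkpoint_frequency=30,  # save at most every 30 minutes (also the default when unset)
)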