
Add checkpoint saving and loading functionality to training loop #123

Merged 74 commits from allow_training_checkpoints into master on Dec 7, 2020. The diff below shows changes from 44 commits.

Commits (74)
dc55230
Add checkpoint saving and loading functionality to training loop
lvermue Nov 1, 2020
9452a01
Flake8 and parameters defaults
lvermue Nov 1, 2020
21ab40e
Flake8
lvermue Nov 1, 2020
c05b38b
Refactor internal completed epoch tracking
lvermue Nov 1, 2020
3a0f420
Correct epoch number handling in training loop
lvermue Nov 2, 2020
326cf57
Add automatic loading of early stoppers from training loop checkpoints
lvermue Nov 2, 2020
12f5d9a
Add training loop checkpoint support for pipelines
lvermue Nov 2, 2020
0af8dbe
Fix indentation
lvermue Nov 2, 2020
aaa7eb2
Change training loop checksum creation from adler32 to md5
lvermue Nov 2, 2020
7ec0d44
Fix missing random_seed failure, if no training config file is provided
lvermue Nov 2, 2020
7b84f70
Flake8, refactor code and add CheckpointMismatchError
lvermue Nov 2, 2020
21a982c
Fix flake8
lvermue Nov 2, 2020
11e7c4e
Update exception
cthoyt Nov 2, 2020
5f11c15
Add outline for checkpoint tutorial
cthoyt Nov 2, 2020
fc9a5a9
Correct random state recovery
lvermue Nov 2, 2020
2bf5b98
Merge branch 'allow_training_checkpoints' of https://github.com/pykee…
lvermue Nov 2, 2020
256beaa
Remove pipeline checkpoint helper file help function
lvermue Nov 22, 2020
1e170f3
Remove torch save helper file
lvermue Nov 22, 2020
3ba5288
Fix function argument for torch save
lvermue Nov 22, 2020
cfd0598
Add unit tests for training loop checkpoint
lvermue Nov 22, 2020
8d2740f
Merge branch 'master' into allow_training_checkpoints
cthoyt Nov 23, 2020
07c3d42
Add unit tests for checkpoints
lvermue Nov 23, 2020
37b777b
Fix flake8
lvermue Nov 23, 2020
8af806f
Add failure fallback checkpoints to training loop
lvermue Nov 23, 2020
bc20d35
Correct checkpoint root dir handling
lvermue Nov 23, 2020
17cdfcc
Add implicit random seed handling from checkpoints in pipeline
lvermue Nov 24, 2020
b036b9d
Code cleanup
cthoyt Nov 24, 2020
5d2e47b
More refactoring
cthoyt Nov 24, 2020
583d494
Add CPU/GPU random state differentiation
lvermue Nov 24, 2020
604bc2f
Merge branch 'allow_training_checkpoints' of https://github.com/pykee…
lvermue Nov 24, 2020
26e0b39
Workaround for CUDA rng state
cthoyt Nov 24, 2020
a564fef
Refactor tests
cthoyt Nov 24, 2020
67776c8
Remove unnecessary stuff
cthoyt Nov 24, 2020
578fd1e
Unnest logic
cthoyt Nov 24, 2020
09d03e3
Improve typing and safety
cthoyt Nov 24, 2020
9ca40dc
Fix testing
lvermue Nov 24, 2020
5d64d6c
Fix pipeline checkpoint unit tests
lvermue Nov 24, 2020
bf82e79
Merge branch 'master' into allow_training_checkpoints
lvermue Nov 24, 2020
661eed3
Fix usage of forbidden characters for Windows in filepaths
lvermue Nov 24, 2020
675bd59
Refactor loading of states for the training loop and stoppers
lvermue Nov 25, 2020
f618a90
Fix flake8
lvermue Nov 25, 2020
14e14b7
Fix handling of stopper state dictionaries
lvermue Nov 25, 2020
cd64e7e
Merge branch 'master' into allow_training_checkpoints
lvermue Nov 25, 2020
d9abffd
Fix flake8
lvermue Nov 25, 2020
f226b0a
Merge branch 'master' into allow_training_checkpoints
lvermue Nov 29, 2020
240140b
Fix unit tests
lvermue Nov 29, 2020
3cf1f69
Refactor pipeline unit tests
lvermue Nov 29, 2020
29780d0
Refactor training loop unit tests
lvermue Nov 29, 2020
bf6190b
Fix flake8
lvermue Nov 29, 2020
04b31c9
Merge branch 'master' into allow_training_checkpoints
lvermue Dec 1, 2020
1cabf5f
Add saving of checkpoints after successful training
lvermue Dec 1, 2020
1fa0f18
Add usage of temporary directories for unit tests
lvermue Dec 1, 2020
46bb380
Add checkpoint documentation and correct failure checkpoint handling
lvermue Dec 1, 2020
2b32fbb
Trigger CI
PyKEEN-bot Dec 1, 2020
566e9ab
Add missing variable default value
cthoyt Dec 1, 2020
778ce48
Get rid of tqdms
cthoyt Dec 1, 2020
317f639
Pass flake8
cthoyt Dec 1, 2020
563e5a4
Use class teardown for handling temporary directory
cthoyt Dec 1, 2020
7fba750
Update docs
cthoyt Dec 2, 2020
65cefa7
Update argument names and type hints
cthoyt Dec 2, 2020
52656ec
Trigger CI
PyKEEN-bot Dec 2, 2020
a745853
Add datetime formatting
lvermue Dec 2, 2020
12e3fed
Add docs for checkpoint_on_failure_file_path
lvermue Dec 2, 2020
86749bf
Merge branch 'master' into allow_training_checkpoints
cthoyt Dec 4, 2020
37069be
Update constants
cthoyt Dec 4, 2020
ffd8e55
Change temp dir creation and teardown during unit tests
lvermue Dec 5, 2020
d285704
Merge branch 'allow_training_checkpoints' of https://github.com/pykee…
lvermue Dec 5, 2020
b5fb84f
Update the checkpoint tutorial
lvermue Dec 5, 2020
92ed682
Trigger CI
PyKEEN-bot Dec 5, 2020
33f0776
Fix temp dir name handling
lvermue Dec 5, 2020
cc36bde
Trigger CI
PyKEEN-bot Dec 5, 2020
ed6b98e
Small fixes in docs
cthoyt Dec 7, 2020
6231d23
Merge branch 'master' into allow_training_checkpoints
lvermue Dec 7, 2020
605b76e
Trigger CI
PyKEEN-bot Dec 7, 2020
1 change: 1 addition & 0 deletions docs/source/index.rst
@@ -11,6 +11,7 @@ PyKEEN
    tutorial/first_steps
    tutorial/understanding_evaluation
    tutorial/translational_toy_example
+   tutorial/checkpoints
    tutorial/running_hpo
    tutorial/running_ablation
    tutorial/byod
9 changes: 9 additions & 0 deletions docs/source/tutorial/checkpoints.rst
@@ -0,0 +1,9 @@
Using Checkpoints
=================
Why would someone want to use checkpoints?
Review comment (Member): @lvermue write up this tutorial please


Give an example of a run that will obviously crash

How to recover when you were smart enough to keep checkpoints?

Where is this applicable? pipeline / hpo pipeline?
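A minimal sketch of what the finished tutorial could demonstrate, based on the checkpoint_file key that this PR adds to training_kwargs (the dataset and model choices are illustrative; unless checkpoint_root is also given, the file lands in the default checkpoint directory under ~/.pykeen/checkpoints):

from pykeen.pipeline import pipeline

# If this run crashes partway through (power loss, preemption, OOM), re-running
# the exact same call resumes from the last saved checkpoint instead of starting
# over, because the pipeline reloads the stored random seed and training state.
result = pipeline(
    dataset='Nations',  # illustrative dataset
    model='TransE',     # illustrative model
    training_kwargs=dict(
        num_epochs=1000,
        checkpoint_file='my_checkpoint.pt',
    ),
)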
6 changes: 6 additions & 0 deletions src/pykeen/constants.py
@@ -3,9 +3,15 @@
"""Constants for PyKEEN."""

import os
import pathlib

__all__ = [
    'PYKEEN_HOME',
    'PYKEEN_DEFAULT_CHECKPOINT_DIR',
]

PYKEEN_HOME = os.environ.get('PYKEEN_HOME') or os.path.join(os.path.expanduser('~'), '.pykeen')
PYKEEN_DEFAULT_CHECKPOINT = "PyKEEN_just_saved_my_day.pt"
Review comment (Member): perfect name


PYKEEN_DEFAULT_CHECKPOINT_DIR = pathlib.Path(PYKEEN_HOME).joinpath("checkpoints")
PYKEEN_DEFAULT_CHECKPOINT_DIR.mkdir(exist_ok=True, parents=True)
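The default checkpoint directory derives from PYKEEN_HOME, which may be overridden through the environment before PyKEEN is first imported, since the directory is resolved and created at import time. A small sketch of that resolution (the override path is illustrative):

import os

# The override must happen before the import below; constants.py resolves and
# creates the directory at import time (the path shown is illustrative).
os.environ['PYKEEN_HOME'] = '/tmp/my_pykeen_home'

from pykeen.constants import PYKEEN_DEFAULT_CHECKPOINT_DIR

print(PYKEEN_DEFAULT_CHECKPOINT_DIR)  # /tmp/my_pykeen_home/checkpoints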
2 changes: 2 additions & 0 deletions src/pykeen/models/base.py
@@ -265,8 +265,10 @@ def __init__(
         # Random seeds have to be set before the embeddings are initialized
         if random_seed is None:
             logger.warning('No random seed is specified. This may lead to non-reproducible results.')
+            self._random_seed = None
         elif random_seed is not NoRandomSeedNecessary:
             set_random_seed(random_seed)
+            self._random_seed = random_seed

         if automatic_memory_optimization is None:
             automatic_memory_optimization = True
28 changes: 24 additions & 4 deletions src/pykeen/pipeline.py
@@ -168,6 +168,7 @@
 import json
 import logging
 import os
+import pathlib
 import time
 from dataclasses import dataclass, field
 from typing import Any, Collection, Dict, Iterable, List, Mapping, Optional, Set, Type, Union
@@ -176,6 +177,7 @@
 import torch
 from torch.optim.optimizer import Optimizer

+from .constants import PYKEEN_DEFAULT_CHECKPOINT_DIR
 from .datasets import get_dataset
 from .datasets.base import DataSet
 from .evaluation import Evaluator, MetricResults, get_evaluator_cls
@@ -823,7 +825,28 @@ def pipeline(  # noqa: C901
     :param use_testing_data:
         If true, use the testing triples. Otherwise, use the validation triples. Defaults to true - use testing triples.
     """
-    if random_seed is None:
+    if training_kwargs is None:
+        training_kwargs = {}
+
+    # To allow resuming training from a checkpoint when using a pipeline, the pipeline needs to obtain the
+    # used random_seed to ensure reproducible results
+    checkpoint_file_name = training_kwargs.get('checkpoint_file')
+    if checkpoint_file_name is not None:
+        checkpoint_directory = pathlib.Path(training_kwargs.get('checkpoint_root', PYKEEN_DEFAULT_CHECKPOINT_DIR))
+        checkpoint_directory.mkdir(parents=True, exist_ok=True)
+        checkpoint_path = checkpoint_directory / checkpoint_file_name
+        if checkpoint_path.is_file():
+            checkpoint_dict = torch.load(checkpoint_path)
+            random_seed = checkpoint_dict['random_seed']
+            logger.info('loaded random seed %s from checkpoint.', random_seed)
+            # We have to set clear_optimizer to False since training should be continued
+            clear_optimizer = False
+        else:
+            logger.info(f"=> no training loop checkpoint file found at '{checkpoint_path}'. Creating a new file.")
+            if random_seed is None:
+                random_seed = random_non_negative_int()
+                logger.warning(f'No random seed is specified. Setting to {random_seed}.')
+    elif random_seed is None:
+        random_seed = random_non_negative_int()
+        logger.warning(f'No random seed is specified. Setting to {random_seed}.')
     set_random_seed(random_seed)
@@ -939,9 +962,6 @@ def pipeline(  # noqa: C901
     if evaluation_kwargs is None:
         evaluation_kwargs = {}

-    if training_kwargs is None:
-        training_kwargs = {}
-
     # Stopping
     if 'stopper' in training_kwargs and stopper is not None:
         raise ValueError('Specified stopper in training_kwargs and as stopper')
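Since the checkpoint is a plain dictionary written with torch.save, it can also be inspected directly. A sketch that assumes a checkpoint named my_checkpoint.pt already exists in the default directory; the two keys shown are the ones read in this PR ('random_seed' here in the pipeline, 'stopper_dict' in the stopper code below):

import torch

from pykeen.constants import PYKEEN_DEFAULT_CHECKPOINT_DIR

# Load the checkpoint dict exactly as the pipeline does.
checkpoint_path = PYKEEN_DEFAULT_CHECKPOINT_DIR / 'my_checkpoint.pt'  # illustrative file name
checkpoint_dict = torch.load(checkpoint_path)

print(checkpoint_dict['random_seed'])   # seed restored for reproducible resumption
print(checkpoint_dict['stopper_dict'])  # summary used to rehydrate an early stopper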
23 changes: 23 additions & 0 deletions src/pykeen/stoppers/early_stopping.py
@@ -191,3 +191,26 @@ def get_summary_dict(self) -> Mapping[str, Any]:
            best_epoch=self.best_epoch,
            best_metric=self.best_metric,
        )

    def _write_from_summary_dict(
        self,
        frequency: int,
        patience: int,
        relative_delta: float,
        metric: str,
        larger_is_better: bool,
        results: List[float],
        stopped: bool,
        best_epoch: int,
        best_metric: float,
    ) -> None:
        """Write attributes to stopper from a summary dict."""
        self.frequency = frequency
        self.patience = patience
        self.relative_delta = relative_delta
        self.metric = metric
        self.larger_is_better = larger_is_better
        self.results = results
        self.stopped = stopped
        self.best_epoch = best_epoch
        self.best_metric = best_metric
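A sketch of the round trip these attributes enable, assuming stopper is the EarlyStopper of an interrupted run and fresh_stopper is a newly constructed EarlyStopper for the resumed run (both instances are assumed to exist already; the training loop performs these steps internally):

# The summary dict is what gets embedded in the training loop checkpoint ...
summary = stopper.get_summary_dict()

# ... and on resumption, the fresh stopper is rehydrated from it.
fresh_stopper._write_from_summary_dict(**summary)
assert fresh_stopper.best_epoch == stopper.best_epoch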
34 changes: 34 additions & 0 deletions src/pykeen/stoppers/stopper.py
@@ -2,13 +2,20 @@

"""Basic stoppers."""

import logging
import pathlib
from abc import ABC, abstractmethod
from typing import Any, Mapping, Union

import torch

__all__ = [
    'Stopper',
    'NopStopper',
]

logger = logging.getLogger(__name__)


class Stopper(ABC):
"""A harness for stopping training."""
@@ -25,6 +32,29 @@ def should_stop(self, epoch: int) -> bool:
"""Validate on validation set and check for termination condition."""
raise NotImplementedError

@abstractmethod
def get_summary_dict(self) -> Mapping[str, Any]:
"""Get a summary dict."""
raise NotImplementedError

def _write_from_summary_dict(self, **kwargs):
pass

@staticmethod
def load_summary_dict_from_training_loop_checkpoint(path: Union[str, pathlib.Path]) -> Mapping[str, Any]:
"""Load the summary dict from a training loop checkpoint.

:param path:
Path of the file where to store the state in.

:return:
The summary dict of the stopper at the time of saving the checkpoint.
"""
logger.info(f"=> loading stopper summary dict from training loop checkpoint in '{path}'")
checkpoint = torch.load(path)
logger.info(f"=> loaded stopper summary dictionary from checkpoint in '{path}'")
return checkpoint['stopper_dict']


class NopStopper(Stopper):
"""A stopper that does nothing."""
@@ -36,3 +66,7 @@ def should_evaluate(self, epoch: int) -> bool:
    def should_stop(self, epoch: int) -> bool:
        """Return false; should never stop."""
        return False

    def get_summary_dict(self) -> Mapping[str, Any]:
        """Return an empty mapping; a NopStopper has no attributes to save."""
        return dict()
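Restoring a stopper's state from an existing checkpoint file then only needs the static helper added above. A sketch with an illustrative file name, assuming the checkpoint was written by a previous training run:

from pykeen.constants import PYKEEN_DEFAULT_CHECKPOINT_DIR
from pykeen.stoppers.stopper import Stopper

# Pull only the stopper's saved summary out of a training loop checkpoint.
path = PYKEEN_DEFAULT_CHECKPOINT_DIR / 'my_checkpoint.pt'  # illustrative file name
stopper_dict = Stopper.load_summary_dict_from_training_loop_checkpoint(path)
# stopper_dict holds the values from get_summary_dict, e.g. best_epoch and best_metric.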