Add checkpoint saving and loading functionality to training loop #123

Merged
74 commits merged on Dec 7, 2020
Commits
dc55230
Add checkpoint saving and loading functionality to training loop
lvermue Nov 1, 2020
9452a01
Flake8 and parameters defaults
lvermue Nov 1, 2020
21ab40e
Flake8
lvermue Nov 1, 2020
c05b38b
Refactor internal completed epoch tracking
lvermue Nov 1, 2020
3a0f420
Correct epoch number handling in training loop
lvermue Nov 2, 2020
326cf57
Add automatic loading of early stoppers from training loop checkpoints
lvermue Nov 2, 2020
12f5d9a
Add training loop checkpoint support for pipelines
lvermue Nov 2, 2020
0af8dbe
Fix indentation
lvermue Nov 2, 2020
aaa7eb2
Change training loop checksum creation from adler32 to md5
lvermue Nov 2, 2020
7ec0d44
Fix missing random_seed failure, if no training config file is provided
lvermue Nov 2, 2020
7b84f70
Flake8, refactor code and add CheckpointMismatchError
lvermue Nov 2, 2020
21a982c
Fix flake8
lvermue Nov 2, 2020
11e7c4e
Update exception
cthoyt Nov 2, 2020
5f11c15
Add outline for checkpoint tutorial
cthoyt Nov 2, 2020
fc9a5a9
Correct random state recovery
lvermue Nov 2, 2020
2bf5b98
Merge branch 'allow_training_checkpoints' of https://github.com/pykee…
lvermue Nov 2, 2020
256beaa
Remove pipeline checkpoint helper file help function
lvermue Nov 22, 2020
1e170f3
Remove torch save helper file
lvermue Nov 22, 2020
3ba5288
Fix function argument for torch save
lvermue Nov 22, 2020
cfd0598
Add units tests for training loop checkpoint
lvermue Nov 22, 2020
8d2740f
Merge branch 'master' into allow_training_checkpoints
cthoyt Nov 23, 2020
07c3d42
Add unit tests for checkpoints
lvermue Nov 23, 2020
37b777b
Fix flake8
lvermue Nov 23, 2020
8af806f
Add failure fallback checkpoints to training loop
lvermue Nov 23, 2020
bc20d35
Correct checkpoint root dir handling
lvermue Nov 23, 2020
17cdfcc
Add implicit random seed handling from checkpoints in pipeline
lvermue Nov 24, 2020
b036b9d
Code cleanup
cthoyt Nov 24, 2020
5d2e47b
More refactoring
cthoyt Nov 24, 2020
583d494
Add CPU/GPU random state differentiation
lvermue Nov 24, 2020
604bc2f
Merge branch 'allow_training_checkpoints' of https://github.com/pykee…
lvermue Nov 24, 2020
26e0b39
Workaround for CUDA rng state
cthoyt Nov 24, 2020
a564fef
Refactor tests
cthoyt Nov 24, 2020
67776c8
Remove unnecessary stuff
cthoyt Nov 24, 2020
578fd1e
Unnest logic
cthoyt Nov 24, 2020
09d03e3
Improve typing and safety
cthoyt Nov 24, 2020
9ca40dc
Fix testing
lvermue Nov 24, 2020
5d64d6c
Fix pipeline checkpoint unit tests
lvermue Nov 24, 2020
bf82e79
Merge branch 'master' into allow_training_checkpoints
lvermue Nov 24, 2020
661eed3
Fix usage of forbidden characters for Windows in filepaths
lvermue Nov 24, 2020
675bd59
Refactor loading of states for the training loop and stoppers
lvermue Nov 25, 2020
f618a90
Fix flake8
lvermue Nov 25, 2020
14e14b7
Fix handling of stopper state dictionaries
lvermue Nov 25, 2020
cd64e7e
Merge branch 'master' into allow_training_checkpoints
lvermue Nov 25, 2020
d9abffd
Fix flake8
lvermue Nov 25, 2020
f226b0a
Merge branch 'master' into allow_training_checkpoints
lvermue Nov 29, 2020
240140b
Fix unit tests
lvermue Nov 29, 2020
3cf1f69
Refactor pipeline unit tests
lvermue Nov 29, 2020
29780d0
Refactor training loop unit tests
lvermue Nov 29, 2020
bf6190b
Fix flake8
lvermue Nov 29, 2020
04b31c9
Merge branch 'master' into allow_training_checkpoints
lvermue Dec 1, 2020
1cabf5f
Add saving of checkpoints after successful training
lvermue Dec 1, 2020
1fa0f18
Add usage of temporary directories for unit tests
lvermue Dec 1, 2020
46bb380
Add checkpoint documentation and correct failure checkpoint handling
lvermue Dec 1, 2020
2b32fbb
Trigger CI
PyKEEN-bot Dec 1, 2020
566e9ab
Add missing variable default value
cthoyt Dec 1, 2020
778ce48
Get rid of tqdms
cthoyt Dec 1, 2020
317f639
Pass flake8
cthoyt Dec 1, 2020
563e5a4
Use class teardown for handling temporary directory
cthoyt Dec 1, 2020
7fba750
Update docs
cthoyt Dec 2, 2020
65cefa7
Update argument names and type hints
cthoyt Dec 2, 2020
52656ec
Trigger CI
PyKEEN-bot Dec 2, 2020
a745853
Add datetime formatting
lvermue Dec 2, 2020
12e3fed
Add docs for checkpoint_on_failure_file_path
lvermue Dec 2, 2020
86749bf
Merge branch 'master' into allow_training_checkpoints
cthoyt Dec 4, 2020
37069be
Update constants
cthoyt Dec 4, 2020
ffd8e55
Change temp dir creation and teardown during unit tests
lvermue Dec 5, 2020
d285704
Merge branch 'allow_training_checkpoints' of https://github.com/pykee…
lvermue Dec 5, 2020
b5fb84f
Update the checkpoint tutorial
lvermue Dec 5, 2020
92ed682
Trigger CI
PyKEEN-bot Dec 5, 2020
33f0776
Fix temp dir name handling
lvermue Dec 5, 2020
cc36bde
Trigger CI
PyKEEN-bot Dec 5, 2020
ed6b98e
Small fixes in docs
cthoyt Dec 7, 2020
6231d23
Merge branch 'master' into allow_training_checkpoints
lvermue Dec 7, 2020
605b76e
Trigger CI
PyKEEN-bot Dec 7, 2020
1 change: 1 addition & 0 deletions docs/source/index.rst
@@ -11,6 +11,7 @@ PyKEEN
tutorial/first_steps
tutorial/understanding_evaluation
tutorial/translational_toy_example
tutorial/checkpoints
tutorial/running_hpo
tutorial/running_ablation
tutorial/byod
147 changes: 147 additions & 0 deletions docs/source/tutorial/checkpoints.rst
@@ -0,0 +1,147 @@
Using Checkpoints
=================
Training can take days to weeks in extreme cases when using models with many parameters or big datasets. This exposes
the process to a wide range of possible failures, e.g. session timeouts or server restarts, which would lead to a
complete loss of all progress made so far. To avoid this, the :class:`pykeen.training.TrainingLoop` supports built-in
checkpoints that allow the current training loop state to be saved in a straightforward way and training to be
resumed from a saved checkpoint.

How to do it
------------
To show how checkpoints are used in PyKEEN, let's look at a simple example of how a model is set up.
For possible errors and safety fallbacks, please also have a look at :ref:`word_of_caution`.

[Review thread]
Member: wouldn't these tutorials be more useful for users if they started by being centered on the pipeline and then
at the end gave some insight into the underlying implementation?

Member Author: The reason it is kept right now is due to the fact that the checkpoint functionality is only a true
training loop functionality, because even though the pipeline supports using training loop checkpoints, it is not a
true pipeline checkpoint.

Member: That's true, but I don't think the beginning of a tutorial section of the documentation benefits from being
pedagogical. The technical part can be in the reference, or at the end of the tutorial to help users who understand
how to use the simple parts and want to understand how it works. I think one place this worked really well was the
First Steps tutorial, which ended with the Beyond the Pipeline section.
Could you elaborate on the difference you mean between a training loop checkpoint vs a pipeline checkpoint?

.. code-block:: python

    from pykeen.datasets import Nations
    from pykeen.models import TransE
    from pykeen.training import SLCWATrainingLoop
    from pykeen.triples import TriplesFactory
    from torch.optim import Adam

    triples_factory = Nations().training
    model = TransE(
        triples_factory=triples_factory,
        random_seed=123,
    )

    optimizer = Adam(params=model.get_grad_params())
    training_loop = SLCWATrainingLoop(model=model, optimizer=optimizer)

At this point we have a model, dataset, and optimizer all set up in a training loop and are ready to train the model
with the ``training_loop``'s method :func:`pykeen.training.TrainingLoop.train`. To enable checkpoints, all you have to
do is set the argument ``checkpoint_name`` to the file name you would like the checkpoint to have.
Optionally, you can choose where the checkpoints are saved by setting the ``checkpoint_directory`` argument to a
string or a :class:`pathlib.Path` object containing your desired root path. If you don't set the
``checkpoint_directory`` argument, your checkpoints will be saved below the ``PYKEEN_HOME`` directory defined in
:mod:`pykeen.constants`, i.e. in a subdirectory of your home directory, e.g. ``~/.pykeen/checkpoints``.
Furthermore, you can set the checkpoint frequency, i.e. how often checkpoints should be saved, given in minutes, by
setting the argument ``checkpoint_frequency`` to an integer. The default frequency is 30 minutes, and setting it to
``0`` will cause the training loop to save a checkpoint after each epoch.

Here is an example:

.. code-block:: python

    losses = training_loop.train(
        num_epochs=1000,
        checkpoint_name='my_checkpoint.pt',
        checkpoint_frequency=5,
    )

With this code we have started the training loop with the KGEM defined above. The training loop will save a checkpoint
to the file ``my_checkpoint.pt`` in the ``~/.pykeen/checkpoints/`` directory, since we haven't set the argument
``checkpoint_directory``.
A checkpoint is written once at least 5 minutes have passed since the start of the training loop or since the last
checkpoint was saved *and* the current epoch finishes, i.e. if one epoch takes 10 minutes, a checkpoint will be saved
every 10 minutes.
In addition, checkpoints are always saved when the early stopper stops the training loop or when the last epoch has
finished.
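
For instance, keeping everything else the same, you can save a checkpoint after every single epoch by setting the
frequency to zero, as described above:

.. code-block:: python

    losses = training_loop.train(
        num_epochs=1000,
        checkpoint_name='my_checkpoint.pt',
        checkpoint_frequency=0,  # save a checkpoint after each epoch
    )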

Let's assume you were foresighted enough to save checkpoints and your training loop crashed after 200 epochs.
Now you would like to resume from the last checkpoint. All you have to do is rerun the **exact same code** as above,
and PyKEEN will seamlessly pick up from the saved checkpoint. Since PyKEEN stores all random states as well as the
states of the model, optimizer, and early stopper, the results will be exactly the same as if the training loop had
run uninterrupted. Of course, PyKEEN will also continue saving new checkpoints when resuming from a previous
checkpoint.
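
Concretely, resuming after such a crash is just the identical call from before; nothing needs to change:

.. code-block:: python

    # Re-running the exact same call picks up from the last saved checkpoint (around epoch 200 in this example)
    losses = training_loop.train(
        num_epochs=1000,
        checkpoint_name='my_checkpoint.pt',
        checkpoint_frequency=5,
    )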

On top of resuming interrupted training loops, you can also resume training loops that finished successfully.
For example, say the above training loop finished successfully after 1000 epochs, but you would like to continue
training the same model from that state up to 2000 epochs. All you have to do is change the argument ``num_epochs``
in the above code to:

.. code-block:: python

    losses = training_loop.train(
        num_epochs=2000,
        checkpoint_name='my_checkpoint.pt',
        checkpoint_frequency=5,
    )

and now the training loop will resume from the state at 1000 epochs and continue training until epoch 2000.

Another nice feature is that the checkpoint functionality integrates with the pipeline. This means you can simply
define a pipeline like this:

.. code-block:: python

    from pykeen.pipeline import pipeline

    pipeline_result = pipeline(
        dataset='Nations',
        model='TransE',
        optimizer='Adam',
        training_kwargs=dict(num_epochs=1000, checkpoint_name='my_checkpoint.pt', checkpoint_frequency=5),
    )

Again, assuming that this pipeline crashes after, e.g., 200 epochs, you can simply execute **the same code**, and the
pipeline will load the last state from the checkpoint file and continue training as if nothing had happened.
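
If you prefer to keep checkpoints outside of ``~/.pykeen/checkpoints``, the ``checkpoint_directory`` argument described
above can be passed through ``training_kwargs`` as well. A minimal sketch (the directory name is just a placeholder):

.. code-block:: python

    from pykeen.pipeline import pipeline

    pipeline_result = pipeline(
        dataset='Nations',
        model='TransE',
        optimizer='Adam',
        training_kwargs=dict(
            num_epochs=1000,
            checkpoint_name='my_checkpoint.pt',
            checkpoint_directory='my_checkpoint_dir',  # placeholder; any string or pathlib.Path works
            checkpoint_frequency=5,
        ),
    )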

.. todo:: Tutorial on recovery from hpo_pipeline.
[Review thread]
Member: Maybe for a later PR?

Member Author (@lvermue, Dec 2, 2020): Yes. Basically the hpo_pipeline supports saving checkpoints through the
pipeline, but that is specific to the training loop itself. Supporting to resume a cancelled hpo_pipeline would be an
entirely different story.

Member: I wonder if this .. todo directive breaks the build

Checkpoints on Failure
----------------------
In cases where you would only like to save a checkpoint when the training loop fails, you can use the argument
``checkpoint_on_failure=True``, like:

.. code-block:: python

    losses = training_loop.train(
        num_epochs=2000,
        checkpoint_on_failure=True,
    )

This option differs from ordinary checkpoints, since ordinary checkpoints are only saved after a successful epoch.
When a checkpoint is saved because the training loop failed, there is no guarantee that all random states can be
recovered correctly, which might cause problems with regard to the reproducibility of that specific training loop.
Therefore, these checkpoints are saved under a distinct name, ``PyKEEN_just_saved_my_day_{datetime}.pt``, in the given
``checkpoint_directory``, even when you also opted to use ordinary checkpoints as described above, e.g. with this code:

.. code-block:: python

    losses = training_loop.train(
        num_epochs=2000,
        checkpoint_name='my_checkpoint.pt',
        checkpoint_frequency=5,
        checkpoint_on_failure=True,
    )

Note: Use this argument with caution, since every failed training loop will create a distinct checkpoint file.

.. _word_of_caution:

Word of Caution and Possible Errors
-----------------------------------
When using checkpoints and trying out several configurations, which in turn result in multiple different checkpoints,
there is an inherent risk of overwriting checkpoints. This would naturally happen when you change the configuration of
the KGEM but don't change the ``checkpoint_name`` argument.
To prevent this from happening, PyKEEN compares a hash of the checkpoint's configuration to a hash of the current
configuration at hand. When these don't match, PyKEEN won't accept the checkpoint and will raise an error.

In case you want to overwrite the previous checkpoint file with a new configuration, you have to delete it explicitly;
a short sketch follows after the list below. The reason for this behavior is three-fold:

1. This allows a very easy and user-friendly way of resuming an interrupted training loop by simply re-running
   the exact same code.
2. By requiring the checkpoint files to be named explicitly, the user stays in control of the file names and can more
   easily keep an overview.
3. Creating a new checkpoint file for each run would lead most users to inadvertently spam their file systems with
   unused checkpoints that can easily add up to hundreds of GB when running many experiments.
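
As a minimal sketch of the explicit deletion mentioned above, assuming the default checkpoint directory and the
checkpoint name used throughout this tutorial:

.. code-block:: python

    import pathlib

    # Remove the stale checkpoint so that a run with a new configuration can reuse the name.
    checkpoint_path = pathlib.Path.home() / '.pykeen' / 'checkpoints' / 'my_checkpoint.pt'
    if checkpoint_path.is_file():
        checkpoint_path.unlink()
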
6 changes: 6 additions & 0 deletions src/pykeen/constants.py
@@ -3,9 +3,15 @@
"""Constants for PyKEEN."""

import os
import pathlib

__all__ = [
    'PYKEEN_HOME',
    'PYKEEN_DEFAULT_CHECKPOINT_DIR',
]

PYKEEN_HOME = os.environ.get('PYKEEN_HOME') or os.path.join(os.path.expanduser('~'), '.pykeen')
PYKEEN_DEFAULT_CHECKPOINT = "PyKEEN_just_saved_my_day.pt"
[Review thread]
Member: perfect name

PYKEEN_DEFAULT_CHECKPOINT_DIR = pathlib.Path(PYKEEN_HOME).joinpath("checkpoints")
PYKEEN_DEFAULT_CHECKPOINT_DIR.mkdir(exist_ok=True, parents=True)
2 changes: 2 additions & 0 deletions src/pykeen/models/base.py
@@ -260,8 +260,10 @@ def __init__(
        # Random seeds have to set before the embeddings are initialized
        if random_seed is None:
            logger.warning('No random seed is specified. This may lead to non-reproducible results.')
            self._random_seed = None
        elif random_seed is not NoRandomSeedNecessary:
            set_random_seed(random_seed)
            self._random_seed = random_seed

        # Loss
        if loss is None:
28 changes: 24 additions & 4 deletions src/pykeen/pipeline.py
@@ -168,6 +168,7 @@
import json
import logging
import os
import pathlib
import time
from dataclasses import dataclass, field
from typing import Any, Collection, Dict, Iterable, List, Mapping, Optional, Set, Type, Union
@@ -176,6 +177,7 @@
import torch
from torch.optim.optimizer import Optimizer

from .constants import PYKEEN_DEFAULT_CHECKPOINT_DIR
from .datasets import get_dataset
from .datasets.base import DataSet
from .evaluation import Evaluator, MetricResults, get_evaluator_cls
@@ -824,7 +826,28 @@ def pipeline( # noqa: C901
    :param use_testing_data:
        If true, use the testing triples. Otherwise, use the validation triples. Defaults to true - use testing triples.
    """
    if random_seed is None:
    if training_kwargs is None:
        training_kwargs = {}

    # To allow resuming training from a checkpoint when using a pipeline, the pipeline needs to obtain the
    # used random_seed to ensure reproducible results
    checkpoint_name = training_kwargs.get('checkpoint_name')
    if checkpoint_name is not None:
        checkpoint_directory = pathlib.Path(training_kwargs.get('checkpoint_directory', PYKEEN_DEFAULT_CHECKPOINT_DIR))
        checkpoint_directory.mkdir(parents=True, exist_ok=True)
        checkpoint_path = checkpoint_directory / checkpoint_name
        if checkpoint_path.is_file():
            checkpoint_dict = torch.load(checkpoint_path)
            random_seed = checkpoint_dict['random_seed']
            logger.info('loaded random seed %s from checkpoint.', random_seed)
            # We have to set clear optimizer to False since training should be continued
            clear_optimizer = False
        else:
            logger.info(f"=> no training loop checkpoint file found at '{checkpoint_path}'. Creating a new file.")
            if random_seed is None:
                random_seed = random_non_negative_int()
                logger.warning(f'No random seed is specified. Setting to {random_seed}.')
    elif random_seed is None:
        random_seed = random_non_negative_int()
        logger.warning(f'No random seed is specified. Setting to {random_seed}.')
    set_random_seed(random_seed)
@@ -947,9 +970,6 @@ def pipeline( # noqa: C901
    if evaluation_kwargs is None:
        evaluation_kwargs = {}

    if training_kwargs is None:
        training_kwargs = {}

    # Stopping
    if 'stopper' in training_kwargs and stopper is not None:
        raise ValueError('Specified stopper in training_kwargs and as stopper')
23 changes: 23 additions & 0 deletions src/pykeen/stoppers/early_stopping.py
@@ -191,3 +191,26 @@ def get_summary_dict(self) -> Mapping[str, Any]:
            best_epoch=self.best_epoch,
            best_metric=self.best_metric,
        )

    def _write_from_summary_dict(
        self,
        frequency: int,
        patience: int,
        relative_delta: float,
        metric: str,
        larger_is_better: bool,
        results: List[float],
        stopped: bool,
        best_epoch: int,
        best_metric: float,
    ) -> None:
        """Write attributes to stopper from a summary dict."""
        self.frequency = frequency
        self.patience = patience
        self.relative_delta = relative_delta
        self.metric = metric
        self.larger_is_better = larger_is_better
        self.results = results
        self.stopped = stopped
        self.best_epoch = best_epoch
        self.best_metric = best_metric
34 changes: 34 additions & 0 deletions src/pykeen/stoppers/stopper.py
@@ -2,13 +2,20 @@

"""Basic stoppers."""

import logging
import pathlib
from abc import ABC, abstractmethod
from typing import Any, Mapping, Union

import torch

__all__ = [
    'Stopper',
    'NopStopper',
]

logger = logging.getLogger(__name__)


class Stopper(ABC):
"""A harness for stopping training."""
@@ -25,6 +32,29 @@ def should_stop(self, epoch: int) -> bool:
        """Validate on validation set and check for termination condition."""
        raise NotImplementedError

    @abstractmethod
    def get_summary_dict(self) -> Mapping[str, Any]:
        """Get a summary dict."""
        raise NotImplementedError

    def _write_from_summary_dict(self, **kwargs):
        pass

    @staticmethod
    def load_summary_dict_from_training_loop_checkpoint(path: Union[str, pathlib.Path]) -> Mapping[str, Any]:
        """Load the summary dict from a training loop checkpoint.

        :param path:
            Path of the file where to store the state in.

        :return:
            The summary dict of the stopper at the time of saving the checkpoint.
        """
        logger.info(f"=> loading stopper summary dict from training loop checkpoint in '{path}'")
        checkpoint = torch.load(path)
        logger.info(f"=> loaded stopper summary dictionary from checkpoint in '{path}'")
        return checkpoint['stopper_dict']


class NopStopper(Stopper):
"""A stopper that does nothing."""
@@ -36,3 +66,7 @@ def should_evaluate(self, epoch: int) -> bool:
    def should_stop(self, epoch: int) -> bool:
        """Return false; should never stop."""
        return False

    def get_summary_dict(self) -> Mapping[str, Any]:
        """Return empty mapping, doesn't have any attributes."""
        return dict()
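
For orientation, the two stopper additions above could be combined roughly as follows to restore an early stopper from
a training loop checkpoint. This is an illustrative sketch, not code from this pull request: the ``stopper`` and
``checkpoint_path`` objects are assumed to exist already, and the summary dict's keys are assumed to match the early
stopper's attributes.

.. code-block:: python

    # Read the stopper state that the training loop stored in its checkpoint under 'stopper_dict' ...
    summary = Stopper.load_summary_dict_from_training_loop_checkpoint(path=checkpoint_path)

    # ... and write those attributes back onto an already-constructed early stopper.
    stopper._write_from_summary_dict(**summary)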