Merge pull request #19 from rodrigo-arenas/0.5.xdev

Callbacks estimator and LogbookSaver
rodrigo-arenas · Jun 21, 2021 · 4e787d5 · 4e787d5
2 parents 8d643ec + d97ad6a
commit 4e787d5
Show file tree

Hide file tree

Showing 11 changed files with 191 additions and 32 deletions.
diff --git a/demo/Boson_Houses_decision_tree.py b/demo/Boson_Houses_decision_tree.py
@@ -2,6 +2,7 @@
 from sklearn_genetic import GASearchCV
 from sklearn_genetic.space import Integer, Categorical, Continuous
 from sklearn_genetic.plots import plot_fitness_evolution, plot_search_space
+from sklearn_genetic.callbacks import LogbookSaver
 from sklearn.datasets import load_boston
 from sklearn.model_selection import train_test_split, KFold
 from sklearn.tree import DecisionTreeRegressor
@@ -23,7 +24,7 @@
 
 clf = DecisionTreeRegressor()
 
-pipe = Pipeline([('scaler', StandardScaler()), ('clf', clf)])
+pipe = Pipeline([("scaler", StandardScaler()), ("clf", clf)])
 
 param_grid = {
     "clf__ccp_alpha": Continuous(0, 1),
@@ -49,15 +50,17 @@
     n_jobs=-1,
 )
 
-evolved_estimator.fit(X_train, y_train)
+callbacks = [LogbookSaver(checkpoint_path="./logbook.pkl")]
+
+evolved_estimator.fit(X_train, y_train, callbacks=callbacks)
 y_predict_ga = evolved_estimator.predict(X_test)
 r_squared = r2_score(y_test, y_predict_ga)
 
 print(evolved_estimator.best_params_)
 print("r-squared: ", "{:.2f}".format(r_squared))
 
 print("Best k solutions: ", evolved_estimator.hof)
-plot = plot_fitness_evolution(evolved_estimator, metric="fitness_sd")
+plot = plot_fitness_evolution(evolved_estimator, metric="fitness_std")
 plt.show()
 
 plot_search_space(evolved_estimator)

diff --git a/docs/api/callbacks.rst b/docs/api/callbacks.rst
@@ -12,3 +12,7 @@ Callbacks
 .. autoclass:: sklearn_genetic.callbacks.ThresholdStopping
    :members:
    :undoc-members: False
+
+.. autoclass:: sklearn_genetic.callbacks.LogbookSaver
+   :members:
+   :undoc-members: False
diff --git a/docs/tutorials/basic_usage.rst b/docs/tutorials/basic_usage.rst
@@ -26,7 +26,8 @@ It'll continue with this process until a number of generations is reached or unt
 Example
 -------
 
-First lets import some dataset and others scikit-learn standard modules, we'll use the `digits dataset <https://scikit-learn.org/stable/modules/generated/sklearn.datasets.load_digits.html>`__.
+First, let's import a dataset and other standard scikit-learn modules; we'll use
+the `digits dataset <https://scikit-learn.org/stable/modules/generated/sklearn.datasets.load_digits.html>`__.
 This is a classification problem, we'll fine-tune a Random Forest Classifier for this task.
 
 .. code:: python3

diff --git a/docs/tutorials/callbacks.rst b/docs/tutorials/callbacks.rst
@@ -6,7 +6,7 @@ Introduction
 
 Callbacks can be defined to take decisions over the optimization
 process while it is still running.
-Common callbacks includes different rules to stop the algorithm.
+Common callbacks include different rules to stop the algorithm or log artifacts.
 
 The callbacks are passed to the ``.fit`` method
 of the :class:`~sklearn_genetic.GASearchCV` class.
@@ -26,6 +26,8 @@ the data set and model used in :ref:`basic-usage`. The available callbacks are:
 
 * ThresholdStopping
 
+* LogbookSaver
+
 ConsecutiveStopping
 -------------------
 
@@ -65,7 +67,7 @@ using the 'fitness_min' value:
     from sklearn_genetic.callbacks import DeltaThreshold
     callback = DeltaThreshold(threshold=0.001, metric='fitness')
 
-    evolved_estimator.fit(X, y, callbacks=ConsecutiveStopping)
+    evolved_estimator.fit(X, y, callbacks=callback)
 
 
 ThresholdStopping
@@ -81,7 +83,29 @@ if the 'fitness_max' is above 0.98
     from sklearn_genetic.callbacks import ThresholdStopping
     callback = ThresholdStopping(threshold=0.98, metric='fitness_max')
 
-    evolved_estimator.fit(X, y, callbacks=ConsecutiveStopping)
+    evolved_estimator.fit(X, y, callbacks=callback)
+
+LogbookSaver
+------------
+It saves at each iteration the Logbook object with all the parameters and
+the cv score achieved by those parameters. It uses joblib.dump to save
+the file.
+
+.. code:: python3
+
+    from sklearn_genetic.callbacks import LogbookSaver
+    callback = LogbookSaver(checkpoint_path="./logbook.pkl")
+
+    evolved_estimator.fit(X, y, callbacks=callback)
+
+Then the object can be restored:
+
+.. code:: python3
+
+    from joblib import load
+
+    logbook = load("./logbook.pkl")
+    print(logbook)
 
 Define Multiple Callbacks
 -------------------------

diff --git a/docs/tutorials/custom_callback.rst b/docs/tutorials/custom_callback.rst
@@ -13,10 +13,12 @@ In this example, we are going to define a dummy callback that
 stops the process if there have been more than `N` fitness values
 below a threshold value.
 
-The callback must have two parameters: `record` and `logbook`.
-Those are a dictionary and a deap's Logbook object respectively,
-with the current iteration metrics and all the past iterations metrics.
-You can choice which to use, but both must be parameters
+The callback must have three parameters: `record`, `logbook` and `estimator`.
+Those are, respectively, a dictionary with the current iteration metrics,
+a deap's Logbook object with all the past iterations metrics, and the
+current :class:`~sklearn_genetic.GASearchCV` with all the properties
+saved in the estimator.
+You can choose which to use, but all of them must be parameters
 on the ``on_step`` and ``__call__`` methods.
 
 So to check inside the logbook, we could define a function like this:
@@ -27,7 +29,7 @@ So to check inside the logbook, we could define a function like this:
     metric='fitness'
     threshold=0.8
 
-    def on_step(record, logbook, threshold):
+    def on_step(record, logbook, threshold, estimator=None):
         # Not enough data points
         if len(logbook) <= N:
             return False
@@ -53,7 +55,7 @@ that will have all this parameters, so we can rewrite it like this:
            self.N = N
            self.metric = metric
 
-       def on_step(self, record, logbook):
+       def on_step(self, record, logbook, estimator=None):
            # Not enough data points
            if len(logbook) <= self.N:
                return False
@@ -67,8 +69,8 @@ that will have all this parameters, so we can rewrite it like this:
 
            return False
 
-       def __call__(self, record, logbook):
-           return self.on_step(record, logbook)
+       def __call__(self, record, logbook, estimator=None):
+           return self.on_step(record, logbook, estimator)
 
 
 So that is it, now you can initialize the DummyThreshold

diff --git a/sklearn_genetic/algorithms.py b/sklearn_genetic/algorithms.py
@@ -17,6 +17,7 @@ def eaSimple(
     halloffame=None,
     callbacks=None,
     verbose=True,
+    estimator=None,
 ):
     """
     The base implementation is directly taken from: https://github.com/DEAP/deap/blob/master/deap/algorithms.py
@@ -51,6 +52,9 @@ def eaSimple(
     verbose: bool, default=True
         Whether or not to log the statistics.
 
+    estimator: :class:`~sklearn_genetic.GASearchCV`, default = None
+        Estimator that is being optimized
+
     Returns
     -------
 
@@ -109,7 +113,7 @@ def eaSimple(
             print(logbook.stream)
 
         # Check if any of the callbacks conditions are True to stop the iteration
-        if eval_callbacks(callbacks, record, logbook):
+        if eval_callbacks(callbacks, record, logbook, estimator):
             print("Process stopped earlier due a callback")
             break
 
@@ -130,6 +134,7 @@ def eaMuPlusLambda(
     halloffame=None,
     callbacks: Union[list, Callable] = None,
     verbose=True,
+    estimator=None,
 ):
     """
     The base implementation is directly taken from: https://github.com/DEAP/deap/blob/master/deap/algorithms.py
@@ -168,6 +173,9 @@ def eaMuPlusLambda(
     verbose: bool, default=True
         Whether or not to log the statistics.
 
+    estimator: :class:`~sklearn_genetic.GASearchCV`, default = None
+        Estimator that is being optimized
+
     Returns
     -------
 
@@ -222,7 +230,7 @@ def eaMuPlusLambda(
         if verbose:
             print(logbook.stream)
 
-        if eval_callbacks(callbacks, record, logbook):
+        if eval_callbacks(callbacks, record, logbook, estimator):
             print("Process stopped earlier due a callback")
             break
 
@@ -242,6 +250,7 @@ def eaMuCommaLambda(
     halloffame=None,
     callbacks: Union[list, Callable] = None,
     verbose=True,
+    estimator=None,
 ):
     """
     The base implementation is directly taken from: https://github.com/DEAP/deap/blob/master/deap/algorithms.py
@@ -281,6 +290,9 @@ def eaMuCommaLambda(
     verbose: bool, default=True
         Whether or not to log the statistics.
 
+    estimator: :class:`~sklearn_genetic.GASearchCV`, default = None
+        Estimator that is being optimized
+
     Returns
     -------
 
@@ -338,7 +350,7 @@ def eaMuCommaLambda(
             print(logbook.stream)
 
         # Check if any of the callbacks conditions are True to stop the iteration
-        if eval_callbacks(callbacks, record, logbook):
+        if eval_callbacks(callbacks, record, logbook, estimator):
             print("Process stopped earlier due a callback")
             break