Merge pull request #32 from rodrigo-arenas/0.6.Xdev

[PR] Code comments and BaseCallback
rodrigo-arenas · Jun 23, 2021 · 36953e8 · 36953e8
2 parents 2cccdc8 + 08596fe
commit 36953e8
Show file tree

Hide file tree

Showing 8 changed files with 93 additions and 38 deletions.
diff --git a/docs/tutorials/custom_callback.rst b/docs/tutorials/custom_callback.rst
@@ -76,6 +76,11 @@ that will have all this parameters, so we can rewrite it like this:
            return self.on_step(record, logbook, estimator)
 
 
+**Note:** Implementing the ``__call__`` method is optional; by default,
+its behavior is inherited from :class:`~sklearn_genetic.callbacks.base.BaseCallback`.
+It is included in this example to give a deeper understanding of how callbacks are coded
+and to avoid unexpected overwrites.
+
 So that is it, now you can initialize the DummyThreshold
 and pass it to the ``fit`` method of a
 :class:`~sklearn_genetic.GASearchCV` instance:

diff --git a/sklearn_genetic/callbacks/base.py b/sklearn_genetic/callbacks/base.py
@@ -26,6 +26,5 @@ def on_step(self, record=None, logbook=None, estimator=None):
 
         pass  # pragma: no cover
 
-    @abstractmethod
     def __call__(self, record=None, logbook=None, estimator=None):
-        pass  # pragma: no cover
+        return self.on_step(record, logbook, estimator)
diff --git a/sklearn_genetic/callbacks/early_stoppers.py b/sklearn_genetic/callbacks/early_stoppers.py
@@ -43,9 +43,6 @@ def on_step(self, record=None, logbook=None, estimator=None):
                 "At least one of record or logbook parameters must be provided"
             )
 
-    def __call__(self, record=None, logbook=None, estimator=None):
-        return self.on_step(record, logbook, estimator)
-
 
 class ConsecutiveStopping(BaseCallback):
     """
@@ -133,9 +130,6 @@ def on_step(self, record=None, logbook=None, estimator=None):
         else:
             raise ValueError("logbook parameter must be provided")
 
-    def __call__(self, record=None, logbook=None, estimator=None):
-        return self.on_step(record, logbook, estimator)
-
 
 class TimerStopping(BaseCallback):
     """
@@ -161,6 +155,3 @@ def on_step(self, record=None, logbook=None, estimator=None):
             print(f"INFO: {self.__class__.__name__} callback met its criteria")
             return True
         return False
-
-    def __call__(self, record=None, logbook=None, estimator=None):
-        return self.on_step(record, logbook, estimator)
diff --git a/sklearn_genetic/callbacks/tests/test_callbacks.py b/sklearn_genetic/callbacks/tests/test_callbacks.py
@@ -60,36 +60,20 @@ def test_check_callback():
 
 
 def test_wrong_base_callback():
-    class MyDummyCallback(BaseCallback):
-        def __init__(self, metric):
-            self.metric = metric
-
-        def validate(self):
-            print(self.metric)
-
-    with pytest.raises(Exception) as excinfo:
-        callback = MyDummyCallback()
-    assert (
-        str(excinfo.value)
-        == "Can't instantiate abstract class MyDummyCallback with abstract methods __call__, on_step"
-    )
-
-
-def test_base_callback_call():
     possible_messages = [
-        "Can't instantiate abstract class MyDummyCallback with abstract methods __call__",
-        "Can't instantiate abstract class MyDummyCallback with abstract method __call__",
+        "Can't instantiate abstract class MyDummyCallback with abstract methods on_step",
+        "Can't instantiate abstract class MyDummyCallback with abstract method on_step",
     ]
 
     class MyDummyCallback(BaseCallback):
         def __init__(self, metric):
             self.metric = metric
 
-        def on_step(self, record=None, logbook=None, estimator=None):
-            print(record)
+        def validate(self):
+            print(self.metric)
 
     with pytest.raises(Exception) as excinfo:
-        callback = MyDummyCallback(metric="fitness")
+        MyDummyCallback()
 
     assert any([str(excinfo.value) == i for i in possible_messages])
 

diff --git a/sklearn_genetic/genetic_search.py b/sklearn_genetic/genetic_search.py
@@ -215,6 +215,7 @@ def __init__(
         self.log_config = log_config
         self._initial_training_time = None
 
+        # Check that the estimator is compatible with scikit-learn
         if not is_classifier(self.estimator) and not is_regressor(self.estimator):
             raise ValueError(
                 f"{self.estimator} is not a valid Sklearn classifier or regressor"
@@ -224,11 +225,13 @@ def __init__(
             raise ValueError(
                 f"Criteria must be one of {Criteria.list()}, got {criteria} instead"
             )
+        # Minimization is handled as a maximization problem with a change in the score sign
         elif criteria == Criteria.max.value:
             self.criteria_sign = 1
         elif criteria == Criteria.min.value:
             self.criteria_sign = -1
 
+        # Saves the param_grid and computes some extra properties in the same object
         self.space = Space(param_grid)
 
         super(GASearchCV, self).__init__(
@@ -243,12 +246,17 @@ def __init__(
         )
 
     def _register(self):
+        """
+        This function is responsible for registering the methods required by DEAP
+        and for creating the objects that hold the hof, logbook and stats.
+        """
 
         self.creator.create("FitnessMax", base.Fitness, weights=[1.0])
         self.creator.create("Individual", list, fitness=creator.FitnessMax)
 
         attributes = []
-
+        # Assign all the parameters defined in the param_grid
+        # It uses the distribution parameter to set the sampling function
         for parameter, dimension in self.space.param_grid.items():
             self.toolbox.register(f"{parameter}", dimension.sample)
             attributes.append(getattr(self.toolbox, parameter))
@@ -290,22 +298,52 @@ def _register(self):
         self.logbook = tools.Logbook()
 
     def mutate(self, individual):
+        """
+        This function is responsible for changing a randomly selected parameter of an individual
+        Parameters
+        ----------
+        individual: Individual object
+            The individual (set of hyperparameters) that is being mutated
 
+        Returns
+        -------
+            Mutated individual
+        """
+
+        # Randomly select one of the hyperparameters
         gen = random.randrange(0, len(self.space))
         parameter_idx = self.space.parameters[gen]
         parameter = self.space[parameter_idx]
 
+        # Using the distribution defined in the param_grid value,
+        # draw a random sample of the parameter
         individual[gen] = parameter.sample()
 
         return [individual]
 
     def evaluate(self, individual):
+        """
+        Compute the cross-validation scores and record the logbook and mlflow (if specified)
+        Parameters
+        ----------
+        individual: Individual object
+            The individual (set of hyperparameters) that is being evaluated
+
+        Returns
+        -------
+            The fitness value of the estimator candidate, corresponding to the cv-score with the criteria sign
+
+        """
+
+        # Dictionary representation of the individual with key-> hyperparameter name, value -> value
         current_generation_params = {
             key: individual[n] for n, key in enumerate(self.space.parameters)
         }
 
         local_estimator = clone(self.estimator)
         local_estimator.set_params(**current_generation_params)
+
+        # Compute the cv-score
         cv_scores = cross_val_score(
             local_estimator,
             self.X_,
@@ -319,6 +357,7 @@ def evaluate(self, individual):
 
         score = np.mean(cv_scores)
 
+        # Uses the log config to save in remote log server (e.g MLflow)
         if self.log_config is not None:
             self.log_config.create_run(
                 parameters=current_generation_params,
@@ -328,6 +367,7 @@ def evaluate(self, individual):
 
         current_generation_params["score"] = score
 
+        # Log the hyperparameters and the cv-score
         self.logbook.record(parameters=current_generation_params)
 
         return [self.criteria_sign * score]
@@ -353,24 +393,29 @@ def fit(self, X, y, callbacks=None):
             The callback is evaluated after fitting the estimators from the generation 1.
 
         """
-        scorer = check_scoring(self.estimator, scoring=self.scoring)
 
         self.X_ = X
         self.y_ = y
+
+        # Make sure the callbacks are valid
         self.callbacks = check_callback(callbacks)
-        self.scorer_ = check_scoring(
-            self.estimator, scoring=self.scoring)
+        self.scorer_ = check_scoring(self.estimator, scoring=self.scoring)
 
+        # Register the methods required by DEAP
         self._register()
 
         self._initial_training_time = datetime.utcnow()
 
+        # Optimization routine from the selected evolutionary algorithm
         pop, log, n_gen = self._select_algorithm(
             pop=self._pop, stats=self._stats, hof=self._hof
         )
 
+        # Update the _n_iterations value as the algorithm could stop earlier due to a callback
         self._n_iterations = n_gen
 
+        # hof keeps the best params according to the fitness value
+        # The best one is in the position 0
         self.best_params_ = {
             key: self._hof[0][n] for n, key in enumerate(self.space.parameters)
         }
@@ -388,6 +433,7 @@ def fit(self, X, y, callbacks=None):
             "fitness_min": log.select("fitness_min"),
         }
 
+        # Imitate the logic of scikit-learn refit parameter
         if self.refit:
             self.estimator.set_params(**self.best_params_)
             self.estimator.fit(self.X_, self.y_)
@@ -399,6 +445,26 @@ def fit(self, X, y, callbacks=None):
         return self
 
     def _select_algorithm(self, pop, stats, hof):
+        """
+        It selects the algorithm to run from the sklearn_genetic.algorithms module
+        based on the parameter self.algorithm.
+
+        Parameters
+        ----------
+        pop: pop object from DEAP
+        stats: stats object from DEAP
+        hof: hof object from DEAP
+
+        Returns
+        -------
+            pop: pop object
+                The last evaluated population
+            log: Logbook object
+                It contains the calculated metrics {'fitness', 'fitness_std', 'fitness_max', 'fitness_min'}
+                the number of generations and the number of evaluated individuals per generation
+            n_gen: int
+                The number of generations that the evolutionary algorithm ran
+        """
 
         if self.algorithm == Algorithms.eaSimple.value:
 

diff --git a/sklearn_genetic/space/space.py b/sklearn_genetic/space/space.py
@@ -171,6 +171,7 @@ def check_space(param_grid: dict = None):
     if not param_grid:
         raise ValueError(f"param_grid can not be empty")
 
+    # Make sure that each of the param_grid values is defined using one of the available Space objects
     for key, value in param_grid.items():
         if not isinstance(value, BaseDimension):
             raise ValueError(

diff --git a/sklearn_genetic/space/space_parameters.py b/sklearn_genetic/space/space_parameters.py
@@ -1,5 +1,10 @@
 import enum
 
+"""
+This module contains all the possible random distribution names
+that can be set in each of the Space variables
+"""
+
 
 class ExtendedEnum(enum.Enum):
     @classmethod

diff --git a/sklearn_genetic/tests/test_genetic_search.py b/sklearn_genetic/tests/test_genetic_search.py
@@ -62,7 +62,9 @@ def test_expected_ga_results():
     assert len(evolved_estimator.decision_function(X_test)) == len(X_test)
     assert len(evolved_estimator.predict_proba(X_test)) == len(X_test)
     assert len(evolved_estimator.predict_log_proba(X_test)) == len(X_test)
-    assert evolved_estimator.score(X_test, y_test) == accuracy_score(y_test, evolved_estimator.predict(X_test))
+    assert evolved_estimator.score(X_test, y_test) == accuracy_score(
+        y_test, evolved_estimator.predict(X_test)
+    )
     assert bool(evolved_estimator.get_params())
     assert len(evolved_estimator.hof) == evolved_estimator.keep_top_k
     assert "gen" in evolved_estimator[0]
@@ -148,7 +150,9 @@ def test_expected_algorithms_callbacks(algorithm, callback):
     assert len(evolved_estimator.decision_function(X_test)) == len(X_test)
     assert len(evolved_estimator.predict_proba(X_test)) == len(X_test)
     assert len(evolved_estimator.predict_log_proba(X_test)) == len(X_test)
-    assert evolved_estimator.score(X_test, y_test) == accuracy_score(y_test, evolved_estimator.predict(X_test))
+    assert evolved_estimator.score(X_test, y_test) == accuracy_score(
+        y_test, evolved_estimator.predict(X_test)
+    )
     assert bool(evolved_estimator.get_params())
     assert len(evolved_estimator.hof) <= evolved_estimator.keep_top_k
     assert "gen" in evolved_estimator[0]