Merge pull request #98 from rodrigo-arenas/0.9.0dev

Remove plot_parallel_coordinates
rodrigo-arenas · Jun 6, 2022 · f409aae · f409aae
2 parents 89a09ab + cf1c9a8
commit f409aae
Show file tree

Hide file tree

Showing 9 changed files with 111 additions and 107 deletions.
diff --git a/dev-requirements.txt b/dev-requirements.txt
@@ -1,11 +1,11 @@
 scikit-learn>=0.21.3
+deap>=1.3.1
+numpy>=1.14.5
 pytest==6.2.2
 codecov==2.1.11
 pytest-cov==2.11.1
 twine==3.3.0
-numpy>=1.13.3
 seaborn>=0.11.1
-deap>=1.3.1
 mlflow>=1.17.0
 black==21.5b2
 sphinx

diff --git a/docs/api/plots.rst b/docs/api/plots.rst
@@ -7,7 +7,6 @@ Plots
 .. autosummary::
    plot_fitness_evolution
    plot_search_space
-   plot_parallel_coordinates
 
 .. automodule:: sklearn_genetic.plots
    :members:
diff --git a/docs/index.rst b/docs/index.rst
@@ -69,6 +69,7 @@ as it is usually advised to look further which distribution works better for you
    tutorials/adapters
    tutorials/understand_cv
    tutorials/mlflow
+   tutorials/reproducibility
 
 .. toctree::
    :maxdepth: 2

diff --git a/docs/release_notes.rst b/docs/release_notes.rst
@@ -18,18 +18,32 @@ Features:
   - :class:`~sklearn_genetic.schedules.InverseAdapter`
   - :class:`~sklearn_genetic.schedules.PotentialAdapter`
 
-* Changed the default values of `mutation_probability` and `crossover_probability`
-  to 0.8 and 0.2, respectively.
-
-* The `weighted_choice` function used in :class:`~sklearn_genetic.GAFeatureSelectionCV` was
-  re-written to give more probability to a number of features closer to the `max_features` parameter
 
 * Add `random_state` parameter (default= ``None``) in :class:`~sklearn_genetic.space.Continuous`,
   :class:`~sklearn_genetic.space.Categorical` and :class:`~sklearn_genetic.space.Integer` classes
   to leave fixed the random seed during hyperparameters sampling.
   Take into account that this only ensures that the space components are reproducible, not all the package.
   This is due to the DEAP dependency, which doesn't seem to have a native way to set the random seed.
 
+^^^^^^^^^^^^
+API Changes:
+^^^^^^^^^^^^
+
+* Changed the default values of `mutation_probability` and `crossover_probability`
+  to 0.8 and 0.2, respectively.
+
+* The `weighted_choice` function used in :class:`~sklearn_genetic.GAFeatureSelectionCV` was
+  re-written to give more probability to a number of features closer to the `max_features` parameter
+
+* Removed the unused and incorrect function :func:`~sklearn_genetic.plots.plot_parallel_coordinates`
+
+^^^^^^^^^^
+Bug Fixes:
+^^^^^^^^^^
+
+* Now when using the :func:`~sklearn_genetic.plots.plot_search_space` function, all the parameters are cast
+  to np.float64 to avoid errors in the seaborn package when plotting bool values.
+
 What's new in 0.8.1
 -------------------
 

diff --git a/docs/tutorials/adapters.rst b/docs/tutorials/adapters.rst
@@ -34,7 +34,7 @@ value at generation t :math:`p(t; \alpha)`
 
 Note that :math:`p_0` doesn't need to be greater than :math:`p_f`.
 
-If :math:`p_0 > p_f`, you are performing a decay towards :math:`p_0`.
+If :math:`p_0 > p_f`, you are performing a decay towards :math:`p_f`.
 
 If :math:`p_0 < p_f`, you are performing an ascend towards :math:`p_f`.
 

diff --git a/docs/tutorials/reproducibility.rst b/docs/tutorials/reproducibility.rst
@@ -0,0 +1,81 @@
+Reproducibility
+===============
+
+
+One of the desirable capabilities of a package that makes several "random" choices is to be able to reproduce the results.
+
+The usual strategy is to fix the random seed that starts generating the pseudo-random numbers.
+Unfortunately, the DEAP package, which is the main dependency for all the evolutionary algorithms,
+doesn't have an explicit parameter to fix this seed.
+
+However, there is a workaround that seems to make the results reproducible:
+
+* Set the random seed of the `numpy` and `random` packages, which are the underlying random number generators
+* Use the random_state parameter in each of the scikit-learn and sklearn-genetic-opt objects that support it
+
+In the following example, the random_state is set for the `train_test_split`, `cross-validation` generator,
+each of the hyperparameters in the `param_grid`, the `RandomForestClassifier`, and at the file level.
+
+Example:
+--------
+.. code:: python3
+
+   import numpy as np
+   import random
+   from sklearn_genetic import GASearchCV
+   from sklearn_genetic.space import Continuous, Categorical, Integer
+   from sklearn.ensemble import RandomForestClassifier
+   from sklearn.model_selection import train_test_split, StratifiedKFold
+   from sklearn.datasets import load_digits
+   from sklearn.metrics import accuracy_score
+
+
+   # Random Seed at file level
+   random_seed = 54
+
+   np.random.seed(random_seed)
+   random.seed(random_seed)
+
+
+   data = load_digits()
+   n_samples = len(data.images)
+   X = data.images.reshape((n_samples, -1))
+   y = data['target']
+   X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=random_seed)
+
+   clf = RandomForestClassifier(random_state=random_seed)
+
+   param_grid = {'min_weight_fraction_leaf': Continuous(0.01, 0.5, distribution='log-uniform',
+                                                        random_state=random_seed),
+                 'bootstrap': Categorical([True, False], random_state=random_seed),
+                 'max_depth': Integer(2, 30, random_state=random_seed),
+                 'max_leaf_nodes': Integer(2, 35, random_state=random_seed),
+                 'n_estimators': Integer(100, 300, random_state=random_seed)}
+
+   cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=random_seed)
+
+   evolved_estimator = GASearchCV(estimator=clf,
+                                  cv=cv,
+                                  scoring='accuracy',
+                                  population_size=8,
+                                  generations=5,
+                                  param_grid=param_grid,
+                                  n_jobs=-1,
+                                  verbose=True,
+                                  keep_top_k=4)
+
+   # Train and optimize the estimator
+   evolved_estimator.fit(X_train, y_train)
+   # Best parameters found
+   print(evolved_estimator.best_params_)
+   # Use the model fitted with the best parameters
+   y_predict_ga = evolved_estimator.predict(X_test)
+   print(accuracy_score(y_test, y_predict_ga))
+
+   # Saved metadata for further analysis
+   print("Stats achieved in each generation: ", evolved_estimator.history)
+   print("Best k solutions: ", evolved_estimator.hof)
+
+
+
+
diff --git a/sklearn_genetic/plots.py b/sklearn_genetic/plots.py
@@ -9,12 +9,12 @@
     logger.error(
         "seaborn not found, pip install seaborn to use plots functions"
     )  # noqa
-import pandas as pd
+
 import numpy as np
 
 from .utils import logbook_to_pandas
 from .parameters import Metrics
-from .space import Categorical
+
 from .genetic_search import GAFeatureSelectionCV
 
 """
@@ -86,88 +86,17 @@ def plot_search_space(estimator, height=2, s=25, features: list = None):
 
     df = logbook_to_pandas(estimator.logbook)
     if features:
-        stats = df[features]
+        stats = df[features].astype(np.float64)
     else:
         variables = [*estimator.space.parameters, estimator.refit_metric]
-        stats = df[variables]
+        stats = df[variables].astype(np.float64)
 
     g = sns.PairGrid(stats, diag_sharey=False, height=height)
     g = g.map_upper(sns.scatterplot, s=s, color="r", alpha=0.2)
     g = g.map_lower(
         sns.kdeplot,
         shade=True,
-        cmap=sns.color_palette("ch:s=.25,rot=-.25", as_cmap=True),
+        cmap=sns.color_palette("ch:s=.25,rot=-.25", as_cmap=True)
     )
     g = g.map_diag(sns.kdeplot, shade=True, palette="crest", alpha=0.2, color="red")
     return g
-
-
-def noise(score):
-    """
-    Parameters
-    ----------
-    score: Series
-        The `score` column from the logbook data of :class:`~sklearn_genetic.GASearchCV`
-
-    Returns
-    -------
-    Noise to be added to each element of the score to avoid non-unique bin edges
-
-    """
-    score_len = len(score)
-    score_std = score.std()
-    noise_ratio = 1e7
-    noise = (np.random.random(score_len) * score_std / noise_ratio) - (
-        score_std / 2 * noise_ratio
-    )
-    return noise
-
-
-def plot_parallel_coordinates(estimator, features: list = None):
-    """
-    Parameters
-    ----------
-    estimator: estimator object
-        A fitted estimator from :class:`~sklearn_genetic.GASearchCV`
-    features: list, default=None
-        Subset of features to plot, if ``None`` it plots all the features by default
-
-    Returns
-    -------
-    Parallel Coordinates plot of the non-categorical values
-
-    """
-
-    if isinstance(estimator, GAFeatureSelectionCV):
-        raise TypeError(
-            "Estimator must be a GASearchCV instance, not a GAFeatureSelectionCV instance"
-        )
-
-    df = logbook_to_pandas(estimator.logbook)
-    param_grid = estimator.space.param_grid
-    score = df[estimator.refit_metric]
-    if features:
-        non_categorical_features = []
-        for feature in features:
-            if not isinstance(param_grid[feature], Categorical):
-                non_categorical_features.append(feature)
-            else:
-                logger.warning(
-                    "`%s` is Categorical variable! It was dropped from the plot feature list",
-                    feature,
-                )
-        stats = df[non_categorical_features]
-    else:
-        non_categorical_variables = []
-        for variable, var_type in param_grid.items():
-            if not isinstance(var_type, Categorical):
-                non_categorical_variables.append(variable)
-        non_categorical_variables.append("score")
-        stats = df[non_categorical_variables]
-
-    stats["score_quartile"] = pd.qcut(score + noise(score), 4, labels=[1, 2, 3, 4])
-    g = pd.plotting.parallel_coordinates(
-        stats, "score_quartile", color=("#8E8E8D", "#4ECDC4", "#C7F464", "#FF0000")
-    )
-
-    return g
diff --git a/sklearn_genetic/schedules/schedulers.py b/sklearn_genetic/schedules/schedulers.py
@@ -71,7 +71,7 @@ def step(self):
 
 class InverseAdapter(BaseAdapter):
     """
-    Adapts the initial value towards the end value using a "decay" function of the for 1/x
+    Adapts the initial value towards the end value using a "decay" function of the form 1/x
 
     Parameters
     ----------

diff --git a/sklearn_genetic/tests/test_plots.py b/sklearn_genetic/tests/test_plots.py
@@ -4,7 +4,7 @@
 from sklearn.tree import DecisionTreeRegressor
 
 from .. import GASearchCV, GAFeatureSelectionCV
-from ..plots import plot_fitness_evolution, plot_search_space, plot_parallel_coordinates
+from ..plots import plot_fitness_evolution, plot_search_space
 from ..space import Integer, Categorical, Continuous
 
 data = load_boston()
@@ -55,20 +55,11 @@ def test_plot_evolution():
 
 
 def test_plot_space():
-    plot = plot_search_space(evolved_estimator)
-    plot = plot_search_space(evolved_estimator)
     plot = plot_search_space(
         evolved_estimator, features=["ccp_alpha", "max_depth", "min_samples_split"]
     )
 
 
-def test_plot_parallel():
-    plot = plot_parallel_coordinates(evolved_estimator)
-    plot = plot_parallel_coordinates(
-        evolved_estimator, features=["ccp_alpha", "criterion"]
-    )
-
-
 def test_wrong_estimator_space():
     estimator = GAFeatureSelectionCV(clf, cv=3, scoring="accuracy", population_size=6)
     with pytest.raises(Exception) as excinfo:
@@ -78,14 +69,3 @@ def test_wrong_estimator_space():
         str(excinfo.value)
         == "Estimator must be a GASearchCV instance, not a GAFeatureSelectionCV instance"
     )
-
-
-def test_wrong_estimator_parallel():
-    estimator = GAFeatureSelectionCV(clf, cv=3, scoring="accuracy", population_size=6)
-    with pytest.raises(Exception) as excinfo:
-        plot = plot_parallel_coordinates(estimator)
-
-    assert (
-        str(excinfo.value)
-        == "Estimator must be a GASearchCV instance, not a GAFeatureSelectionCV instance"
-    )