Skip to content

Commit

Permalink
Merge pull request #22 from rodrigo-arenas/0.5.xdev
Browse files Browse the repository at this point in the history
MLflow integration
  • Loading branch information
rodrigo-arenas committed Jun 21, 2021
2 parents f4f786b + 839334f commit 2c47bee
Show file tree
Hide file tree
Showing 22 changed files with 452 additions and 125 deletions.
1 change: 1 addition & 0 deletions README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@ Main Features:
* **Algorithms**: Set of different evolutionary algorithms to use as optimization procedure
* **Callbacks**: Custom evaluation strategies to generate Early Stopping rules
* **Plots**: Generate pre-defined plots to understand the optimization process
* **MLflow**: Built-in integration with MLflow to log all the hyperparameters and their cv-scores

Usage:
######
Expand Down
14 changes: 8 additions & 6 deletions demo/Demo_Digits_Dataset.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -17,15 +17,15 @@
}
],
"source": [
"from sklearn_genetic_opt import GASearchCV\n",
"from sklearn_genetic import GASearchCV\n",
"from sklearn.linear_model import SGDClassifier\n",
"from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV\n",
"from sklearn_genetic.space import Categorical, Continuous\n",
"import scipy.stats as stats\n",
"from sklearn.utils.fixes import loguniform\n",
"from sklearn.datasets import load_digits\n",
"from sklearn.metrics import accuracy_score\n",
"import numpy as np\n",
"import itertools\n",
"import warnings\n",
"warnings.filterwarnings(\"ignore\")"
]
Expand Down Expand Up @@ -266,16 +266,18 @@
"metadata": {},
"outputs": [],
"source": [
"param_grid = {'l1_ratio': Continuous(0,1),\n",
" 'alpha': Continuous(1e-4,1),\n",
" 'average': Categorical([True, False])}\n",
"\n",
"evolved_estimator = GASearchCV(clf,\n",
" cv=3,\n",
" scoring='accuracy',\n",
" param_grid=param_grid,\n",
" population_size=10,\n",
" generations=8,\n",
" tournament_size=3,\n",
" elitism=True,\n",
" continuous_parameters = {'l1_ratio':(0,1), 'alpha':(1e-4,1)},\n",
" categorical_parameters = {'average': [True, False]},\n",
" integer_parameters = {},\n",
" verbose=True)"
]
},
Expand Down Expand Up @@ -393,4 +395,4 @@
},
"nbformat": 4,
"nbformat_minor": 2
}
}
62 changes: 62 additions & 0 deletions demo/mlflow_logger.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
import warnings
from sklearn_genetic import GASearchCV
from sklearn_genetic.space import Categorical, Integer, Continuous
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.datasets import load_digits
from sklearn.metrics import accuracy_score
from sklearn_genetic.mlflow import MLflowConfig

# Silence library warnings so the demo output stays readable.
warnings.filterwarnings("ignore")

# Digits dataset: feature matrix, labels and the class names.
data = load_digits()
X, y, label_names = data["data"], data["target"], data["target_names"]

# Hold out 33% of the samples for the final accuracy check (fixed seed).
split_options = {"test_size": 0.33, "random_state": 42}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

clf = DecisionTreeClassifier()

# Search space explored by GASearchCV, one entry per tree hyperparameter.
params_grid = dict(
    min_weight_fraction_leaf=Continuous(0, 0.5),
    criterion=Categorical(["gini", "entropy"]),
    max_depth=Integer(2, 20),
    max_leaf_nodes=Integer(2, 30),
)

cv = StratifiedKFold(n_splits=3, shuffle=True)

# MLflow logging settings; the tracking server is expected on localhost:5000.
mlflow_options = {
    "tracking_uri": "http://localhost:5000",
    "experiment": "Digits-sklearn-genetic-opt",
    "run_name": "Decision Tree",
    "save_models": True,
    "tags": {"team": "sklearn-genetic-opt", "version": "0.5.0"},
}
mlflow_config = MLflowConfig(**mlflow_options)

# Genetic-search settings; the MLflow config is handed over via ``log_config``.
search_options = {
    "cv": cv,
    "scoring": "accuracy",
    "population_size": 4,
    "generations": 10,
    "tournament_size": 3,
    "elitism": True,
    "crossover_probability": 0.9,
    "mutation_probability": 0.05,
    "param_grid": params_grid,
    "algorithm": "eaMuPlusLambda",
    "n_jobs": -1,
    "verbose": True,
    "log_config": mlflow_config,
}
evolved_estimator = GASearchCV(clf, **search_options)

# Run the search on the training split, then score the held-out split.
evolved_estimator.fit(X_train, y_train)
y_predict_ga = evolved_estimator.predict(X_test)
accuracy = accuracy_score(y_test, y_predict_ga)

print(evolved_estimator.best_params_)
formatted_accuracy = "{:.2f}".format(accuracy)
print("accuracy score: ", formatted_accuracy)
1 change: 1 addition & 0 deletions dev-requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ numpy>=1.13.3
seaborn>=0.11.1
deap>=1.3.1
pydantic>=1.8.2
mlflow==1.17.0
black==21.5b2
sphinx
sphinx_gallery
Expand Down
6 changes: 6 additions & 0 deletions docs/api/mlflow.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
MLflow
------

.. autoclass:: sklearn_genetic.MLflowConfig
:members:
:undoc-members: False
Binary file added docs/images/mlflow_artifacts_4.png
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added docs/images/mlflow_children_2.png
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added docs/images/mlflow_experiment_0.png
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added docs/images/mlflow_nested_run_1.png
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
9 changes: 7 additions & 2 deletions docs/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ It's advised to install sklearn-genetic using a virtual env, inside the env use:
.. |SeabornMinVersion| replace:: 0.9.0
.. |DEAPMinVersion| replace:: 1.3.1
.. |PydanticMinVersion| replace:: 1.8.2
.. |MLflowMinVersion| replace:: 1.17.0

sklearn-genetic-opt requires:

Expand All @@ -37,6 +38,7 @@ sklearn-genetic-opt requires:
- Seaborn (>= |SeabornMinVersion|)
- DEAP (>= |DEAPMinVersion|)
- Pydantic (>= |PydanticMinVersion|)
- MLflow (>= |MLflowMinVersion|)

.. toctree::
:maxdepth: 2
Expand All @@ -46,6 +48,7 @@ sklearn-genetic-opt requires:
tutorials/callbacks
tutorials/custom_callback
tutorials/understand_cv
tutorials/mlflow
release_notes

.. toctree::
Expand All @@ -54,12 +57,14 @@ sklearn-genetic-opt requires:

api/gasearchcv
api/callbacks
api/plots
api/mlflow
api/space
api/algorithms
api/plots


.. toctree::
:maxdepth: 2
:maxdepth: 1
:caption: External References:

external_references
Expand Down
14 changes: 12 additions & 2 deletions docs/release_notes.rst
Original file line number Diff line number Diff line change
Expand Up @@ -10,13 +10,23 @@ What's new in 0.5.0
Features:
^^^^^^^^^

* Added the parameter estimator to all the functions on
the module :mod:`~sklearn_genetic.callbacks`

* Built-in integration with MLflow using the :class:`~sklearn_genetic.mlflow.MLflowConfig`
and the new parameter `log_config` from :class:`~sklearn_genetic.GASearchCV`

* Implemented the callback :class:`~sklearn_genetic.callbacks.LogbookSaver`
which saves the estimator.logbook object with all the fitted hyperparameters
and their cross-validation score

* Added the parameter `estimator` to all the functions on
the module :mod:`~sklearn_genetic.callbacks`

^^^^^
Docs:
^^^^^

* Added user guide "Integrating with MLflow"

What's new in 0.4.1
-------------------

Expand Down
130 changes: 130 additions & 0 deletions docs/tutorials/mlflow.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,130 @@
Integrating with MLflow
=======================

In this post, we are going to explain how to set up the built-in integration
of sklearn-genetic-opt with MLflow.
To use this feature, we must set the parameters that will include
the tracking server, experiment, run name, tags and others,
the full implementation is here: :class:`~sklearn_genetic.mlflow.MLflowConfig`

Configuration
-------------

The configuration is pretty straightforward: we just need
to import the main class and define some parameters. Here is what each one means:

* **tracking_uri:** Address of local or remote tracking server.
* **experiment:** Case sensitive name of an experiment to be activated.
* **run_name:** Name of new run (stored as a mlflow.runName tag).
* **save_models:** If ``True``, it will log the estimator into mlflow artifacts
* **registry_uri:** Address of local or remote model registry server.
* **tags:** Dictionary of tags to apply.

Example
--------

In this example, we are going to log the information into a mlflow server
that is running in our local host, port 5000, we want to save each of the
trained models.

.. code:: python3
from sklearn_genetic.mlflow import MLflowConfig
mlflow_config = MLflowConfig(
tracking_uri="http://localhost:5000",
experiment="Digits-sklearn-genetic-opt",
run_name="Decision Tree",
save_models=True,
tags={"team": "sklearn-genetic-opt", "version": "0.5.0"})
Now, this config is passed to the :class:`~sklearn_genetic.GASearchCV` class
in the parameter named `log_config`, for example:

.. code:: python3
from sklearn_genetic import GASearchCV
from sklearn_genetic.space import Categorical, Integer, Continuous
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.datasets import load_digits
from sklearn.metrics import accuracy_score
from sklearn_genetic.mlflow import MLflowConfig
data = load_digits()
label_names = data["target_names"]
y = data["target"]
X = data["data"]
X_train, X_test, y_train, y_test = train_test_split(
X, y, test_size=0.33, random_state=42)
clf = DecisionTreeClassifier()
params_grid = {
"min_weight_fraction_leaf": Continuous(0, 0.5),
"criterion": Categorical(["gini", "entropy"]),
"max_depth": Integer(2, 20),
"max_leaf_nodes": Integer(2, 30)}
cv = StratifiedKFold(n_splits=3, shuffle=True)
evolved_estimator = GASearchCV(
clf,
cv=cv,
scoring="accuracy",
population_size=3,
generations=5,
tournament_size=3,
elitism=True,
crossover_probability=0.9,
mutation_probability=0.05,
param_grid=params_grid,
algorithm="eaMuPlusLambda",
n_jobs=-1,
verbose=True,
log_config=mlflow_config)
evolved_estimator.fit(X_train, y_train)
y_predict_ga = evolved_estimator.predict(X_test)
accuracy = accuracy_score(y_test, y_predict_ga)
print(evolved_estimator.best_params_)
Notice that we chose small values for generations and population_size, just to be
able to see the results without much verbosity.

If you go to your MLflow UI and click the experiment named "Digits-sklearn-genetic-opt",
you should see something like this (I've hidden some columns to give a better look):

.. image:: ../images/mlflow_experiment_0.png

There we can see the user that ran the experiment, the name of the file
which contained the source code, our tags and other metadata. Notice
that there is a "plus" symbol that will show us each of our iterations.
This is because sklearn-genetic-opt logs each `GASearchCV.fit()` call
in a nested way: think of it as a parent run, where each child is
one of the hyperparameter sets that were tested. For example, if we run the
same code again, we now see two parent runs:

.. image:: ../images/mlflow_nested_run_1.png

Now click on any of the "plus" symbols to see all the children; they
look like this (again, I've edited the columns to display):

.. image:: ../images/mlflow_children_2.png

From there we can see the hyperparameters and the cross-validation score
that we got in each run, and we can use the regular MLflow functionalities
like comparing runs, downloading the CSV, registering a model, etc. You can see more
at https://mlflow.org/docs/latest/index.html

Now, as we set ``save_models=True``, you can see that the column "Model"
has a file attached as an artifact. If we click on one of those, we see
a summary of that particular execution and some utilities to use the
model right away:

.. image:: ../images/mlflow_artifacts_4.png


1 change: 1 addition & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@
"seaborn>=0.9.0",
"deap>=1.3.1",
"pydantic>=1.8.2",
"mlflow>=1.17.0",
],
python_requires=">=3.7",
include_package_data=True,
Expand Down
11 changes: 10 additions & 1 deletion sklearn_genetic/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,13 @@
from .genetic_search import GASearchCV
from .plots import plot_fitness_evolution, plot_search_space
from .callbacks import ThresholdStopping, ConsecutiveStopping, DeltaThreshold
from .callbacks import (
ThresholdStopping,
ConsecutiveStopping,
DeltaThreshold,
LogbookSaver,
)
from .mlflow import MLflowConfig

from ._version import __version__

__all__ = [
Expand All @@ -10,5 +17,7 @@
"ThresholdStopping",
"ConsecutiveStopping",
"DeltaThreshold",
"LogbookSaver",
"MLflowConfig",
"__version__",
]
2 changes: 1 addition & 1 deletion sklearn_genetic/algorithms.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
from deap import tools
from deap.algorithms import varAnd, varOr

from .callbacks import eval_callbacks
from .callbacks.validations import eval_callbacks


def eaSimple(
Expand Down
9 changes: 9 additions & 0 deletions sklearn_genetic/callbacks/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
from .early_stoppers import (
DeltaThreshold,
ThresholdStopping,
ConsecutiveStopping,
)
from .loggers import LogbookSaver
from ..mlflow import MLflowConfig

# Names exported by ``from sklearn_genetic.callbacks import *``.
__all__ = ["DeltaThreshold", "ThresholdStopping", "ConsecutiveStopping", "LogbookSaver"]

0 comments on commit 2c47bee

Please sign in to comment.