Merge pull request #22 from rodrigo-arenas/0.5.xdev

MLflow integration
rodrigo-arenas · Jun 21, 2021 · 2c47bee · 2c47bee
2 parents f4f786b + 839334f
commit 2c47bee
Show file tree

Hide file tree

Showing 22 changed files with 452 additions and 125 deletions.
diff --git a/README.rst b/README.rst
@@ -46,6 +46,7 @@ Main Features:
 * **Algorithms**: Set of different evolutionary algorithms to use as optimization procedure
 * **Callbacks**: Custom evaluation strategies to generate Early Stopping rules
 * **Plots**: Generate pre-define plots to understand the optimization process
+* **MLflow**: Built-in integration with MLflow to log all the hyperparameters and their cv-score
 
 Usage:
 ######

diff --git a/demo/Demo_Digits_Dataset.ipynb b/demo/Demo_Digits_Dataset.ipynb
@@ -17,15 +17,15 @@
     }
    ],
    "source": [
-    "from sklearn_genetic_opt import GASearchCV\n",
+    "from sklearn_genetic import GASearchCV\n",
     "from sklearn.linear_model import SGDClassifier\n",
     "from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV\n",
+    "from sklearn_genetic.space import Categorical, Continuous\n",
     "import scipy.stats as stats\n",
     "from sklearn.utils.fixes import loguniform\n",
     "from sklearn.datasets import load_digits\n",
     "from sklearn.metrics import accuracy_score\n",
     "import numpy as np\n",
-    "import itertools\n",
     "import warnings\n",
     "warnings.filterwarnings(\"ignore\")"
    ]
@@ -266,16 +266,18 @@
    "metadata": {},
    "outputs": [],
    "source": [
+    "param_grid = {'l1_ratio': Continuous(0,1),\n",
+    "              'alpha': Continuous(1e-4,1),\n",
+    "              'average': Categorical([True, False])}\n",
+    "\n",
     "evolved_estimator = GASearchCV(clf,\n",
     "                    cv=3,\n",
     "                    scoring='accuracy',\n",
+    "                    param_grid=param_grid,\n",
     "                    population_size=10,\n",
     "                    generations=8,\n",
     "                    tournament_size=3,\n",
     "                    elitism=True,\n",
-    "                    continuous_parameters = {'l1_ratio':(0,1), 'alpha':(1e-4,1)},\n",
-    "                    categorical_parameters = {'average': [True, False]},\n",
-    "                    integer_parameters = {},\n",
     "                    verbose=True)"
    ]
   },
@@ -393,4 +395,4 @@
  },
  "nbformat": 4,
  "nbformat_minor": 2
-}
+}
diff --git a/demo/mlflow_logger.py b/demo/mlflow_logger.py
@@ -0,0 +1,62 @@
+import warnings
+from sklearn_genetic import GASearchCV
+from sklearn_genetic.space import Categorical, Integer, Continuous
+from sklearn.model_selection import train_test_split, StratifiedKFold
+from sklearn.tree import DecisionTreeClassifier
+from sklearn.datasets import load_digits
+from sklearn.metrics import accuracy_score
+from sklearn_genetic.mlflow import MLflowConfig
+
+warnings.filterwarnings("ignore")
+
+data = load_digits()
+label_names = data["target_names"]
+y = data["target"]
+X = data["data"]
+
+X_train, X_test, y_train, y_test = train_test_split(
+    X, y, test_size=0.33, random_state=42
+)
+
+clf = DecisionTreeClassifier()
+
+params_grid = {
+    "min_weight_fraction_leaf": Continuous(0, 0.5),
+    "criterion": Categorical(["gini", "entropy"]),
+    "max_depth": Integer(2, 20),
+    "max_leaf_nodes": Integer(2, 30),
+}
+
+cv = StratifiedKFold(n_splits=3, shuffle=True)
+
+mlflow_config = MLflowConfig(
+    tracking_uri="http://localhost:5000",
+    experiment="Digits-sklearn-genetic-opt",
+    run_name="Decision Tree",
+    save_models=True,
+    tags={"team": "sklearn-genetic-opt", "version": "0.5.0"},
+)
+
+evolved_estimator = GASearchCV(
+    clf,
+    cv=cv,
+    scoring="accuracy",
+    population_size=4,
+    generations=10,
+    tournament_size=3,
+    elitism=True,
+    crossover_probability=0.9,
+    mutation_probability=0.05,
+    param_grid=params_grid,
+    algorithm="eaMuPlusLambda",
+    n_jobs=-1,
+    verbose=True,
+    log_config=mlflow_config,
+)
+
+evolved_estimator.fit(X_train, y_train)
+y_predict_ga = evolved_estimator.predict(X_test)
+accuracy = accuracy_score(y_test, y_predict_ga)
+
+print(evolved_estimator.best_params_)
+print("accuracy score: ", "{:.2f}".format(accuracy))
diff --git a/dev-requirements.txt b/dev-requirements.txt
@@ -7,6 +7,7 @@ numpy>=1.13.3
 seaborn>=0.11.1
 deap>=1.3.1
 pydantic>=1.8.2
+mlflow==1.17.0
 black==21.5b2
 sphinx
 sphinx_gallery

diff --git a/docs/api/mlflow.rst b/docs/api/mlflow.rst
@@ -0,0 +1,6 @@
+MLflow
+------
+
+.. autoclass:: sklearn_genetic.MLflowConfig
+   :members:
+   :undoc-members: False
diff --git a/docs/images/mlflow_artifacts_4.png b/docs/images/mlflow_artifacts_4.png
diff --git a/docs/images/mlflow_children_2.png b/docs/images/mlflow_children_2.png
diff --git a/docs/images/mlflow_experiment_0.png b/docs/images/mlflow_experiment_0.png
diff --git a/docs/images/mlflow_nested_run_1.png b/docs/images/mlflow_nested_run_1.png
diff --git a/docs/index.rst b/docs/index.rst
@@ -28,6 +28,7 @@ It's advised to install sklearn-genetic using a virtual env, inside the env use:
 .. |SeabornMinVersion| replace:: 0.9.0
 .. |DEAPMinVersion| replace:: 1.3.1
 .. |PydanticMinVersion| replace:: 1.8.2
+.. |MLflowMinVersion| replace:: 1.17.0
 
 sklearn-genetic-opt requires:
 
@@ -37,6 +38,7 @@ sklearn-genetic-opt requires:
 - Seaborn (>= |SeabornMinVersion|)
 - DEAP (>= |DEAPMinVersion|)
 - Pydantic (>= |PydanticMinVersion|)
+- MLflow (>= |MLflowMinVersion|)
 
 .. toctree::
    :maxdepth: 2
@@ -46,6 +48,7 @@ sklearn-genetic-opt requires:
    tutorials/callbacks
    tutorials/custom_callback
    tutorials/understand_cv
+   tutorials/mlflow
    release_notes
 
 .. toctree::
@@ -54,12 +57,14 @@ sklearn-genetic-opt requires:
 
    api/gasearchcv
    api/callbacks
+   api/plots
+   api/mlflow
    api/space
    api/algorithms
-   api/plots
+
 
 .. toctree::
-   :maxdepth: 2
+   :maxdepth: 1
    :caption: External References:
 
    external_references

diff --git a/docs/release_notes.rst b/docs/release_notes.rst
@@ -10,13 +10,23 @@ What's new in 0.5.0
 Features:
 ^^^^^^^^^
 
-* Added the parameter estimator to all the functions on
-  the module :mod:`~sklearn_genetic.callbacks`
+
+* Built-in integration with MLflow using the :class:`~sklearn_genetic.mlflow.MLflowConfig`
+  and the new parameter `log_config` from :class:`~sklearn_genetic.GASearchCV`
 
 * Implemented the callback :class:`~sklearn_genetic.callbacks.LogbookSaver`
   which saves the estimator.logbook object with all the fitted hyperparameters
   and their cross-validation score
 
+* Added the parameter `estimator` to all the functions on
+  the module :mod:`~sklearn_genetic.callbacks`
+
+^^^^^
+Docs:
+^^^^^
+
+* Added user guide "Integrating with MLflow"
+
 What's new in 0.4.1
 -------------------
 

diff --git a/docs/tutorials/mlflow.rst b/docs/tutorials/mlflow.rst
@@ -0,0 +1,130 @@
+Integrating with MLflow
+=======================
+
+In this post, we are going to explain how to set up the built-in integration
+of sklearn-genetic-opt with MLflow.
+To use this feature, we must set the parameters that will include
+the tracking server, experiment, run name, tags and others.
+The full implementation is here: :class:`~sklearn_genetic.mlflow.MLflowConfig`
+
+Configuration
+-------------
+
+The configuration is pretty straightforward: we just need
+to import the main class and define some parameters. Here is what each one means:
+
+* **tracking_uri:** Address of local or remote tracking server.
+* **experiment:** Case sensitive name of an experiment to be activated.
+* **run_name:** Name of new run (stored as a mlflow.runName tag).
+* **save_models:** If ``True``, it will log the estimator into mlflow artifacts
+* **registry_uri:** Address of local or remote model registry server.
+* **tags:** Dictionary of tags to apply.
+
+Example
+--------
+
+In this example, we are going to log the information into an MLflow server
+that is running on our local host, port 5000, and we want to save each of the
+trained models.
+
+.. code:: python3
+
+    from sklearn_genetic.mlflow import MLflowConfig
+
+    mlflow_config = MLflowConfig(
+        tracking_uri="http://localhost:5000",
+        experiment="Digits-sklearn-genetic-opt",
+        run_name="Decision Tree",
+        save_models=True,
+        tags={"team": "sklearn-genetic-opt", "version": "0.5.0"})
+
+Now, this config is passed to the :class:`~sklearn_genetic.GASearchCV` class
+in the parameter named `log_config`, for example:
+
+.. code:: python3
+
+    from sklearn_genetic import GASearchCV
+    from sklearn_genetic.space import Categorical, Integer, Continuous
+    from sklearn.model_selection import train_test_split, StratifiedKFold
+    from sklearn.tree import DecisionTreeClassifier
+    from sklearn.datasets import load_digits
+    from sklearn.metrics import accuracy_score
+    from sklearn_genetic.mlflow import MLflowConfig
+
+
+    data = load_digits()
+    label_names = data["target_names"]
+    y = data["target"]
+    X = data["data"]
+
+    X_train, X_test, y_train, y_test = train_test_split(
+        X, y, test_size=0.33, random_state=42)
+
+    clf = DecisionTreeClassifier()
+
+    params_grid = {
+        "min_weight_fraction_leaf": Continuous(0, 0.5),
+        "criterion": Categorical(["gini", "entropy"]),
+        "max_depth": Integer(2, 20),
+        "max_leaf_nodes": Integer(2, 30)}
+
+    cv = StratifiedKFold(n_splits=3, shuffle=True)
+
+    evolved_estimator = GASearchCV(
+        clf,
+        cv=cv,
+        scoring="accuracy",
+        population_size=3,
+        generations=5,
+        tournament_size=3,
+        elitism=True,
+        crossover_probability=0.9,
+        mutation_probability=0.05,
+        param_grid=params_grid,
+        algorithm="eaMuPlusLambda",
+        n_jobs=-1,
+        verbose=True,
+        log_config=mlflow_config)
+
+    evolved_estimator.fit(X_train, y_train)
+    y_predict_ga = evolved_estimator.predict(X_test)
+    accuracy = accuracy_score(y_test, y_predict_ga)
+
+    print(evolved_estimator.best_params_)
+
+Notice that we chose small values for generations and population_size, just to be
+able to see the results without much verbosity.
+
+If you go to your mlflow UI and click the experiment named "Digits-sklearn-genetic-opt",
+you should see something like this (I've hidden some columns to give a better look):
+
+.. image:: ../images/mlflow_experiment_0.png
+
+There we can see the user that ran the experiment, the name of the file
+which contained the source code, our tags and other metadata. Notice
+that there is a "plus" symbol that will show us each of our iterations.
+This is because sklearn-genetic-opt will log each `GASearchCV.fit()` call
+in a nested way: think of it as a parent run, where each child is
+one of the hyperparameter sets that were tested. For example, if we run the
+same code again, we now see two parent runs:
+
+.. image:: ../images/mlflow_nested_run_1.png
+
+Now click on any of the "plus" symbols to see all the children, now they
+look like this (again edited the columns to display):
+
+.. image:: ../images/mlflow_children_2.png
+
+From there we can see the hyperparameters and the score (cross-validation)
+that we got in each run. We can also use the regular mlflow functionalities
+like comparing runs, downloading the CSV, registering a model, etc. You can see more
+at https://mlflow.org/docs/latest/index.html
+
+Now, as we set ``save_models=True``, you can see that the column "Model"
+has a file attached as an artifact. If we click on one of those, we see
+a summary of that particular execution and some utilities to use the
+model right away:
+
+.. image:: ../images/mlflow_artifacts_4.png
+
+
diff --git a/setup.py b/setup.py
@@ -40,6 +40,7 @@
         "seaborn>=0.9.0",
         "deap>=1.3.1",
         "pydantic>=1.8.2",
+        "mlflow>=1.17.0",
     ],
     python_requires=">=3.7",
     include_package_data=True,

diff --git a/sklearn_genetic/__init__.py b/sklearn_genetic/__init__.py
@@ -1,6 +1,13 @@
 from .genetic_search import GASearchCV
 from .plots import plot_fitness_evolution, plot_search_space
-from .callbacks import ThresholdStopping, ConsecutiveStopping, DeltaThreshold
+from .callbacks import (
+    ThresholdStopping,
+    ConsecutiveStopping,
+    DeltaThreshold,
+    LogbookSaver,
+)
+from .mlflow import MLflowConfig
+
 from ._version import __version__
 
 __all__ = [
@@ -10,5 +17,7 @@
     "ThresholdStopping",
     "ConsecutiveStopping",
     "DeltaThreshold",
+    "LogbookSaver",
+    "MLflowConfig",
     "__version__",
 ]
diff --git a/sklearn_genetic/algorithms.py b/sklearn_genetic/algorithms.py
@@ -4,7 +4,7 @@
 from deap import tools
 from deap.algorithms import varAnd, varOr
 
-from .callbacks import eval_callbacks
+from .callbacks.validations import eval_callbacks
 
 
 def eaSimple(

diff --git a/sklearn_genetic/callbacks/__init__.py b/sklearn_genetic/callbacks/__init__.py
@@ -0,0 +1,9 @@
+from .early_stoppers import (
+    DeltaThreshold,
+    ThresholdStopping,
+    ConsecutiveStopping,
+)
+from .loggers import LogbookSaver
+from ..mlflow import MLflowConfig
+
+__all__ = ["DeltaThreshold", "ThresholdStopping", "ConsecutiveStopping", "LogbookSaver"]