This repository has been archived by the owner on Jan 13, 2024. It is now read-only.

fixes documentation and many tiny little things
sdpython committed Jun 19, 2020
1 parent bedb534 commit fd1c437
Showing 24 changed files with 365 additions and 47 deletions.
32 changes: 23 additions & 9 deletions README.rst
@@ -103,17 +103,32 @@ a pipeline directly into C and is not much developed.

**Installation**

The project relies on *sklearn-onnx* which is in active
development. Continuous integration relies on a specific
branch of this project to benefit from the latest changes:
Installation from *pip* should work unless you need the latest
development features.

::

pip install mlprodict

The package includes a runtime for *onnx*, which is why it has
a limited number of dependencies. However, some features rely on
*sklearn-onnx*, *onnxruntime*, and *scikit-learn*.
They can be installed with the following instruction:

::

pip install mlprodict[all]

Some functions in this package may rely on features
implemented in pull requests still pending. In that case,
*sklearn-onnx* should be installed from:

::

pip install git+https://github.com/xadupre/sklearn-onnx.git@jenkins

The project is currently in active development.
It is safer to install the package directly from
github:
If needed, the development version should be directly installed
from GitHub:

::

@@ -125,8 +140,7 @@ the documentation are described in `config.yml
<https://github.com/sdpython/mlprodict/blob/master/.circleci/config.yml>`_
for Linux. When this project becomes more stable,
it will change to use official releases.
Experiments with float64 are not supported with
``sklearn-onnx <= 1.5.0``.
The code is available at
`GitHub/mlprodict <https://github.com/sdpython/mlprodict/>`_
and has `online documentation <http://www.xavierdupre.fr/app/mlprodict/helpsphinx/index.html>`_.
4 changes: 2 additions & 2 deletions _doc/sphinxdoc/source/conf.py
@@ -28,7 +28,7 @@
local_template = os.path.join(os.path.abspath(
os.path.dirname(__file__)), "phdoc_templates")

set_sphinx_variables(__file__, "mlprodict", "Xavier Dupré", 2019,
set_sphinx_variables(__file__, "mlprodict", "Xavier Dupré", 2020,
"readable", sphinx_readable_theme.get_html_theme_path(),
locals(), extlinks=dict(
issue=('https://github.com/sdpython/mlprodict/issues/%s', 'issue')),
@@ -78,7 +78,7 @@
'lightgbm': 'https://lightgbm.readthedocs.io/en/latest/',
'make_scorer': 'https://scikit-learn.org/stable/modules/generated/sklearn.metrics.make_scorer.html',
'Minkowski distance': 'https://en.wikipedia.org/wiki/Minkowski_distance',
'mlinsights': '',
'mlinsights': 'http://www.xavierdupre.fr/app/mlinsights/helpsphinx/index.html',
'mlprodict': 'http://www.xavierdupre.fr/app/mlprodict/helpsphinx/index.html',
'openmp': 'https://www.openmp.org/',
'ONNX': 'https://onnx.ai/',
29 changes: 21 additions & 8 deletions _doc/sphinxdoc/source/installation.rst
@@ -2,17 +2,32 @@
Installation
============

The project relies on *sklearn-onnx* which is in active
development. Continuous integration relies on a specific
branch of this project to benefit from the latest changes:
Installation from *pip* should work unless you need the latest
development features.

::

pip install mlprodict

The package includes a runtime for *onnx*, which is why it has
a limited number of dependencies. However, some features rely on
*sklearn-onnx*, *onnxruntime*, and *scikit-learn*.
They can be installed with the following instruction:

::

pip install mlprodict[all]

Some functions in this package may rely on features
implemented in pull requests still pending. In that case,
*sklearn-onnx* should be installed from:

::

pip install git+https://github.com/xadupre/sklearn-onnx.git@jenkins

The project is currently in active development.
It is safer to install the package directly from
github:
If needed, the development version should be directly installed
from GitHub:

::

@@ -24,8 +39,6 @@ the documentation are described in `config.yml
<https://github.com/sdpython/mlprodict/blob/master/.circleci/config.yml>`_
for Linux. When this project becomes more stable,
it will change to use official releases.
Experiments with float64 are not supported with
``sklearn-onnx <= 1.5.0``.
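
A quick sanity check after installation is to import the package
and its runtime (a minimal sketch):

::

    import mlprodict
    from mlprodict.onnxrt import OnnxInference

    print(mlprodict.__version__)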

.. toctree::

3 changes: 2 additions & 1 deletion _doc/sphinxdoc/source/onnx_runtime.rst
@@ -94,7 +94,7 @@ the cause of the error if it does not work.
.. runpython::
:showcode:
:rst:
:warningout: PendingDeprecationWarning UserWarning RuntimeWarning
:warningout: PendingDeprecationWarning UserWarning RuntimeWarning FutureWarning

from logging import getLogger
from pyquickhelper.loghelper import noLOG
@@ -142,6 +142,7 @@ intermediate node anymore.

.. runpython::
:showcode:
:warningout: FutureWarning

import numpy
from sklearn.ensemble import AdaBoostRegressor
8 changes: 8 additions & 0 deletions _doc/sphinxdoc/source/phdoc_static/my-styles.css
@@ -7,6 +7,14 @@ div.highlight-ipython3 pre {
background-color: #f8f8c8;
}

div.body ul {
margin: 0em 0;
margin-top: 0em;
margin-right: 0px;
margin-bottom: 0em;
margin-left: 0px;
}

.wy-nav-top {
background-color: #FF0040;
}
34 changes: 34 additions & 0 deletions _doc/sphinxdoc/source/tutorial/onnx.rst
@@ -183,6 +183,40 @@ As a consequence, intermediate results cannot be seen anymore.
oinf = OnnxInference(model_def, runtime='python_compiled')
print(oinf.run({'X': X_test[:5]}))

From scikit-learn to ONNX
+++++++++++++++++++++++++

Function `skl2onnx.to_onnx <http://www.xavierdupre.fr/app/sklearn-onnx/helpsphinx/
api_summary.html?highlight=to_onnx#skl2onnx.to_onnx>`_ is the
main entry point to convert a *scikit-learn* pipeline into ONNX.
This package extends it into
:func:`to_onnx <mlprodict.onnx_conv.to_onnx>` to handle
dataframes (see the sketch after the example below), an extended
list of supported converters, and scorers.
It works exactly the same way:

.. runpython::
:showcode:

import numpy
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.cluster import KMeans
from mlprodict.onnx_conv import to_onnx
from mlprodict.onnxrt import OnnxInference

iris = load_iris()
X = iris.data.astype(numpy.float32)
X_train, X_test = train_test_split(X)
clr = KMeans(n_clusters=3)
clr.fit(X_train)

model_def = to_onnx(clr, X_train.astype(numpy.float32),
dtype=numpy.float32,
target_opset=12)

oinf = OnnxInference(model_def, runtime='python')
print(oinf.run({'X': X_test[:5]}))
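
:func:`to_onnx <mlprodict.onnx_conv.to_onnx>` also accepts a
*pandas* dataframe as training data. A minimal sketch, assuming the
column types drive the declared input types of the ONNX graph:

::

    import numpy
    import pandas
    from sklearn.preprocessing import StandardScaler
    from mlprodict.onnx_conv import to_onnx

    df = pandas.DataFrame({'X1': [0.1, 0.2, 0.3],
                           'X2': [1.0, 0.5, 0.1]}, dtype=numpy.float32)
    model = StandardScaler().fit(df)
    # the dataframe replaces the numpy array used above
    onx = to_onnx(model, df)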

From ONNX to Python
+++++++++++++++++++

106 changes: 106 additions & 0 deletions _doc/sphinxdoc/source/tutorial/optim.rst
@@ -78,3 +78,109 @@ instead of its standard implementation based on operator
`Scan <https://github.com/onnx/onnx/blob/master/docs/Operators.md#Scan>`_.
Section :ref:`lpy-GaussianProcess` shows how much the gain
is depending on the number of observations for this example.

Other models supporting cdist
+++++++++++++++++++++++++++++

Pairwise distances also appear in all nearest neighbors models.
The same *cdist* option is supported for these models.
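
A sketch of what this looks like for a nearest neighbors regressor,
assuming the option keeps the same spelling as for
*GaussianProcessRegressor* above:

::

    import numpy
    from sklearn.datasets import load_iris
    from sklearn.neighbors import KNeighborsRegressor
    from mlprodict.onnx_conv import to_onnx

    iris = load_iris()
    X = iris.data.astype(numpy.float32)
    y = iris.target.astype(numpy.float32)
    clr = KNeighborsRegressor().fit(X, y)

    # replaces the Scan-based pairwise distances with operator CDist
    model_def = to_onnx(
        clr, X, options={KNeighborsRegressor: {'optim': 'cdist'}})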

Option *zipmap* for classifiers
+++++++++++++++++++++++++++++++

By default, the library *sklearn-onnx* produces a list
of dictionaries ``{label: prediction}``, but this data structure
takes a significant time to build. The converted
model can stick to matrices by removing operator *ZipMap*.
This is done with option ``{'zipmap': False}``.

.. gdot::
:script: DOT-SECTION

import numpy
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from mlprodict.onnx_conv import to_onnx
from mlprodict.onnxrt import OnnxInference

iris = load_iris()
X, y = iris.data, iris.target
X_train, _, y_train, __ = train_test_split(X, y, random_state=11)
clr = LogisticRegression()
clr.fit(X_train, y_train)

model_def = to_onnx(clr, X_train, dtype=numpy.float64,
options={LogisticRegression: {'zipmap': False}})
oinf = OnnxInference(model_def)
print("DOT-SECTION", oinf.to_dot())

Option *raw_scores* for classifiers
+++++++++++++++++++++++++++++++++++

By default, the library *sklearn-onnx* produces probabilities
whenever possible for a classifier. Raw scores can usually
still be obtained with option ``{'raw_scores': True}``.

.. gdot::
:script: DOT-SECTION

import numpy
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from mlprodict.onnx_conv import to_onnx
from mlprodict.onnxrt import OnnxInference

iris = load_iris()
X, y = iris.data, iris.target
X_train, _, y_train, __ = train_test_split(X, y, random_state=11)
clr = LogisticRegression()
clr.fit(X_train, y_train)

model_def = to_onnx(clr, X_train, dtype=numpy.float64,
options={LogisticRegression: {
'zipmap': False, 'raw_scores': True}})
oinf = OnnxInference(model_def)
print("DOT-SECTION", oinf.to_dot())

Picklability and Pipeline
+++++++++++++++++++++++++

The proposed way to specify options is not always picklable:
``id(model)`` changes from one execution to the next, and mapping
an option to a class may not be enough to customize the conversion.
However, it is possible to specify an option the same way
parameters are referenced in a *scikit-learn* pipeline
with method `get_params <https://scikit-learn.org/stable/modules/generated/
sklearn.pipeline.Pipeline.html#sklearn.pipeline.Pipeline.get_params>`_.
The following syntaxes are supported:

::

pipe = Pipeline([('pca', PCA()), ('classifier', LogisticRegression())])

options = {'classifier': {'zipmap': False}}

Or

::

options = {'classifier__zipmap': False}

Options apply to a single model, not a pipeline, as the converter
replaces the pipeline structure with a single ONNX graph.
Following that rule, option *zipmap* has no effect when
applied to the pipeline itself instead of its last step.
However, because there is no ambiguity about what the conversion
should be, for options *zipmap* and *nocl*, the following
options all have the same effect:

::

pipe = Pipeline([('pca', PCA()), ('classifier', LogisticRegression())])

options = {id(pipe.steps[-1][1]): {'zipmap': False}}
options = {id(pipe): {'zipmap': False}}
options = {'classifier': {'zipmap': False}}
options = {'classifier__zipmap': False}
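
A complete sketch using the step-name syntax, which does not depend
on ``id(model)`` and therefore survives pickling:

::

    import numpy
    from sklearn.datasets import load_iris
    from sklearn.decomposition import PCA
    from sklearn.linear_model import LogisticRegression
    from sklearn.pipeline import Pipeline
    from mlprodict.onnx_conv import to_onnx

    iris = load_iris()
    X = iris.data.astype(numpy.float32)
    pipe = Pipeline([('pca', PCA(n_components=2)),
                     ('classifier', LogisticRegression())])
    pipe.fit(X, iris.target)

    model_def = to_onnx(pipe, X,
                        options={'classifier__zipmap': False})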
1 change: 0 additions & 1 deletion _unittests/ut_asv_benchmark/test_asv_json_text.py
@@ -129,5 +129,4 @@ def test_to_str_coordinates(self):


if __name__ == "__main__":
# TestAsvJsonText().test_to_str_coordinates()
unittest.main()
1 change: 0 additions & 1 deletion _unittests/ut_asv_benchmark/test_create_asv_benchmark.py
@@ -211,5 +211,4 @@ def test_create_asv_benchmark_gpr(self):


if __name__ == "__main__":
# TestCreateAsvBenchmark().test_create_asv_benchmark_noflat_ext()
unittest.main()
33 changes: 33 additions & 0 deletions _unittests/ut_onnxrt/test_onnx_inference.py
@@ -6,7 +6,12 @@
import numpy
from onnx import helper
from onnx import TensorProto
from sklearn.datasets import load_iris
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
from pyquickhelper.pycode import ExtTestCase
from pyquickhelper.loghelper import BufferedPrint
from mlprodict.onnx_conv import to_onnx
from mlprodict.onnxrt import OnnxInference


@@ -62,6 +67,34 @@ def test_onnx_inference_name_confusion_input(self):
got = res['Z']
self.assertEqualArray(exp, got, decimal=6)

def test_onnx_inference_verbose(self):
iris = load_iris()
X, y = iris.data, iris.target
X_train, X_test, __, _ = train_test_split(X, y, random_state=11)
clr = KMeans()
clr.fit(X_train)
model_def = to_onnx(clr, X_train.astype(numpy.float32))
for runtime in ['python', 'python_compiled']:
with self.subTest(runtime=runtime):
oinf = OnnxInference(model_def)
buf = BufferedPrint()
got = oinf.run({'X': X_test.astype(numpy.float32)},
verbose=15, fLOG=buf.fprint)
self.assertIsInstance(got, dict)
res = str(buf)
self.assertIn('+kr', res)
self.assertIn('+ki', res)
self.assertIn('Onnx-Gemm', res)
self.assertIn('min=', res)
self.assertIn('max=', res)
self.assertIn('dtype=', res)
inp = oinf.input_names_shapes
self.assertIsInstance(inp, list)
inp = oinf.input_names_shapes_types
self.assertIsInstance(inp, list)
out = oinf.output_names_shapes
self.assertIsInstance(out, list)


if __name__ == "__main__":
unittest.main()
1 change: 0 additions & 1 deletion _unittests/ut_onnxrt/test_rt_valid_model__batch_mode.py
@@ -129,5 +129,4 @@ def myprint(*args, **kwargs):


if __name__ == "__main__":
# TestRtValidateBatchMode().test_rt_rfe()
unittest.main()
@@ -236,5 +236,4 @@ def myprint(*args, **kwargs):


if __name__ == "__main__":
TestRtValidateGaussianProcessOrt().test_rt_GaussianProcessRegressor_debug_std()
unittest.main()
