4 changes: 2 additions & 2 deletions .circleci/config.yml
@@ -49,8 +49,8 @@ jobs:
          name: flake8
          command: |
            . venv/bin/activate
            python -m flake8 onnxcustom --max-line-length 90
            python -m flake8 _doc/examples --max-line-length 90
            python -m flake8 onnxcustom --max-line-length 100
            python -m flake8 _doc/examples --max-line-length 100

      - run:
          name: run tests
1 change: 1 addition & 0 deletions .gitignore
@@ -60,3 +60,4 @@ _unittests/ut_documentation/plot_linear_regression.png
_unittests/ut_documentation/summary.csv
_unittests/ut_documentation/_test_example.txt
_unittests/ut_documentation/_test_example.txt
something
160 changes: 160 additions & 0 deletions _doc/examples/plot_funny_sigmoid.py
@@ -0,0 +1,160 @@
"""
.. _l-example-discrepencies-sigmoid:

Funny discrepancies
===================

The sigmoid function is :math:`sig(x) = \\frac{1}{1 + e^{-x}}`.
For very small or very large values, implementations have to approximate it
and they do not all do it the same way. It is usually a tradeoff between
precision and computation time.

.. index:: discrepancies, sigmoid

.. contents::
:local:


Precision
+++++++++

This section compares the precision of a couple of implementations
of the sigmoid function. The custom implementation uses
a Taylor expansion of the exponential function:
:math:`e^x \\sim 1 + x + \\frac{x^2}{2} + ... + \\frac{x^n}{n!} + o(x^n)`.

"""
import time
import numpy
import pandas
from tqdm import tqdm
from scipy.special import expit

from skl2onnx.algebra.onnx_ops import OnnxSigmoid
from skl2onnx.common.data_types import FloatTensorType
from onnxruntime import InferenceSession
from mlprodict.onnxrt import OnnxInference
from onnxcustom import get_max_opset
import matplotlib.pyplot as plt

one = numpy.array([1], dtype=numpy.float64)


def taylor_approximation_exp(x, degre=50):
    # truncated expansion 1 + x + x^2/2 + ... + x^degre / degre!
    y = numpy.ones(x.shape, dtype=x.dtype)
    a = numpy.ones(x.shape, dtype=x.dtype)
    for i in range(1, degre + 1):
        a *= x / i
        y += a
    return y


def taylor_sigmoid(x, degre=50):
    # sig(x) = 1 / (1 + e^{-x}), with e^{-x} replaced by its truncated expansion
    den = one + taylor_approximation_exp(-x, degre)
    return one / den
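

# A quick comparison, for illustration only, between the Taylor-based
# implementation and scipy's expit on one arbitrarily chosen value.
print(taylor_sigmoid(numpy.array([-4.], dtype=numpy.float64), 50))
print(expit(numpy.array([-4.], dtype=numpy.float64)))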


opset = get_max_opset()
N = 300
min_values = [-20 + float(i) * 10 / N for i in range(N)]
data = numpy.array([0], dtype=numpy.float32)

node = OnnxSigmoid('X', op_version=opset, output_names=['Y'])
onx = node.to_onnx({'X': FloatTensorType()},
                   {'Y': FloatTensorType()},
                   target_opset=opset)
rts = ['numpy', 'python', 'onnxruntime', 'taylor20', 'taylor40']

oinf = OnnxInference(onx)
sess = InferenceSession(onx.SerializeToString())

graph = []
for mv in tqdm(min_values):
    data[0] = mv
    for rt in rts:
        lab = ""
        if rt == 'numpy':
            y = expit(data)
        elif rt == 'python':
            y = oinf.run({'X': data})['Y']
            # scaled by 1.2 so that the curves are not superimposed
            y *= 1.2
            lab = "x1.2"
        elif rt == 'onnxruntime':
            y = sess.run(None, {'X': data})[0]
        elif rt == 'taylor40':
            y = taylor_sigmoid(data, 40)
            # scaled by 0.8 so that the curves are not superimposed
            y *= 0.8
            lab = "x0.8"
        elif rt == 'taylor20':
            y = taylor_sigmoid(data, 20)
            # scaled by 0.6 so that the curves are not superimposed
            y *= 0.6
            lab = "x0.6"
        else:
            raise AssertionError("Unknown runtime %r." % rt)
        value = y[0]
        graph.append(dict(rt=rt + lab, x=mv, y=value))

#############################################
# Graph.

_, ax = plt.subplots(1, 1, figsize=(12, 4))
df = pandas.DataFrame(graph)
piv = df.pivot('x', 'rt', 'y')
print(piv.T.head())
piv.plot(ax=ax, logy=True)

##############################################
# :math:`log(sig(x)) = -log(1 + e^{-x})`. When *x* is very negative,
# :math:`log(sig(x)) \\sim x`, so the curve becomes a straight line on the
# logarithmic scale. That explains the graph.
# We also see that :epkg:`onnxruntime` is less precise for these values.
# What's the benefit?
#
# Computation time
# ++++++++++++++++

graph = []
for mv in tqdm(min_values):
    data = numpy.array([mv] * 10000, dtype=numpy.float32)
    for rt in rts:
        begin = time.perf_counter()
        if rt == 'numpy':
            y = expit(data)
        elif rt == 'python':
            y = oinf.run({'X': data})['Y']
        elif rt == 'onnxruntime':
            y = sess.run(None, {'X': data})[0]
        elif rt == 'taylor40':
            y = taylor_sigmoid(data, 40)
        elif rt == 'taylor20':
            y = taylor_sigmoid(data, 20)
        else:
            raise AssertionError("Unknown runtime %r." % rt)
        duration = time.perf_counter() - begin
        graph.append(dict(rt=rt, x=mv, y=duration))

_, ax = plt.subplots(1, 1, figsize=(12, 4))
df = pandas.DataFrame(graph)
piv = df.pivot('x', 'rt', 'y')
piv.plot(ax=ax, logy=True)

#############################################
# Conclusion
# ++++++++++
#
# The implementation from :epkg:`onnxruntime` is faster but much less
# precise for extreme values. That explains why the probabilities may be
# very different when an observation is far from every classification
# boundary. In that case, the :epkg:`onnxruntime` implementation of the
# sigmoid function returns 0 where :func:`scipy.special.expit` still
# returns a small non-zero value.
# The probabilities of a logistic regression are obtained by transforming
# the raw scores with the sigmoid function and then normalizing them.
# If the raw scores are very negative, the sum of the probabilities becomes
# zero with :epkg:`onnxruntime` and the normalization fails.
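
# A small illustrative sketch of the failure mode described above (not part
# of the benchmark; the threshold below is arbitrary): if every per-class
# score is so negative that the sigmoid is rounded to zero, the sum of the
# probabilities is zero and normalizing by it is impossible.
raw_scores = numpy.array([-40.0, -45.0], dtype=numpy.float32)
smooth = expit(raw_scores)  # tiny but non-zero values
print(smooth / smooth.sum())  # still a valid probability distribution
rounded = numpy.where(smooth < 1e-15, 0., smooth)  # mimics a runtime returning 0
print(rounded.sum())  # 0: the normalization would divide by zero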

# plt.show()
11 changes: 8 additions & 3 deletions _doc/sphinxdoc/source/api/training.rst
@@ -26,7 +26,7 @@ BaseEstimator

Ancestor to both classes wrapping the :epkg:`onnxruntime` API.

.. autosignature:: onnxcustom.training.base_estimator.BaseEstimator
.. autosignature:: onnxcustom.training._base_estimator.BaseEstimator
    :members:

Exceptions
@@ -52,6 +52,8 @@ be combination of L1, L2 losses and L1, L2 penalties.

.. autosignature:: onnxcustom.utils.orttraining_helper.get_train_initializer

.. autosignature:: onnxcustom.utils.onnx_rewriter.onnx_rewrite_operator

.. _l-api-prt-gradient-optimizer:

OrtGradientOptimizer
@@ -100,11 +102,14 @@ LearningLoss
.. autosignature:: onnxcustom.training.sgd_learning_loss.ElasticLearningLoss
    :members:

.. autosignature:: onnxcustom.training.sgd_learning_loss.NegLogLearningLoss
    :members:

.. autosignature:: onnxcustom.training.sgd_learning_loss.SquareLearningLoss
    :members:
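
A short usage sketch for the negative log loss, adapted from the unit tests
added in this pull request (``onx``, ``X`` and ``y`` are assumed to come from
a binary classifier already converted to ONNX; the numeric values are only
illustrative)::

    from onnxcustom.training.optimizers_partial import (
        OrtGradientForwardBackwardOptimizer)
    from onnxcustom.training.sgd_learning_rate import LearningRateSGDNesterov
    from onnxcustom.training.sgd_learning_loss import NegLogLearningLoss

    train_session = OrtGradientForwardBackwardOptimizer(
        onx, ['coef', 'intercept'],
        learning_rate=LearningRateSGDNesterov(
            1e-4, nesterov=False, momentum=0.9),
        learning_loss=NegLogLearningLoss(),
        warm_start=False, max_iter=100, batch_size=10)
    train_session.fit(X, y)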

Loss function
+++++++++++++
Loss functions
++++++++++++++

.. autosignature:: onnxcustom.utils.onnx_function.function_onnx_graph

@@ -53,7 +53,7 @@ optimized weights.

:epkg:`onnxruntime-training` does not implement loss functions.
That must be done independently. That's what function
:func:`onnxcustom.utils.orttraining_helper.add_loss_output>` does.
:func:`onnxcustom.utils.orttraining_helper.add_loss_output` does.
It implements a couple of usual losses in ONNX.
Another function :func:`onnxcustom.utils.orttraining_helper.get_train_initializer`
guesses all the coefficients of an ONNX graph if the user does not specify any.
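
A minimal sketch, adapted from the unit tests added in this pull request,
of how a loss output is appended before training (``onx``, ``X_train`` and
``y_train`` are assumed to come from a classifier already converted to ONNX;
the argument values are only illustrative)::

    from onnxcustom.utils.orttraining_helper import add_loss_output
    from onnxcustom.training.optimizers import OrtGradientOptimizer

    # append a log loss computed from the classifier output at index 1
    onx_loss = add_loss_output(onx, 'log', output_index=1)
    train_session = OrtGradientOptimizer(
        onx_loss, ['intercept', 'coef'], learning_rate=1e-3)
    train_session.fit(X_train, y_train.reshape((-1, 1)), use_numpy=True)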
1 change: 1 addition & 0 deletions _doc/sphinxdoc/source/tutorial_skl/tutorial_1_simple.rst
@@ -21,6 +21,7 @@ used in the ONNX graph.
    ../gyexamples/plot_dbegin_options_list
    ../gyexamples/plot_dbegin_options_zipmap
    ../gyexamples/plot_ebegin_float_double
    ../gyexamples/plot_funny_sigmoid
    ../gyexamples/plot_fbegin_investigate
    ../gyexamples/plot_gbegin_dataframe
    ../gyexamples/plot_gbegin_transfer_learning
@@ -117,11 +117,11 @@ Cache
+++++

Base class :class:`BaseLearningOnnx
<onnxcustom.training.base_onnx_function.BaseLearningOnnx>` implements
<onnxcustom.training._base_onnx_function.BaseLearningOnnx>` implements
methods :meth:`_bind_input_ortvalue
<onnxcustom.training.base_onnx_function.BaseLearningOnnx._bind_input_ortvalue>`
<onnxcustom.training._base_onnx_function.BaseLearningOnnx._bind_input_ortvalue>`
and :meth:`_bind_output_ortvalue
<onnxcustom.training.base_onnx_function.BaseLearningOnnx._bind_output_ortvalue>`
<onnxcustom.training._base_onnx_function.BaseLearningOnnx._bind_output_ortvalue>`
used by the three components mentioned above. They cache the bound pointers
(the value returned by `c_ortvalue.data_ptr()`) and do not bind again
if the method is called again with a different `OrtValue` but the same pointer
42 changes: 42 additions & 0 deletions _unittests/ut_module/test_onnx_runtimes.py
@@ -0,0 +1,42 @@
"""
@brief test log(time=0s)
"""
import unittest
import numpy
from scipy.special import expit # pylint: disable=E0611
from pyquickhelper.pycode import ExtTestCase
from skl2onnx.algebra.onnx_ops import OnnxSigmoid # pylint: disable=E0611
from skl2onnx.common.data_types import FloatTensorType
from mlprodict.onnxrt import OnnxInference
from onnxcustom import get_max_opset


class TestOnnxRuntimes(ExtTestCase):
    """Test style."""

    def test_check(self):
        opset = get_max_opset()
        min_values = [-41.621277, -40.621277, -30.621277, -20.621277,
                      -19, -18, -17, -15, -14, -13, -12, -11, -10, -5, -2]
        data = numpy.array(
            [[0]],
            dtype=numpy.float32)

        node = OnnxSigmoid('X', op_version=opset, output_names=['Y'])
        onx = node.to_onnx({'X': FloatTensorType()},
                           {'Y': FloatTensorType()},
                           target_opset=opset)
        rts = ['numpy', 'python', 'onnxruntime1']
        for mv in min_values:
            data[:, 0] = mv
            for rt in rts:
                if rt == 'numpy':
                    y = expit(data)
                else:
                    oinf = OnnxInference(onx, runtime=rt)
                    y = oinf.run({'X': data})['Y']
                self.assertNotEmpty(y)


if __name__ == "__main__":
    unittest.main()
2 changes: 1 addition & 1 deletion _unittests/ut_training/test_optimizers.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
"""
@brief test log(time=8s)
@brief test log(time=9s)
"""
import os
import unittest
95 changes: 95 additions & 0 deletions _unittests/ut_training/test_optimizers_classification.py
@@ -0,0 +1,95 @@
"""
@brief test log(time=8s)
"""
import unittest
from pyquickhelper.pycode import ExtTestCase, get_temp_folder
import numpy
from onnx.helper import set_model_props
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDClassifier
from mlprodict.onnx_conv import to_onnx
from mlprodict.plotting.text_plot import onnx_simple_text_plot
from mlprodict.onnx_tools.onnx_manipulations import select_model_inputs_outputs
# from mlprodict.onnxrt import OnnxInference
from onnxcustom import __max_supported_opset__ as opset
try:
    from onnxruntime import TrainingSession
except ImportError:
    # onnxruntime not training
    TrainingSession = None


class TestOptimizersClassification(ExtTestCase):

    @unittest.skipIf(TrainingSession is None, reason="not training")
    def test_ort_gradient_optimizers_binary(self):
        from onnxcustom.utils.orttraining_helper import add_loss_output
        from onnxcustom.training.optimizers import OrtGradientOptimizer
        X, y = make_classification(  # pylint: disable=W0632
            100, n_features=10, random_state=0)
        X = X.astype(numpy.float32)
        y = y.astype(numpy.int64)
        X_train, _, y_train, __ = train_test_split(X, y)
        reg = SGDClassifier(loss='log')
        reg.fit(X_train, y_train)
        onx = to_onnx(reg, X_train, target_opset=opset,
                      black_op={'LinearClassifier'},
                      options={'zipmap': False})
        set_model_props(onx, {'info': 'unit test'})
        onx_loss = add_loss_output(onx, 'log', output_index=1)
        inits = ['intercept', 'coef']
        train_session = OrtGradientOptimizer(
            onx_loss, inits, learning_rate=1e-3)
        self.assertRaise(lambda: train_session.get_state(), AttributeError)
        train_session.fit(X_train, y_train.reshape((-1, 1)), use_numpy=True)
        state_tensors = train_session.get_state()
        self.assertEqual(len(state_tensors), 2)
        r = repr(train_session)
        self.assertIn("OrtGradientOptimizer(model_onnx=", r)
        self.assertIn("learning_rate='invscaling'", r)
        losses = train_session.train_losses_
        self.assertGreater(len(losses), 1)
        self.assertFalse(any(map(numpy.isnan, losses)))

    @unittest.skipIf(TrainingSession is None, reason="not training")
    def test_ort_gradient_optimizers_fw_nesterov_binary(self):
        from onnxcustom.training.optimizers_partial import (
            OrtGradientForwardBackwardOptimizer)
        from onnxcustom.training.sgd_learning_rate import (
            LearningRateSGDNesterov)
        from onnxcustom.training.sgd_learning_loss import NegLogLearningLoss
        X, y = make_classification(  # pylint: disable=W0632
            100, n_features=10, random_state=0)
        X = X.astype(numpy.float32)
        y = y.astype(numpy.int64)
        X_train, _, y_train, __ = train_test_split(X, y)
        reg = SGDClassifier(loss='log')
        reg.fit(X_train, y_train)
        onx = to_onnx(reg, X_train, target_opset=opset,
                      black_op={'LinearRegressor'},
                      options={'zipmap': False,
                               'raw_scores': True})
        onx = select_model_inputs_outputs(onx, outputs=['score'])
        self.assertIn("output: name='score'",
                      onnx_simple_text_plot(onx))
        set_model_props(onx, {'info': 'unit test'})
        inits = ['coef', 'intercept']

        train_session = OrtGradientForwardBackwardOptimizer(
            onx, inits,
            learning_rate=LearningRateSGDNesterov(
                1e-4, nesterov=False, momentum=0.9),
            learning_loss=NegLogLearningLoss(),
            warm_start=False, max_iter=100, batch_size=10)
        self.assertIsInstance(train_session.learning_loss, NegLogLearningLoss)
        self.assertEqual(train_session.learning_loss.eps, 1e-5)
        train_session.fit(X, y)
        temp = get_temp_folder(
            __file__, "temp_ort_gradient_optimizers_fw_nesterov_binary")
        train_session.save_onnx_graph(temp)


if __name__ == "__main__":
    TestOptimizersClassification().test_ort_gradient_optimizers_fw_nesterov_binary()
    unittest.main()
@@ -401,4 +401,4 @@ def test_forward_training_logreg(self):

if __name__ == "__main__":
    # TestOrtTrainingForwardBackward().forward_no_training(verbose=True)
    unittest.main(verbosity=2)
    unittest.main(verbosity=2, failfast=True)