3,144 changes: 1,572 additions & 1,572 deletions _doc/notebooks/onnx_fft.ipynb

Large diffs are not rendered by default.

1 change: 0 additions & 1 deletion _doc/sphinxdoc/source/api/sklearn.rst

This file was deleted.

85 changes: 83 additions & 2 deletions _unittests/ut_onnx_conv/test_lightgbm_tree_structure.py
@@ -4,12 +4,23 @@
import unittest
from logging import getLogger
import copy
import json
import base64
import lzma
import numpy
from pandas import DataFrame
from pyquickhelper.pycode import ExtTestCase

try:
    from pyquickhelper.pycode.unittest_cst import decompress_cst
except ImportError:
    decompress_cst = lambda d: json.loads(
        lzma.decompress(base64.b64decode(b"".join(d))))

from skl2onnx.common.data_types import FloatTensorType
from sklearn.datasets import load_iris
from mlprodict.onnx_conv.operator_converters.conv_lightgbm import modify_tree_for_rule_in_set
from mlprodict.onnx_conv.helpers.lgbm_helper import (
modify_tree_for_rule_in_set, restore_lgbm_info)
from mlprodict.onnx_conv.parsers.parse_lightgbm import MockWrappedLightGbmBoosterClassifier
from mlprodict.onnx_conv import register_converters, to_onnx
from mlprodict.onnxrt import OnnxInference
@@ -30,6 +41,30 @@ def count_nodes(tree, done=None):
return nb


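# Helper that strips a LightGBM dump down to its bare structure:
# split/leaf statistics are removed and leaf values zeroed out,
# presumably the preprocessing applied before compressing tree3 below.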
def clean_tree(tree):
    def walk_through(tree):
        if 'tree_structure' in tree:
            for w in walk_through(tree['tree_structure']):
                yield w
        yield tree
        if 'left_child' in tree:
            for w in walk_through(tree['left_child']):
                yield w
        if 'right_child' in tree:
            for w in walk_through(tree['right_child']):
                yield w

    nodes = list(walk_through(tree))
    for node in nodes:
        for k in ['split_gain', 'split_feature', 'split_index', 'leaf_count',
                  'internal_value', 'internal_weight', 'internal_count',
                  'leaf_weight']:
            if k in node:
                del node[k]
        for k in ['leaf_value']:
            if k in node:
                node[k] = 0


tree2 = {'average_output': False,
'feature_names': ['c1', 'c2', 'c3', 'c4'],
'label_index': 0,
@@ -140,6 +175,29 @@ def count_nodes(tree, done=None):
'version': 'v2'}


# This constant was built by applying the function
# pyquickhelper.pycode.unittest_cst.compress_cst.

tree3 = decompress_cst([
b'/Td6WFoAAATm1rRGAgAhARYAAAB0L+Wj4Ck9A2tdAD2IiodjqVNsvcJJI6C9h2Y0CbG5b7',
b'OaqsqxvLBzg7BltxogYoUzxj35qbUETbBAyJeMccezEDeIKOT1GB+I50txUuc8zkWDcp/n',
b'kx2YhORZxAyj55pXJF/xW5aySLknuTn/5cRfSL9AGF7dHdW9k8RqP5GONWx3YvvnP0tCW0',
b'lGKd5caxoNFaB5tg+je6f0s6N6QQo8wqrBPtjJ7bQf50vFrpYgkQNAEZIVutpzgE9c4o1L',
b'Uv/vJgnhQXOpk/4hOCV2q8VG+jD9oIjPINOOZ642k2QmsdWC+l3XagJnbN9dqT/4C9ehfM',
b'nf6Bw5XcRXD4rtmOyUq/ocuh1WfPinlKd/Jn0YOydq1FpH+VNSUjjPUGJbJal4Pa6jcx/Y',
b'9mcBjp9kP1pM5wkCJ52Kv12UQ/+2j+n0rUQbbqs10iFJo4h4KB/Ie/bugBLNItmNhNhDP4',
b'36Q6jCsLsXlu0gTiWZfGQapR+DJIsVKHh9GeagotXpHTwYX72KrTFwIdxgf9Y2X1EUqiJV',
b'wXdP7GprCs9QsIvCkqW59hPNStt2tyWtlSsXsnjU5e0Jn3USVHOcbwCBSpCtFlpg8tiS9m',
b'Zv1TIGj9cvEk1Ke9p6bZelvtXqHJRISJ8fCVjrqTnEjyUdPaG1wmqCyz7NFEkngrBinY7e',
b'ZMHmO1y6IhLI1zN0kq8zBHIQeqUruYgBatPI6jI585wQ6mYCobgQc7B6Ae6XlgOthATrr2',
b'oDdnIeAPeUKVMXPIq9NnwlwsyNEoTddI42NiMde8jVzVm4wwwnqrmbKlJsi5LJhRQlaEFX',
b'etzNn7llkCSwv88gYhcaDWP3Ewchse2iQDkJ0dPZhx0FB18X6wvEcwkt/H+dzTgAYOCSkr',
b'T3thNkPCvQ4keiRzHiWNzLc+NAhz5NX8BXsVQFkEyf4oUkKHjy053LBmXpHM75LBhdJmFH',
b'vqRENHF6QgiPLAjc/1NHatYLcY0VRetr55Bp2jWU+z75P2TrMkTHFnjbOEQ3p13USzVmnq',
b'3d0EUvp5Q5dUPDFAIhkH+oUkgK4lX2xlyEGh+23EqQtmkjOyKj7HPHoPZo2AjASlRTc78u',
b'1c9nWkTbwBGbZUsMmWzyjbDe/h2Yi2GvkSkIh8UKtYDlTzpT62G9Chf5N9HEfFjQWcdCEi',
b'7Y3Hx86ee03jpP42ssAADRqUIMvx3yYwABhwe+UgAA2u9V4LHEZ/sCAAAAAARZWg=='])


class TestLightGbmTreeStructure(ExtTestCase):

    def setUp(self):
@@ -223,7 +281,6 @@ def test_onnxrt_python_lightgbm_categorical2(self):
self.assertEqual(nb2, 18)

def test_mock_lightgbm(self):

tree = copy.deepcopy(tree2)
nb1 = sum(count_nodes(t['tree_structure']) for t in tree['tree_info'])
model = MockWrappedLightGbmBoosterClassifier(tree)
@@ -258,6 +315,30 @@ def test_mock_lightgbm(self):
prob = DataFrame(pred["output_probability"]).values
self.assertEqual(prob.shape, (row, 2))

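    # modify_tree_for_rule_in_set applied to two fresh copies of tree3,
    # each time with the node list precomputed by restore_lgbm_info,
    # must yield identical trees: the rewrite is deterministic.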
    def test_mock_lightgbm_info(self):
        tree = copy.deepcopy(tree3)
        info = restore_lgbm_info(tree)
        modify_tree_for_rule_in_set(tree, info=info)
        expected = tree
        tree = copy.deepcopy(tree3)
        info = restore_lgbm_info(tree)
        modify_tree_for_rule_in_set(tree, info=info)
        self.assertEqual(expected, tree)

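    # Profiles a hundred runs of modify_tree_for_rule_in_set with the
    # precomputed info; self.profile comes from ExtTestCase and its text
    # report is expected to contain the 'cumtime' column.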
    def test_mock_lightgbm_profile(self):
        tree = copy.deepcopy(tree3)
        info = restore_lgbm_info(tree)
        self.assertIsInstance(info, list)
        self.assertGreater(len(info), 1)

        def g():
            for _ in range(0, 100):
                modify_tree_for_rule_in_set(tree, info=info)

        p2 = self.profile(g)[1]
        self.assertIn('cumtime', p2)
        if __name__ == "__main__":
            print(p2)


if __name__ == "__main__":
unittest.main()
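For reference, the compressed tree3 constant in this test is exactly the kind of fixture the decompress_cst fallback above expects: a JSON dump of the booster, lzma-compressed, base64-encoded and split into byte chunks. Below is a minimal round-trip sketch using only the standard library; the helper names are hypothetical and the real pyquickhelper.pycode.unittest_cst.compress_cst may format its output differently.

import base64
import json
import lzma


def compress_cst_sketch(obj, width=74):
    # Serialize to JSON, compress with lzma, base64-encode, then cut the
    # result into fixed-width byte chunks that can be pasted as a constant.
    enc = base64.b64encode(lzma.compress(json.dumps(obj).encode('utf-8')))
    return [enc[i:i + width] for i in range(0, len(enc), width)]


def decompress_cst_sketch(chunks):
    # Mirror of the fallback defined in the test: join the chunks,
    # base64-decode, decompress and parse the JSON payload.
    return json.loads(lzma.decompress(base64.b64decode(b"".join(chunks))))


sample = {'average_output': False, 'tree_info': []}
assert decompress_cst_sketch(compress_cst_sketch(sample)) == sample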
115 changes: 113 additions & 2 deletions _unittests/ut_onnx_conv/test_onnxrt_runtime_lightgbm.py
@@ -1,5 +1,5 @@
"""
@brief test log(time=3s)
@brief test log(time=6s)
"""
import sys
import unittest
@@ -200,6 +200,116 @@ def test_onnxrt_python_lightgbm_categorical_iris(self):
values = pandas.DataFrame(got['output_probability']).values
self.assertEqualArray(exp, values[:, 1], decimal=5)

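    # An LGBMClassifier and a Booster trained with
    # categorical_feature=['c1', 'c2'] on integer features are both
    # converted to ONNX and their probabilities compared with LightGBM's.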
    @skipif_circleci('stuck')
    @unittest.skipIf(sys.platform == 'darwin', 'stuck')
    @ignore_warnings((RuntimeWarning, UserWarning))
    def test_onnxrt_python_lightgbm_categorical_iris_booster3(self):
        from lightgbm import LGBMClassifier, Dataset, train as lgb_train

        iris = load_iris()
        X, y = iris.data, iris.target
        X = (X * 10).astype(numpy.int32)
        X_train, X_test, y_train, _ = train_test_split(
            X, y, random_state=11)
        other_x = numpy.random.randint(
            0, high=10, size=(1500, X_train.shape[1]))
        X_train = numpy.vstack([X_train, other_x]).astype(dtype=numpy.int32)
        y_train = numpy.hstack(
            [y_train, numpy.zeros(500) + 3, numpy.zeros(500) + 4,
             numpy.zeros(500) + 5]).astype(dtype=numpy.int32)
        self.assertEqual(y_train.shape, (X_train.shape[0], ))

        # Classic
        gbm = LGBMClassifier()
        gbm.fit(X_train, y_train)
        exp = gbm.predict_proba(X_test)
        onx = to_onnx(gbm, initial_types=[
            ('X', Int64TensorType([None, X_train.shape[1]]))])
        self.assertIn('ZipMap', str(onx))
        oif = OnnxInference(onx)
        got = oif.run({'X': X_test})
        values = pandas.DataFrame(got['output_probability']).values
        self.assertEqualArray(exp, values, decimal=5)

        # categorical_feature=[0, 1]
        train_data = Dataset(
            X_train, label=y_train,
            feature_name=['c1', 'c2', 'c3', 'c4'],
            categorical_feature=['c1', 'c2'])

        params = {
            "boosting_type": "gbdt",
            "learning_rate": 0.05,
            "n_estimators": 2,
            "objective": "binary",
            "max_bin": 5,
            "min_child_samples": 100,
            'verbose': -1,
        }

        booster = lgb_train(params, train_data)
        exp = booster.predict(X_test)

        onx = to_onnx(booster, initial_types=[
            ('X', Int64TensorType([None, X_train.shape[1]]))])
        self.assertIn('ZipMap', str(onx))
        oif = OnnxInference(onx)
        got = oif.run({'X': X_test})
        values = pandas.DataFrame(got['output_probability']).values
        self.assertEqualArray(exp, values[:, 1], decimal=5)

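    # Same scenario with float32 features and a multiclass objective;
    # the raw Booster (gbm.booster_) is converted directly instead of
    # the scikit-learn wrapper.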
    @skipif_circleci('stuck')
    @unittest.skipIf(sys.platform == 'darwin', 'stuck')
    @ignore_warnings((RuntimeWarning, UserWarning))
    def test_onnxrt_python_lightgbm_categorical_iris_booster3_real(self):
        from lightgbm import LGBMClassifier, Dataset, train as lgb_train

        iris = load_iris()
        X, y = iris.data, iris.target
        X = (X * 10).astype(numpy.float32)
        X_train, X_test, y_train, _ = train_test_split(
            X, y, random_state=11)

        # Classic
        gbm = LGBMClassifier()
        gbm.fit(X_train, y_train)
        exp = gbm.predict_proba(X_test)
        onx = to_onnx(gbm.booster_, initial_types=[
            ('X', FloatTensorType([None, X_train.shape[1]]))])
        self.assertIn('ZipMap', str(onx))
        oif = OnnxInference(onx)
        got = oif.run({'X': X_test})
        values = pandas.DataFrame(got['output_probability']).values
        self.assertEqualArray(exp, values, decimal=5)

        # categorical_feature=[0, 1]
        train_data = Dataset(
            X_train, label=y_train,
            feature_name=['c1', 'c2', 'c3', 'c4'],
            categorical_feature=['c1', 'c2'])

        params = {
            "boosting_type": "gbdt",
            "learning_rate": 0.05,
            "n_estimators": 2,
            "objective": "multiclass",
            "max_bin": 5,
            "min_child_samples": 100,
            'verbose': -1,
            'num_class': 3,
        }

        booster = lgb_train(params, train_data)
        exp = booster.predict(X_test)

        onx = to_onnx(booster, initial_types=[
            ('X', FloatTensorType([None, X_train.shape[1]]))])
        self.assertIn('ZipMap', str(onx))
        oif = OnnxInference(onx)
        got = oif.run({'X': X_test})
        values = pandas.DataFrame(got['output_probability']).values
        self.assertEqualArray(exp, values, decimal=5)

@skipif_circleci('stuck')
@unittest.skipIf(sys.platform == 'darwin', 'stuck')
@ignore_warnings((RuntimeWarning, UserWarning))
@@ -286,9 +396,10 @@ def test_lightgbm_booster_classifier(self):
'subsample_freq': 1, 'bagging_fraction': 0.5,
'feature_fraction': 0.5},
data)
model_onnx = to_onnx(model, X)
model_onnx = to_onnx(model, X, verbose=2, rewrite_ops=True)
self.assertNotEmpty(model_onnx)


if __name__ == "__main__":
# TestOnnxrtRuntimeLightGbm().test_lightgbm_booster_classifier()
unittest.main()