diff --git a/.circleci/config.yml b/.circleci/config.yml index 5c9b0d87..c0af75d8 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -2,7 +2,7 @@ version: 2 jobs: build: docker: - - image: circleci/python:3.8.7 + - image: circleci/python:3.9.5 working_directory: ~/repo @@ -39,12 +39,6 @@ jobs: . venv/bin/activate python setup.py build_ext --inplace - - run: - name: run tests - command: | - . venv/bin/activate - coverage run --omit=tests/test_*.py -m unittest discover tests -v - - run: name: flake8 command: | @@ -53,6 +47,12 @@ jobs: python -m flake8 onnxcustom python -m flake8 examples + - run: + name: run tests + command: | + . venv/bin/activate + coverage run --omit=tests/test_*.py -m unittest discover tests -v + - run: name: coverage command: | diff --git a/.gitignore b/.gitignore index e3ed7ec6..aa155f86 100644 --- a/.gitignore +++ b/.gitignore @@ -13,3 +13,5 @@ examples/squeezenet1.1-7.onnx tests/pipeline*.onnx temp_* examples/pipeline_lightgbm.onnx +examples/model.onnx +tests/model.onnx diff --git a/README.rst b/README.rst index 369d5986..6e9e8409 100644 --- a/README.rst +++ b/README.rst @@ -2,8 +2,8 @@ .. image:: https://circleci.com/gh/sdpython/onnxcustom/tree/master.svg?style=svg :target: https://circleci.com/gh/sdpython/onnxcustom/tree/master -.. image:: https://travis-ci.org/sdpython/onnxcustom.svg?branch=master - :target: https://travis-ci.org/sdpython/onnxcustom +.. image:: https://travis-ci.com/sdpython/onnxcustom.svg?branch=master + :target: https://travis-ci.com/sdpython/onnxcustom :alt: Build status .. image:: https://ci.appveyor.com/api/projects/status/a3sn45a2fayoxb5q?svg=true diff --git a/doc/index.rst b/doc/index.rst index 08adc8a2..09c8d45f 100644 --- a/doc/index.rst +++ b/doc/index.rst @@ -5,8 +5,8 @@ onnxcustom: deploy machine learned models .. image:: https://circleci.com/gh/sdpython/onnxcustom/tree/master.svg?style=svg :target: https://circleci.com/gh/sdpython/onnxcustom/tree/master -.. 
image:: https://travis-ci.org/sdpython/onnxcustom.svg?branch=master - :target: https://travis-ci.org/sdpython/onnxcustom +.. image:: https://travis-ci.com/sdpython/onnxcustom.svg?branch=master + :target: https://travis-ci.com/sdpython/onnxcustom :alt: Build status .. image:: https://ci.appveyor.com/api/projects/status/a3sn45a2fayoxb5q?svg=true diff --git a/doc/tutorial.rst b/doc/tutorial.rst index 9f30cba5..34867134 100644 --- a/doc/tutorial.rst +++ b/doc/tutorial.rst @@ -16,6 +16,7 @@ involving operator not actually implemented in tutorial_1-5_external tutorial_2_new_converter tutorial_3_new_operator + tutorial_4_complex The tutorial was tested with following version: diff --git a/doc/tutorial_4_complex.rst b/doc/tutorial_4_complex.rst new file mode 100644 index 00000000..4d20ed74 --- /dev/null +++ b/doc/tutorial_4_complex.rst @@ -0,0 +1,10 @@ + +Complex Scenarios +================= + +Discrepancies may happen. Let's see some unexpected cases. + +.. toctree:: + :maxdepth: 1 + + auto_examples/plot_usparse_xgboost diff --git a/examples/plot_cbegin_opset.py b/examples/plot_cbegin_opset.py index 7bc0d82b..0fa3bdcd 100644 --- a/examples/plot_cbegin_opset.py +++ b/examples/plot_cbegin_opset.py @@ -43,7 +43,7 @@ X, y = make_blobs(n_samples=100, n_features=2) -model = IsolationForest(3) +model = IsolationForest(n_estimators=3) model.fit(X) labels = model.predict(X) diff --git a/examples/plot_usparse_xgboost.py b/examples/plot_usparse_xgboost.py new file mode 100644 index 00000000..6b1860e4 --- /dev/null +++ b/examples/plot_usparse_xgboost.py @@ -0,0 +1,301 @@ +""" +.. _example-sparse-tfidf: + +TfIdf and sparse matrices +========================= + +.. index:: xgboost, lightgbm, sparse, ensemble + +`TfidfVectorizer `_ +usually creates sparse data. If the data is sparse enough, matrices +usually stay sparse all along the pipeline until the predictor +is trained. Sparse matrices do not consider null and missing values +as they are not present in the datasets.
Because some predictors +make the difference, this ambiguity may introduce discrepancies +when converted into ONNX. This example looks into several configurations. + +.. contents:: + :local: + +Imports, setups ++++++++++++++++ + +All imports. It also registers onnx converters for :epkg:`xgboost` +and :epkg:`lightgbm`. +""" +import warnings +import numpy +import pandas +from tqdm import tqdm +from sklearn.compose import ColumnTransformer +from sklearn.datasets import load_iris +from sklearn.pipeline import Pipeline +from sklearn.preprocessing import StandardScaler +from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer +from sklearn.experimental import ( # noqa + enable_hist_gradient_boosting) # noqa +from sklearn.ensemble import ( + RandomForestClassifier, HistGradientBoostingClassifier) +from xgboost import XGBClassifier +from lightgbm import LGBMClassifier +from skl2onnx.common.data_types import FloatTensorType, StringTensorType +from skl2onnx import to_onnx, update_registered_converter +from skl2onnx.sklapi import CastTransformer, ReplaceTransformer +from skl2onnx.common.shape_calculator import ( + calculate_linear_classifier_output_shapes) +from onnxmltools.convert.xgboost.operator_converters.XGBoost import ( + convert_xgboost) +from onnxmltools.convert.lightgbm.operator_converters.LightGbm import ( + convert_lightgbm) +from mlprodict.onnxrt import OnnxInference + + +update_registered_converter( + XGBClassifier, 'XGBoostXGBClassifier', + calculate_linear_classifier_output_shapes, convert_xgboost, + options={'nocl': [True, False], 'zipmap': [True, False, 'columns']}) +update_registered_converter( + LGBMClassifier, 'LightGbmLGBMClassifier', + calculate_linear_classifier_output_shapes, convert_lightgbm, + options={'nocl': [True, False], 'zipmap': [True, False]}) + + +########################################## +# Artificial datasets +# +++++++++++++++++++++++++++ +# +# Iris + a text column.
+ +cst = ['class zero', 'class one', 'class two'] + +data = load_iris() +X = data.data[:, :2] +y = data.target + +df = pandas.DataFrame(X) +df["text"] = [cst[i] for i in y] + + +ind = numpy.arange(X.shape[0]) +numpy.random.shuffle(ind) +X = X[ind, :].copy() +y = y[ind].copy() + + +########################################## +# Train ensemble after sparse +# +++++++++++++++++++++++++++ +# +# The example uses the Iris dataset with artificial text data +# preprocessed with a tf-idf. `sparse_threshold=1.` prevents +# sparse matrices from being converted into dense matrices. + + +def make_pipelines(df_train, y_train, models=None, + sparse_threshold=1., replace_nan=False, + insert_replace=False, verbose=False): + + if models is None: + models = [ + RandomForestClassifier, HistGradientBoostingClassifier, + XGBClassifier, LGBMClassifier] + + pipes = [] + for model in tqdm(models): + + if model == HistGradientBoostingClassifier: + kwargs = dict(max_iter=5) + elif model == XGBClassifier: + kwargs = dict(n_estimators=5, use_label_encoder=False) + else: + kwargs = dict(n_estimators=5) + + if insert_replace: + pipe = Pipeline([ + ('union', ColumnTransformer([ + ('scale1', StandardScaler(), [0, 1]), + ('subject', + Pipeline([ + ('count', CountVectorizer()), + ('tfidf', TfidfTransformer()), + ('repl', ReplaceTransformer()), + ]), "text"), + ], sparse_threshold=sparse_threshold)), + ('cast', CastTransformer()), + ('cls', model(max_depth=3, **kwargs)), + ]) + else: + pipe = Pipeline([ + ('union', ColumnTransformer([ + ('scale1', StandardScaler(), [0, 1]), + ('subject', + Pipeline([ + ('count', CountVectorizer()), + ('tfidf', TfidfTransformer()) + ]), "text"), + ], sparse_threshold=sparse_threshold)), + ('cast', CastTransformer()), + ('cls', model(max_depth=3, **kwargs)), + ]) + + try: + pipe.fit(df_train, y_train) + except TypeError as e: + obs = dict(model=model.__name__, pipe=pipe, error=e) + pipes.append(obs) + continue + + options = {model: {'zipmap': False}} + if replace_nan: +
options[TfidfTransformer] = {'nan': True} + + # convert + with warnings.catch_warnings(record=False): + warnings.simplefilter("ignore", (FutureWarning, UserWarning)) + model_onnx = to_onnx( + pipe, + initial_types=[('input', FloatTensorType([None, 2])), + ('text', StringTensorType([None, 1]))], + target_opset=12, options=options) + + with open('model.onnx', 'wb') as f: + f.write(model_onnx.SerializeToString()) + + oinf = OnnxInference(model_onnx) + inputs = {"input": df[[0, 1]].values.astype(numpy.float32), + "text": df[["text"]].values} + pred_onx = oinf.run(inputs) + + diff = numpy.abs( + pred_onx['probabilities'].ravel() - + pipe.predict_proba(df).ravel()).sum() + + if verbose: + def td(a): + if hasattr(a, 'todense'): + b = a.todense() + ind = set(a.indices) + for i in range(b.shape[1]): + if i not in ind: + b[0, i] = numpy.nan + return b + return a + + oinf = OnnxInference(model_onnx) + pred_onx2 = oinf.run(inputs) + diff2 = numpy.abs( + pred_onx2['probabilities'].ravel() - + pipe.predict_proba(df).ravel()).sum() + + if diff > 0.1: + for i, (l1, l2) in enumerate( + zip(pipe.predict_proba(df), + pred_onx['probabilities'])): + d = numpy.abs(l1 - l2).sum() + if verbose and d > 0.1: + print("\nDISCREPENCY DETAILS") + print(d, i, l1, l2) + pre = pipe.steps[0][-1].transform(df) + print("idf", pre[i].dtype, td(pre[i])) + pre2 = pipe.steps[1][-1].transform(pre) + print("cas", pre2[i].dtype, td(pre2[i])) + inter = oinf.run(inputs, intermediate=True) + onx = inter['tfidftr_norm'] + print("onx", onx.dtype, onx[i]) + onx = inter['variable3'] + + obs = dict(model=model.__name__, + discrepencies=diff, + model_onnx=model_onnx, pipe=pipe) + if verbose: + obs['discrepency2'] = diff2 + pipes.append(obs) + + return pipes + + +data_sparse = make_pipelines(df, y) +stat = pandas.DataFrame(data_sparse).drop(['model_onnx', 'pipe'], axis=1) +if 'error' in stat.columns: + print(stat.drop('error', axis=1)) +stat + +############################ +# Sparse data hurts. 
+# +# Dense data +# ++++++++++ +# +# Let's replace sparse data with dense by using `sparse_threshold=0.` + + +data_dense = make_pipelines(df, y, sparse_threshold=0.) +stat = pandas.DataFrame(data_dense).drop(['model_onnx', 'pipe'], axis=1) +if 'error' in stat.columns: + print(stat.drop('error', axis=1)) +stat + +#################################### +# This is much better. Let's compare how the preprocessing +# applies on the data. + +print("sparse") +print(data_sparse[-1]['pipe'].steps[0][-1].transform(df)[:2]) +print() +print("dense") +print(data_dense[-1]['pipe'].steps[0][-1].transform(df)[:2]) + +#################################### +# This shows `RandomForestClassifier +# `_, +# `XGBClassifier `_ do not process +# sparse and dense matrices +# the same way, as opposed to `LGBMClassifier +# `_. +# And `HistGradientBoostingClassifier +# `_ +# fails. +# +# Dense data with nan +# +++++++++++++++++++ +# +# Let's keep sparse data in the scikit-learn pipeline but +# replace null values by nan in the onnx graph. + +data_dense = make_pipelines(df, y, sparse_threshold=1., replace_nan=True) +stat = pandas.DataFrame(data_dense).drop(['model_onnx', 'pipe'], axis=1) +if 'error' in stat.columns: + print(stat.drop('error', axis=1)) +stat + + +############################## +# Dense, 0 replaced by nan +# ++++++++++++++++++++++++ +# +# Instead of using a specific option to replace null values +# with nan values, a custom transformer called +# ReplaceTransformer is explicitly inserted into the pipeline. +# A new converter is added to the list of supported models. +# It is equivalent to the previous option except it is +# more explicit.
+ +data_dense = make_pipelines(df, y, sparse_threshold=1., replace_nan=False, + insert_replace=True) +stat = pandas.DataFrame(data_dense).drop(['model_onnx', 'pipe'], axis=1) +if 'error' in stat.columns: + print(stat.drop('error', axis=1)) +stat + +###################################### +# Conclusion +# ++++++++++ +# +# Unless dense arrays are used, because :epkg:`onnxruntime` +# ONNX does not support sparse yet, the conversion needs to be +# tuned depending on the model which follows the TfIdf preprocessing. diff --git a/requirements-dev.txt b/requirements-dev.txt index de1ab1a6..f60e2f44 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -6,11 +6,10 @@ lightgbm loky matplotlib mlinsights -mlprodict>=0.5 +mlprodict>=0.6 nbsphinx -onnx>=1.8.0 -git+https://github.com/xadupre/onnxconverter-common.git@jenkins -onnxruntime>=1.6.0 +onnxconverter-common +onnxruntime>=1.8.0 pillow py-spy pandas @@ -19,7 +18,7 @@ pyquickhelper>=1.10 pytest pytest-cov scikit-learn>=0.24 -git+https://github.com/xadupre/sklearn-onnx.git@jenkins +skl2onnx>=1.9.0 sphinx sphinxcontrib-blockdiag sphinx-gallery diff --git a/tests/test_documentation_examples.py b/tests/test_documentation_examples1.py similarity index 97% rename from tests/test_documentation_examples.py rename to tests/test_documentation_examples1.py index e6a2c9ce..8b76954e 100644 --- a/tests/test_documentation_examples.py +++ b/tests/test_documentation_examples1.py @@ -25,9 +25,9 @@ def import_source(module_file_path, module_name): return module_spec.loader.exec_module(module) -class TestDocumentationExample(unittest.TestCase): +class TestDocumentationExample1(unittest.TestCase): - def test_documentation_examples(self): + def test_documentation_examples1(self): this = os.path.abspath(os.path.dirname(__file__)) onxc = os.path.normpath(os.path.join(this, '..')) @@ -40,6 +40,8 @@ def test_documentation_examples(self): found = os.listdir(fold) tested = 0 for name in sorted(found): + if name >= "plot_u": + break if '-v' in 
sys.argv: if name.endswith('plot_bbegin_measure_time.py'): diff --git a/tests/test_documentation_examples2.py b/tests/test_documentation_examples2.py new file mode 100644 index 00000000..4fe88bf6 --- /dev/null +++ b/tests/test_documentation_examples2.py @@ -0,0 +1,120 @@ +""" +Tests examples from the documentation. +""" +import unittest +from distutils.version import StrictVersion +import os +import sys +import importlib +import subprocess +from datetime import datetime +import onnxruntime +from pyquickhelper.pycode import skipif_circleci + + +def import_source(module_file_path, module_name): + if not os.path.exists(module_file_path): + raise FileNotFoundError(module_file_path) + module_spec = importlib.util.spec_from_file_location( + module_name, module_file_path) + if module_spec is None: + raise FileNotFoundError( + "Unable to find '{}' in '{}', cwd='{}'.".format( + module_name, module_file_path, + os.path.abspath(__file__))) + module = importlib.util.module_from_spec(module_spec) + return module_spec.loader.exec_module(module) + + +class TestDocumentationExample2(unittest.TestCase): + + @skipif_circleci('too long') + def test_documentation_examples2(self): + + this = os.path.abspath(os.path.dirname(__file__)) + onxc = os.path.normpath(os.path.join(this, '..')) + pypath = os.environ.get('PYTHONPATH', None) + sep = ";" if sys.platform == 'win32' else ':' + pypath = "" if pypath in (None, "") else (pypath + sep) + pypath += onxc + os.environ['PYTHONPATH'] = pypath + fold = os.path.normpath(os.path.join(this, '..', 'examples')) + found = os.listdir(fold) + tested = 0 + for name in sorted(found): + if name < "plot_u": + continue + + if '-v' in sys.argv: + if name.endswith('plot_bbegin_measure_time.py'): + if __name__ == "__main__": + print("%s: skip %r" % ( + datetime.now().strftime("%d-%m-%y %H:%M:%S"), + name)) + continue + + with self.subTest(name=name): + if name.startswith("plot_") and name.endswith(".py"): + if (name == "plot_pipeline_lightgbm.py" and + 
StrictVersion(onnxruntime.__version__) < + StrictVersion('1.0.0')): + continue + if __name__ == "__main__" or "-v" in sys.argv: + print("%s: run %r" % ( + datetime.now().strftime("%d-%m-%y %H:%M:%S"), + name)) + sys.path.insert(0, fold) + try: + mod = import_source(fold, os.path.splitext(name)[0]) + assert mod is not None + except FileNotFoundError: + # try another way + cmds = [sys.executable, "-u", + os.path.join(fold, name)] + p = subprocess.Popen( + cmds, stdout=subprocess.PIPE, + stderr=subprocess.PIPE) + res = p.communicate() + out, err = res + st = err.decode('ascii', errors='ignore') + if len(st) > 0 and 'Traceback' in st: + if "No such file or directory: 'dot': 'dot'" in st: + # dot not installed, this part + # is tested in onnx framework + pass + elif '"dot" not found in path.' in st: + # dot not installed, this part + # is tested in onnx framework + pass + elif "No module named 'xgboost'" in st: + # xgboost not installed on CI + pass + elif ("cannot import name 'LightGbmModelContainer'" + " from 'onnxmltools.convert.common." + "_container'") in st: + # onnxmltools not recent enough + pass + elif ('Please fix either the inputs or ' + 'the model.') in st: + # onnxruntime datasets changed in master + # branch, still the same in released + # version on pypi + pass + elif 'dot: graph is too large' in st: + # graph is too big + pass + else: + raise RuntimeError( + "Example '{}' (cmd: {} - exec_prefix=" + "'{}') failed due to\n{}" + "".format(name, cmds, sys.exec_prefix, st)) + finally: + if sys.path[0] == fold: + del sys.path[0] + tested += 1 + if tested == 0: + raise RuntimeError("No example was tested.") + + +if __name__ == "__main__": + unittest.main()