From 0b5c886be7dbf5683a52606e149cfc2d93d2d33c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?xavier=20dupr=C3=A9?= Date: Wed, 30 Jun 2021 12:44:58 +0200 Subject: [PATCH 1/3] bool --> bool_, more type verifications --- _doc/examples/plot_op_where.py | 2 +- .../ut__skl2onnx/test_sklearn_pipeline.py | 4 +- _unittests/ut_npy/test_onnx_variable.py | 4 +- _unittests/ut_npy/test_onnx_variable_ort.py | 4 +- _unittests/ut_npy/test_onnx_variable_tuple.py | 2 +- _unittests/ut_npy/test_wrappers.py | 2 +- .../ut_onnxrt/test_onnxrt_python_runtime_.py | 2 +- .../test_onnxrt_python_runtime_ml.py | 40 ++++++++++++++++++- .../ut_onnxrt/test_onnxrt_validate_type.py | 2 +- mlprodict/grammar_sklearn/grammar/gtypes.py | 2 +- mlprodict/npy/onnx_numpy_annotation.py | 4 +- mlprodict/npy/onnx_variable.py | 2 +- mlprodict/onnx_conv/convert.py | 4 +- mlprodict/onnx_tools/onnx2py_helper.py | 4 +- mlprodict/onnxrt/ops_cpu/_op.py | 32 ++++++++++++--- .../onnxrt/ops_cpu/_op_classifier_string.py | 2 +- mlprodict/onnxrt/ops_cpu/_op_helper.py | 8 ++-- mlprodict/onnxrt/ops_cpu/op_cast.py | 4 +- mlprodict/onnxrt/ops_cpu/op_constant.py | 15 +++++++ .../onnxrt/ops_cpu/op_constant_of_shape.py | 2 +- .../onnxrt/ops_cpu/op_dict_vectorizer.py | 2 +- mlprodict/onnxrt/ops_cpu/op_label_encoder.py | 4 +- .../onnxrt/ops_cpu/op_one_hot_encoder.py | 2 +- mlprodict/onnxrt/ops_cpu/op_zipmap.py | 4 ++ mlprodict/onnxrt/shape_object.py | 6 +-- .../test_utils/utils_backend_common.py | 6 +-- 26 files changed, 122 insertions(+), 43 deletions(-) diff --git a/_doc/examples/plot_op_where.py b/_doc/examples/plot_op_where.py index f799285f4..43ceaca9a 100644 --- a/_doc/examples/plot_op_where.py +++ b/_doc/examples/plot_op_where.py @@ -92,7 +92,7 @@ def benchmark_equation(): repeat = 5 number = 10 - conds = [(numpy.random.rand(dim, dim) < 0.5).astype(numpy.bool) + conds = [(numpy.random.rand(dim, dim) < 0.5).astype(numpy.bool_) for _ in range(repeat)] xs = [numpy.random.rand(dim, dim).astype(numpy.float32) for _ in range(repeat)] diff --git a/_unittests/ut__skl2onnx/test_sklearn_pipeline.py b/_unittests/ut__skl2onnx/test_sklearn_pipeline.py index a37d26e70..140e36ea9 100644 --- a/_unittests/ut__skl2onnx/test_sklearn_pipeline.py +++ b/_unittests/ut__skl2onnx/test_sklearn_pipeline.py @@ -298,9 +298,9 @@ def convert_dataframe_schema(df, drop=None): data_types = { 'pclass': numpy.int64, 'age': numpy.float32, - 'sex': numpy.str, + 'sex': numpy.str_, 'fare': numpy.float32, - 'embarked': numpy.str, + 'embarked': numpy.str_, } inputs = {k: data[k].values.astype(data_types[k]).reshape(-1, 1) for k in data.columns} diff --git a/_unittests/ut_npy/test_onnx_variable.py b/_unittests/ut_npy/test_onnx_variable.py index 7b9a5ae1a..b69118ad5 100644 --- a/_unittests/ut_npy/test_onnx_variable.py +++ b/_unittests/ut_npy/test_onnx_variable.py @@ -15,7 +15,7 @@ @ignore_warnings(DeprecationWarning) def get_bool(unused): try: - return numpy.bool + return numpy.bool_ except AttributeError: return bool @@ -258,7 +258,7 @@ def test_abs_neg(x: NDArray[Any, numpy.float32], @onnxnumpy_default def test_abs_not(x: NDArray[Any, numpy.float32], - ) -> NDArray[Any, numpy.bool]: + ) -> NDArray[Any, numpy.bool_]: "onnx numpy not" temp = nxnp.abs(x) > numpy.float32(0) return temp.not_() diff --git a/_unittests/ut_npy/test_onnx_variable_ort.py b/_unittests/ut_npy/test_onnx_variable_ort.py index 2ad277822..abead9145 100644 --- a/_unittests/ut_npy/test_onnx_variable_ort.py +++ b/_unittests/ut_npy/test_onnx_variable_ort.py @@ -16,7 +16,7 @@ @ignore_warnings(DeprecationWarning) def get_bool(unused): try: - return numpy.bool + return numpy.bool_ except AttributeError: return bool @@ -245,7 +245,7 @@ def test_abs_neg(x: NDArray[Any, numpy.float32], @onnxnumpy(runtime='onnxruntime1') def test_abs_not(x: NDArray[Any, numpy.float32], - ) -> NDArray[Any, numpy.bool]: + ) -> NDArray[Any, numpy.bool_]: "onnx numpy not" temp = nxnp.abs(x) > numpy.float32(0) return temp.not_() diff --git a/_unittests/ut_npy/test_onnx_variable_tuple.py b/_unittests/ut_npy/test_onnx_variable_tuple.py index 4d36c438c..f2e0bd426 100644 --- a/_unittests/ut_npy/test_onnx_variable_tuple.py +++ b/_unittests/ut_npy/test_onnx_variable_tuple.py @@ -15,7 +15,7 @@ @ignore_warnings(DeprecationWarning) def get_bool(unused): try: - return numpy.bool + return numpy.bool_ except AttributeError: return bool diff --git a/_unittests/ut_npy/test_wrappers.py b/_unittests/ut_npy/test_wrappers.py index 03572c245..76bebbeb7 100644 --- a/_unittests/ut_npy/test_wrappers.py +++ b/_unittests/ut_npy/test_wrappers.py @@ -138,7 +138,7 @@ def test_signature(self): # sig, args, kwargs, version f32 = numpy.float32 i64 = numpy.int64 - bbb = numpy.bool + bbb = numpy.bool_ sigs = [ # 0 (NDArraySameTypeSameShape("all"), ['X'], {}, diff --git a/_unittests/ut_onnxrt/test_onnxrt_python_runtime_.py b/_unittests/ut_onnxrt/test_onnxrt_python_runtime_.py index c15ed95fd..c7032ee9d 100644 --- a/_unittests/ut_onnxrt/test_onnxrt_python_runtime_.py +++ b/_unittests/ut_onnxrt/test_onnxrt_python_runtime_.py @@ -3916,5 +3916,5 @@ def test_op_constant(self): if __name__ == "__main__": # Working - # TestOnnxrtPythonRuntime().test_onnxt_runtime_concat() + # TestOnnxrtPythonRuntime().test_onnxt_runtime_and() unittest.main() diff --git a/_unittests/ut_onnxrt/test_onnxrt_python_runtime_ml.py b/_unittests/ut_onnxrt/test_onnxrt_python_runtime_ml.py index a50d386a2..6bbf79514 100644 --- a/_unittests/ut_onnxrt/test_onnxrt_python_runtime_ml.py +++ b/_unittests/ut_onnxrt/test_onnxrt_python_runtime_ml.py @@ -13,7 +13,7 @@ from sklearn.model_selection import train_test_split from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier from sklearn.preprocessing import StandardScaler, Binarizer -from pyquickhelper.pycode import ExtTestCase +from pyquickhelper.pycode import ExtTestCase, ignore_warnings from skl2onnx import convert_sklearn from skl2onnx.common.data_types import ( FloatTensorType, StringTensorType, DictionaryType) @@ -28,6 +28,30 @@ def setUp(self): logger = getLogger('skl2onnx') logger.disabled = True + def common_expected_shapes_types(self, oinf, got, model_def, + raise_shape=False): + expected_types = oinf.infer_types() + self.assertEqual(set(got) & set(expected_types), set(got)) + for k, v in got.items(): + if expected_types[k] in (str, numpy.str_): + # Type mismatch: dtype(' + continue + if v.dtype != expected_types[k]: + raise AssertionError( + "Type mismatch: %r != %r\nexpected_types=%r\ngot=%r" + "\n----\n%r" % ( + v.dtype, expected_types[k], expected_types, got, + model_def)) + + try: + expected_shapes = oinf.infer_shapes() + self.assertEqual(set(got) & set(expected_shapes), set(got)) + except RuntimeError as e: + if raise_shape: + raise e + warnings.warn("infer_shapes fails.") + + @ignore_warnings(DeprecationWarning) def test_onnxrt_python_KMeans(self): iris = load_iris() X, y = iris.data, iris.target @@ -39,11 +63,13 @@ def test_onnxrt_python_KMeans(self): oinf = OnnxInference(model_def) got = oinf.run({'X': X_test.astype(numpy.float32)}) self.assertEqual(list(sorted(got)), ['label', 'scores']) + self.common_expected_shapes_types(oinf, got, model_def) exp = clr.predict(X_test) self.assertEqualArray(exp, got['label']) exp = clr.transform(X_test) self.assertEqualArray(exp, got['scores'], decimal=4) + @ignore_warnings(DeprecationWarning) def test_onnxrt_python_KMeans_verbose(self): iris = load_iris() X, y = iris.data, iris.target @@ -67,6 +93,7 @@ def myprint(*args, **kwargs): self.assertEqualArray(exp, got['scores'], decimal=4) self.assertGreater(len(rows), 2) + @ignore_warnings(DeprecationWarning) def test_onnxrt_python_KNeighborsClassifier(self): iris = load_iris() X, y = iris.data, iris.target @@ -87,6 +114,7 @@ def test_onnxrt_python_KNeighborsClassifier(self): got = pandas.DataFrame(list(y['output_probability'])).values self.assertEqualArray(exp, got, decimal=5) + @ignore_warnings(DeprecationWarning) def test_onnxrt_python_KNeighborsRegressor_simple_k1(self): X = numpy.array([[0, 1], [0.2, 1.2], [1, 2], [ 1.2, 2.2]], dtype=numpy.float32) @@ -108,6 +136,7 @@ def test_onnxrt_python_KNeighborsRegressor_simple_k1(self): self.assertEqualArray( exp.ravel(), y['variable'].ravel(), decimal=6) + @ignore_warnings(DeprecationWarning) def test_onnxrt_python_KNeighborsRegressor_simple_k2(self): X = numpy.array([[0, 1], [0.2, 1.2], [1, 2], [ 1.2, 2.2]], dtype=numpy.float32) @@ -129,6 +158,7 @@ def test_onnxrt_python_KNeighborsRegressor_simple_k2(self): self.assertEqualArray( exp.ravel(), y['variable'].ravel(), decimal=6) + @ignore_warnings(DeprecationWarning) def test_onnxrt_python_KNeighborsRegressor(self): iris = load_iris() X, y = iris.data, iris.target @@ -154,6 +184,7 @@ def test_onnxrt_python_KNeighborsRegressor(self): raise AssertionError( "Something is wrong with i={}".format(i)) from e + @ignore_warnings(DeprecationWarning) def test_onnxrt_python_LinearRegression(self): iris = load_iris() X, y = iris.data, iris.target @@ -173,6 +204,7 @@ def test_onnxrt_python_LinearRegression(self): self.assertIn('op_type=LinearRegressor', text) self.assertIn("post_transform=b'NONE'", text) + @ignore_warnings(DeprecationWarning) def test_onnxrt_python_LogisticRegression_binary(self): iris = load_iris() X, y = iris.data, iris.target @@ -193,6 +225,7 @@ def test_onnxrt_python_LogisticRegression_binary(self): got = pandas.DataFrame(list(y['output_probability'])).values self.assertEqualArray(exp, got, decimal=5) + @ignore_warnings(DeprecationWarning) def test_onnxrt_python_LogisticRegression_multi(self): iris = load_iris() X, y = iris.data, iris.target @@ -212,6 +245,7 @@ def test_onnxrt_python_LogisticRegression_multi(self): got = pandas.DataFrame(list(y['output_probability'])).values self.assertEqualArray(exp, got, decimal=5) + @ignore_warnings(DeprecationWarning) def test_onnxrt_python_StandardScaler(self): iris = load_iris() X, y = iris.data, iris.target @@ -226,6 +260,7 @@ def test_onnxrt_python_StandardScaler(self): exp = clr.transform(X_test) self.assertEqualArray(exp, got['variable'], decimal=6) + @ignore_warnings(DeprecationWarning) def test_onnxrt_python_Binarizer(self): iris = load_iris() X, y = iris.data, iris.target @@ -240,6 +275,7 @@ def test_onnxrt_python_Binarizer(self): exp = clr.transform(X_test) self.assertEqualArray(exp, got['variable'], decimal=6) + @ignore_warnings(DeprecationWarning) def test_dict_vectorizer(self): model = DictVectorizer() data = [{"amy": 1.0, "chin": 200.0}, {"nice": 3.0, "amy": 1.0}] @@ -253,6 +289,7 @@ def test_dict_vectorizer(self): self.assertEqual(list(sorted(got)), ['variable']) self.assertEqualArray(exp.todense(), got['variable'].todense()) + @ignore_warnings(DeprecationWarning) def test_onnxrt_python_SimpleImputer(self): iris = load_iris() X, y = iris.data, iris.target @@ -270,6 +307,7 @@ def test_onnxrt_python_SimpleImputer(self): self.assertEqualArray(exp, got['variable'], decimal=6) self.assertRaise(lambda: oinf.run({'X': X_test[0]}), RuntimeError) + @ignore_warnings(DeprecationWarning) def test_onnxrt_python_SimpleImputer_int(self): iris = load_iris() X, y = iris.data, iris.target diff --git a/_unittests/ut_onnxrt/test_onnxrt_validate_type.py b/_unittests/ut_onnxrt/test_onnxrt_validate_type.py index 566f672e3..9ab88a1f3 100644 --- a/_unittests/ut_onnxrt/test_onnxrt_validate_type.py +++ b/_unittests/ut_onnxrt/test_onnxrt_validate_type.py @@ -87,7 +87,7 @@ def filter_scenario(m, p, o, e, e2): row["odtypes"] = dtypes for dt in dtypes: - if dt in (dtype, numpy.int32, numpy.int64, numpy.str): + if dt in (dtype, numpy.int32, numpy.int64, numpy.str_): continue raise AssertionError( 'Issue with one model {}-{}-{} ({})\n----\n{}\n---\n{}'.format( diff --git a/mlprodict/grammar_sklearn/grammar/gtypes.py b/mlprodict/grammar_sklearn/grammar/gtypes.py index 4016cc9bb..55dc1b6b4 100644 --- a/mlprodict/grammar_sklearn/grammar/gtypes.py +++ b/mlprodict/grammar_sklearn/grammar/gtypes.py @@ -173,7 +173,7 @@ class MLNumTypeBool(MLNumTypeSingle): """ def __init__(self): - MLNumTypeSingle.__init__(self, numpy.bool, 'BL', 'bool', 'bool') + MLNumTypeSingle.__init__(self, numpy.bool_, 'BL', 'bool', 'bool') class MLTensor(MLType): diff --git a/mlprodict/npy/onnx_numpy_annotation.py b/mlprodict/npy/onnx_numpy_annotation.py index c49ce4eb2..056ad94dd 100644 --- a/mlprodict/npy/onnx_numpy_annotation.py +++ b/mlprodict/npy/onnx_numpy_annotation.py @@ -16,7 +16,7 @@ numpy_bool = bool try: - numpy_str = numpy.str + numpy_str = numpy.str_ except AttributeError: # pragma: no cover numpy_str = str @@ -222,7 +222,7 @@ def __repr__(self): def _to_onnx_dtype(self, dtype, shape): from skl2onnx.common.data_types import _guess_numpy_type if dtype == numpy.bool_: - dtype = numpy.bool + dtype = numpy.bool_ return _guess_numpy_type(dtype, shape) def _get_output_types(self, key): diff --git a/mlprodict/npy/onnx_variable.py b/mlprodict/npy/onnx_variable.py index baa49851b..bbd44e5ac 100644 --- a/mlprodict/npy/onnx_variable.py +++ b/mlprodict/npy/onnx_variable.py @@ -36,7 +36,7 @@ except AttributeError: # pragma: no cover numpy_bool = bool try: - numpy_str = numpy.str + numpy_str = numpy.str_ except AttributeError: # pragma: no cover numpy_str = str diff --git a/mlprodict/onnx_conv/convert.py b/mlprodict/onnx_conv/convert.py index 82526fe5f..681795cb3 100644 --- a/mlprodict/onnx_conv/convert.py +++ b/mlprodict/onnx_conv/convert.py @@ -102,7 +102,7 @@ def guess_initial_types(X, initial_types): if isinstance(X, pandas.DataFrame): initial_types = [] for c in X.columns: - if isinstance(X[c].values[0], (str, numpy.str)): + if isinstance(X[c].values[0], (str, numpy.str_)): g = StringTensorType() else: g = _guess_type(X[c].values) @@ -167,7 +167,7 @@ def _cast_data(X, ct): if isinstance(ct, DoubleTensorType): return X.astype(numpy.float64) if isinstance(ct, StringTensorType): - return X.astype(numpy.str) + return X.astype(numpy.str_) if isinstance(ct, Int64TensorType): return X.astype(numpy.int64) raise RuntimeError( # pragma: no cover diff --git a/mlprodict/onnx_tools/onnx2py_helper.py b/mlprodict/onnx_tools/onnx2py_helper.py index da585b8aa..212dd7a59 100644 --- a/mlprodict/onnx_tools/onnx2py_helper.py +++ b/mlprodict/onnx_tools/onnx2py_helper.py @@ -475,9 +475,9 @@ def guess_proto_dtype(dtype): return TensorProto.UINT8 # pylint: disable=E1101 if dtype == numpy.float16: return TensorProto.FLOAT16 # pylint: disable=E1101 - if dtype in (numpy.bool, bool, numpy.bool_): + if dtype in (bool, numpy.bool_): return TensorProto.BOOL # pylint: disable=E1101 - if dtype in (numpy.str, str, numpy.str_): + if dtype in (str, numpy.str_): return TensorProto.STRING # pylint: disable=E1101 raise RuntimeError( "Unable to guess type for dtype={}.".format(dtype)) # pragma: no cover diff --git a/mlprodict/onnxrt/ops_cpu/_op.py b/mlprodict/onnxrt/ops_cpu/_op.py index 8db4b55e5..905ce3ea2 100644 --- a/mlprodict/onnxrt/ops_cpu/_op.py +++ b/mlprodict/onnxrt/ops_cpu/_op.py @@ -152,13 +152,34 @@ def run(self, *args, **kwargs): # pylint: disable=E0202 """ Calls method ``_run``. """ + for ar in args: + a = ar.dtype + if not isinstance(a, numpy.dtype) and a not in { + numpy.int8, numpy.uint8, numpy.float16, numpy.float32, + numpy.float64, numpy.int32, numpy.int64, numpy.int16, + numpy.uint16, numpy.uint32, numpy.bool_, numpy.str_, + numpy.uint64, bool, str, }: + raise TypeError( # pragma: no cover + "Type ({}, {}) is not a numpy type (operator '{}')".format( + a, type(a), self.__class__.__name__)) try: - return self._run(*args, **kwargs) + res = self._run(*args, **kwargs) except TypeError as e: raise TypeError( # pragma: no cover "Issues with types {} (operator {}).".format( ", ".join(str(type(_)) for _ in args), self.__class__.__name__)) from e + for ar in res: + a = ar.dtype + if not isinstance(a, numpy.dtype) and a not in { + numpy.int8, numpy.uint8, numpy.float16, numpy.float32, + numpy.float64, numpy.int32, numpy.int64, numpy.int16, + numpy.uint16, numpy.uint32, numpy.bool_, numpy.str_, + numpy.uint64, bool, str, }: + raise TypeError( # pragma: no cover + "Type ({}, {}) is not a numpy type (operator '{}')".format( + a, type(a), self.__class__.__name__)) + return res def switch_initializers_dtype(self, dtype_in=numpy.float32, dtype_out=numpy.float64): @@ -239,10 +260,11 @@ def infer_types(self, *args, **kwargs): "res must be tuple not {} (operator '{}')".format( type(res), self.__class__.__name__)) for a in res: - if a not in {numpy.int8, numpy.uint8, numpy.float16, numpy.float32, - numpy.float64, numpy.int32, numpy.int64, numpy.int16, - numpy.uint16, numpy.uint32, numpy.bool_, numpy.str_, - numpy.uint64, bool, str, }: + if not isinstance(a, numpy.dtype) and a not in { + numpy.int8, numpy.uint8, numpy.float16, numpy.float32, + numpy.float64, numpy.int32, numpy.int64, numpy.int16, + numpy.uint16, numpy.uint32, numpy.bool_, numpy.str_, + numpy.uint64, bool, str, }: raise TypeError( # pragma: no cover "Type ({}, {}) is not a numpy type (operator '{}')".format( a, type(a), self.__class__.__name__)) diff --git a/mlprodict/onnxrt/ops_cpu/_op_classifier_string.py b/mlprodict/onnxrt/ops_cpu/_op_classifier_string.py index 7410410bc..75f8793ef 100644 --- a/mlprodict/onnxrt/ops_cpu/_op_classifier_string.py +++ b/mlprodict/onnxrt/ops_cpu/_op_classifier_string.py @@ -29,7 +29,7 @@ def _post_process_label_attributes(self): dtype=numpy.int64)) self._classlabels_int64s_string = self.classlabels_strings # pylint: disable=E0203 self.classlabels_strings = numpy.empty( - shape=(0, ), dtype=numpy.str) + shape=(0, ), dtype=numpy.str_) else: self._classlabels_int64s_string = None diff --git a/mlprodict/onnxrt/ops_cpu/_op_helper.py b/mlprodict/onnxrt/ops_cpu/_op_helper.py index 8caab7a4c..389ac914e 100644 --- a/mlprodict/onnxrt/ops_cpu/_op_helper.py +++ b/mlprodict/onnxrt/ops_cpu/_op_helper.py @@ -35,11 +35,11 @@ def proto2dtype(proto_type): if proto_type == TensorProto.FLOAT: # pylint: disable=E1101 return numpy.float32 if proto_type == TensorProto.BOOL: # pylint: disable=E1101 - return numpy.bool + return numpy.bool_ if proto_type == TensorProto.DOUBLE: # pylint: disable=E1101 return numpy.float64 if proto_type == TensorProto.STRING: # pylint: disable=E1101 - return numpy.str + return numpy.str_ if proto_type == TensorProto.INT64: # pylint: disable=E1101 return numpy.int64 if proto_type == TensorProto.INT32: # pylint: disable=E1101 @@ -77,9 +77,9 @@ def dtype_name(dtype): return "int32" if dtype == numpy.int64: return "int64" - if dtype == numpy.str: + if dtype == numpy.str_: return "str" - if dtype == numpy.bool: + if dtype == numpy.bool_: return "bool" raise ValueError( "Unexpected dtype {}.".format(dtype)) diff --git a/mlprodict/onnxrt/ops_cpu/op_cast.py b/mlprodict/onnxrt/ops_cpu/op_cast.py index 3955201ba..d60496a07 100644 --- a/mlprodict/onnxrt/ops_cpu/op_cast.py +++ b/mlprodict/onnxrt/ops_cpu/op_cast.py @@ -39,9 +39,9 @@ def __init__(self, onnx_node, desc=None, **options): elif self.to == TensorProto.UINT64: # pylint: disable=E1101 self._dtype = numpy.uint64 elif self.to == TensorProto.BOOL: # pylint: disable=E1101 - self._dtype = numpy.bool + self._dtype = numpy.bool_ elif self.to == TensorProto.STRING: # pylint: disable=E1101 - self._dtype = numpy.str + self._dtype = numpy.str_ elif self.to == TensorProto.FLOAT16: # pylint: disable=E1101 self._dtype = numpy.float16 elif self.to == TensorProto.COMPLEX64: # pylint: disable=E1101 diff --git a/mlprodict/onnxrt/ops_cpu/op_constant.py b/mlprodict/onnxrt/ops_cpu/op_constant.py index 8becea757..2440738c4 100644 --- a/mlprodict/onnxrt/ops_cpu/op_constant.py +++ b/mlprodict/onnxrt/ops_cpu/op_constant.py @@ -10,6 +10,18 @@ from ..shape_object import ShapeObject +def _check_dtype(val): + a = val.dtype + if not isinstance(a, numpy.dtype) and a not in { + numpy.int8, numpy.uint8, numpy.float16, numpy.float32, + numpy.float64, numpy.int32, numpy.int64, numpy.int16, + numpy.uint16, numpy.uint32, numpy.bool_, numpy.str_, + numpy.uint64, bool, str, }: + raise TypeError( # pragma: no cover + "Type ({}, {}) is not a numpy type (operator 'Constant')".format( + a, type(a))) + + class Constant_9(OpRun): atts = {'value': numpy.array([0], dtype=numpy.float32)} @@ -19,6 +31,7 @@ def __init__(self, onnx_node, desc=None, **options): expected_attributes=Constant.atts, **options) self.cst = self.value + _check_dtype(self.cst) def _run(self): # pylint: disable=W0221 return (self.cst, ) @@ -45,6 +58,7 @@ def __init__(self, onnx_node, desc=None, **options): self.cst = self.sparse_value else: self.cst = self.value + _check_dtype(self.cst) def _run(self): # pylint: disable=W0221 return (self.cst, ) @@ -93,6 +107,7 @@ def __init__(self, onnx_node, desc=None, **options): else: raise AttributeError( "No constant is defined for operator 'Constant'.") + _check_dtype(self.cst) def _run(self): # pylint: disable=W0221 return (self.cst, ) diff --git a/mlprodict/onnxrt/ops_cpu/op_constant_of_shape.py b/mlprodict/onnxrt/ops_cpu/op_constant_of_shape.py index 0972da164..fddb5d9f0 100644 --- a/mlprodict/onnxrt/ops_cpu/op_constant_of_shape.py +++ b/mlprodict/onnxrt/ops_cpu/op_constant_of_shape.py @@ -21,7 +21,7 @@ def __init__(self, onnx_node, desc=None, **options): if isinstance(self.value, numpy.ndarray) else self.value) if not isinstance(self.cst, (float, numpy.float32, numpy.float64, - numpy.int64, numpy.int32, numpy.bool, + numpy.int64, numpy.int32, numpy.bool_, numpy.float16)): raise TypeError( # pragma: no cover "cst must be a real not {}".format(type(self.cst))) diff --git a/mlprodict/onnxrt/ops_cpu/op_dict_vectorizer.py b/mlprodict/onnxrt/ops_cpu/op_dict_vectorizer.py index e59526526..22cd7f2d9 100644 --- a/mlprodict/onnxrt/ops_cpu/op_dict_vectorizer.py +++ b/mlprodict/onnxrt/ops_cpu/op_dict_vectorizer.py @@ -13,7 +13,7 @@ class DictVectorizer(OpRun): atts = {'int64_vocabulary': numpy.empty(0, dtype=numpy.int64), - 'string_vocabulary': numpy.empty(0, dtype=numpy.str)} + 'string_vocabulary': numpy.empty(0, dtype=numpy.str_)} def __init__(self, onnx_node, desc=None, **options): OpRun.__init__(self, onnx_node, desc=desc, diff --git a/mlprodict/onnxrt/ops_cpu/op_label_encoder.py b/mlprodict/onnxrt/ops_cpu/op_label_encoder.py index e4d544dd1..bf0fc778a 100644 --- a/mlprodict/onnxrt/ops_cpu/op_label_encoder.py +++ b/mlprodict/onnxrt/ops_cpu/op_label_encoder.py @@ -15,10 +15,10 @@ class LabelEncoder(OpRun): 'default_string': b'', 'keys_floats': numpy.empty(0, dtype=numpy.float32), 'keys_int64s': numpy.empty(0, dtype=numpy.int64), - 'keys_strings': numpy.empty(0, dtype=numpy.str), + 'keys_strings': numpy.empty(0, dtype=numpy.str_), 'values_floats': numpy.empty(0, dtype=numpy.float32), 'values_int64s': numpy.empty(0, dtype=numpy.int64), - 'values_strings': numpy.empty(0, dtype=numpy.str), + 'values_strings': numpy.empty(0, dtype=numpy.str_), } def __init__(self, onnx_node, desc=None, **options): diff --git a/mlprodict/onnxrt/ops_cpu/op_one_hot_encoder.py b/mlprodict/onnxrt/ops_cpu/op_one_hot_encoder.py index db0c00f17..412d61811 100644 --- a/mlprodict/onnxrt/ops_cpu/op_one_hot_encoder.py +++ b/mlprodict/onnxrt/ops_cpu/op_one_hot_encoder.py @@ -17,7 +17,7 @@ class OneHotEncoder(OpRun): """ atts = {'cats_int64s': numpy.empty(0, dtype=numpy.int64), - 'cats_strings': numpy.empty(0, dtype=numpy.str), + 'cats_strings': numpy.empty(0, dtype=numpy.str_), 'zeros': 1, } diff --git a/mlprodict/onnxrt/ops_cpu/op_zipmap.py b/mlprodict/onnxrt/ops_cpu/op_zipmap.py index e8f823c0e..d56ad712c 100644 --- a/mlprodict/onnxrt/ops_cpu/op_zipmap.py +++ b/mlprodict/onnxrt/ops_cpu/op_zipmap.py @@ -145,6 +145,10 @@ def __init__(self, rev_keys, mat): self._rev_keys = rev_keys self._mat = mat + @property + def dtype(self): + return self._mat.dtype + def __len__(self): return self._mat.shape[0] diff --git a/mlprodict/onnxrt/shape_object.py b/mlprodict/onnxrt/shape_object.py index e95dcd1da..9d319b330 100644 --- a/mlprodict/onnxrt/shape_object.py +++ b/mlprodict/onnxrt/shape_object.py @@ -500,11 +500,11 @@ def __init__(self, shape, dtype=None, use_n1=False, name=None): elif self._dtype in (int, 'int', 'int64'): self._dtype = numpy.int64 elif self._dtype in (str, 'str', numpy.str_): - self._dtype = numpy.str + self._dtype = numpy.str_ elif (hasattr(self._dtype, 'type') and self._dtype.type is numpy.string_): pass elif self._dtype in (bool, 'bool', numpy.bool_): - self._dtype = numpy.bool + self._dtype = numpy.bool_ elif self._dtype in (object, numpy.object_): pass elif self._dtype in (numpy.int8, 'int8', ): @@ -521,7 +521,7 @@ def __init__(self, shape, dtype=None, use_n1=False, name=None): self._dtype = numpy.uint64 elif self._dtype not in { numpy.float32, numpy.float64, numpy.int32, numpy.int64, - numpy.str, numpy.bool, numpy.float16, None, + numpy.str_, numpy.bool_, numpy.float16, None, numpy.complex64, numpy.complex128, 'map'}: raise ValueError( # pragma: no cover diff --git a/mlprodict/testing/test_utils/utils_backend_common.py b/mlprodict/testing/test_utils/utils_backend_common.py index 8617ab188..65b416de4 100644 --- a/mlprodict/testing/test_utils/utils_backend_common.py +++ b/mlprodict/testing/test_utils/utils_backend_common.py @@ -177,7 +177,7 @@ def compare_outputs(expected, output, verbose=False, **kwargs): if len(output.shape) == 3 and output.shape[0] == 1 and len( expected.shape) == 2: output = output.reshape(output.shape[1:]) - if expected.dtype in (numpy.str, numpy.dtype(" Date: Thu, 1 Jul 2021 01:20:41 +0200 Subject: [PATCH 2/3] lint MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: xavier dupré --- .../ut__skl2onnx/test_sklearn_pipeline.py | 854 ++++++------- .../ut_onnx_conv/test_onnx_conv_dataframe.py | 242 ++-- .../test_onnxrt_runtime_xgboost.py | 2 +- .../test_onnxrt_python_runtime_ml.py | 1 + .../ut_onnxrt/test_onnxrt_validate_type.py | 2 +- mlprodict/asv_benchmark/_create_asv_helper.py | 1081 +++++++++-------- mlprodict/cli/convert_validate.py | 2 +- mlprodict/npy/onnx_numpy_annotation.py | 2 +- .../onnx_tools/optim/onnx_optimisation.py | 85 +- .../optim/onnx_optimisation_identity.py | 239 ++-- .../optim/onnx_optimisation_redundant.py | 349 +++--- .../optim/onnx_optimisation_unused.py | 163 +-- mlprodict/onnxrt/onnx_inference.py | 3 +- mlprodict/onnxrt/ops_cpu/op_cdist.py | 2 +- mlprodict/onnxrt/ops_cpu/op_solve.py | 4 +- mlprodict/sklapi/onnx_transformer.py | 2 +- .../testing/einsum/einsum_impl_classes.py | 4 +- .../testing/test_utils/quantized_tensor.py | 2 +- mlprodict/tools/filename_helper.py | 2 +- 19 files changed, 1526 insertions(+), 1515 deletions(-) diff --git a/_unittests/ut__skl2onnx/test_sklearn_pipeline.py b/_unittests/ut__skl2onnx/test_sklearn_pipeline.py index 140e36ea9..a5f1121fd 100644 --- a/_unittests/ut__skl2onnx/test_sklearn_pipeline.py +++ b/_unittests/ut__skl2onnx/test_sklearn_pipeline.py @@ -1,426 +1,428 @@ -""" -@brief test tree node (time=3s) -""" -import unittest -import warnings -from urllib.error import HTTPError -from io import StringIO -import numpy -from numpy.testing import assert_almost_equal -import pandas -from sklearn import __version__ as sklearn_version -from sklearn import datasets -from sklearn.compose import ColumnTransformer -from sklearn.decomposition import PCA, TruncatedSVD -from sklearn.impute import SimpleImputer -from sklearn.linear_model import LogisticRegression -from sklearn.model_selection import train_test_split -from sklearn.pipeline import Pipeline, FeatureUnion -from sklearn.preprocessing import ( - OneHotEncoder, StandardScaler, MinMaxScaler) -from sklearn.utils._testing import ignore_warnings -from pyquickhelper.pycode import ExtTestCase -from skl2onnx import convert_sklearn -from skl2onnx.common.data_types import ( - FloatTensorType, Int64TensorType, StringTensorType) -from mlprodict.testing.test_utils import ( - dump_data_and_model, fit_classification_model) -from mlprodict.tools.ort_wrapper import InferenceSession - - -class PipeConcatenateInput: - def __init__(self, pipe): - self.pipe = pipe - - def transform(self, inp): - if isinstance(inp, (numpy.ndarray, pandas.DataFrame)): - return self.pipe.transform(inp) - if isinstance(inp, dict): - keys = list(sorted(inp.keys())) - dim = inp[keys[0]].shape[0], len(keys) - x2 = numpy.zeros(dim) - for i in range(x2.shape[1]): - x2[:, i] = inp[keys[i]].ravel() - res = self.pipe.transform(x2) - return res - raise TypeError( - "Unable to predict with type {0}".format(type(inp))) - - -class TestSklearnPipeline(ExtTestCase): - - def test_pipeline(self): - data = numpy.array([[0, 0], [0, 0], [1, 1], [1, 1]], - dtype=numpy.float32) - scaler = StandardScaler() - scaler.fit(data) - model = Pipeline([("scaler1", scaler), ("scaler2", scaler)]) - - model_onnx = convert_sklearn(model, "pipeline", - [("input", FloatTensorType([None, 2]))]) - self.assertTrue(model_onnx is not None) - dump_data_and_model(data, model, model_onnx, - basename="SklearnPipelineScaler") - - def test_combine_inputs(self): - data = numpy.array( - [[0.0, 0.0], [0.0, 0.0], [1.0, 1.0], [1.0, 1.0]], - dtype=numpy.float32) - scaler = StandardScaler() - scaler.fit(data) - model = Pipeline([("scaler1", scaler), ("scaler2", scaler)]) - - model_onnx = convert_sklearn( - model, - "pipeline", - [ - ("input1", FloatTensorType([None, 1])), - ("input2", FloatTensorType([None, 1])), - ], - ) - self.assertTrue( - len(model_onnx.graph.node[-1].output) == 1) # pylint: disable=E1101 - self.assertTrue(model_onnx is not None) - data = { - "input1": data[:, 0].reshape((-1, 1)), - "input2": data[:, 1].reshape((-1, 1)), - } - dump_data_and_model( - data, PipeConcatenateInput(model), - model_onnx, basename="SklearnPipelineScaler11") - - def test_combine_inputs_union_in_pipeline(self): - - data = numpy.array( - [[0.0, 0.0], [0.0, 0.0], [1.0, 1.0], [1.0, 1.0]], - dtype=numpy.float32) - model = Pipeline([ - ("scaler1", StandardScaler()), - ( - "union", - FeatureUnion([ - ("scaler2", StandardScaler()), - ("scaler3", MinMaxScaler()), - ]), - ), - ]) - model.fit(data) - model_onnx = convert_sklearn( - model, - "pipeline", - [ - ("input1", FloatTensorType([None, 1])), - ("input2", FloatTensorType([None, 1])), - ], - ) - self.assertTrue( - len(model_onnx.graph.node[-1].output) == 1) # pylint: disable=E1101 - self.assertTrue(model_onnx is not None) - data = { - "input1": data[:, 0].reshape((-1, 1)), - "input2": data[:, 1].reshape((-1, 1)), - } - dump_data_and_model( - data, PipeConcatenateInput(model), - model_onnx, basename="SklearnPipelineScaler11Union") - - def test_combine_inputs_floats_ints(self): - data = [[0, 0.0], [0, 0.0], [1, 1.0], [1, 1.0]] - scaler = StandardScaler() - scaler.fit(data) - model = Pipeline([("scaler1", scaler), ("scaler2", scaler)]) - - model_onnx = convert_sklearn( - model, - "pipeline", - [ - # First input decides the output type. - ("input2", FloatTensorType([None, 1])), - ("input1", Int64TensorType([None, 1])), - ], - ) - self.assertTrue( - len(model_onnx.graph.node[-1].output) == 1) # pylint: disable=E1101 - self.assertTrue(model_onnx is not None) - data = numpy.array(data) - data = { - "input1": data[:, 0].reshape((-1, 1)).astype(numpy.int64), - "input2": data[:, 1].reshape((-1, 1)).astype(numpy.float32), - } - dump_data_and_model( - data, PipeConcatenateInput(model), - model_onnx, basename="SklearnPipelineScalerMixed") - - @ignore_warnings(category=RuntimeWarning) - def test_pipeline_column_transformer(self): - - iris = datasets.load_iris() - X = iris.data[:, :3] - y = iris.target - X_train = pandas.DataFrame(X, columns=["vA", "vB", "vC"]) - X_train["vcat"] = X_train["vA"].apply(lambda x: "cat1" - if x > 0.5 else "cat2") - X_train["vcat2"] = X_train["vB"].apply(lambda x: "cat3" - if x > 0.5 else "cat4") - y_train = y % 2 - numeric_features = [0, 1, 2] # ["vA", "vB", "vC"] - categorical_features = [3, 4] # ["vcat", "vcat2"] - - classifier = LogisticRegression( - C=0.01, class_weight=dict(zip([False, True], [0.2, 0.8])), - n_jobs=1, max_iter=10, solver="lbfgs", tol=1e-3) - - numeric_transformer = Pipeline(steps=[ - ("imputer", SimpleImputer(strategy="median")), - ("scaler", StandardScaler()), - ]) - - categorical_transformer = Pipeline(steps=[ - ( - "onehot", - OneHotEncoder(sparse=True, handle_unknown="ignore"), - ), - ( - "tsvd", - TruncatedSVD(n_components=1, algorithm="arpack", tol=1e-4), - ), - ]) - - preprocessor = ColumnTransformer(transformers=[ - ("num", numeric_transformer, numeric_features), - ("cat", categorical_transformer, categorical_features), - ]) - - model = Pipeline(steps=[("precprocessor", - preprocessor), ("classifier", classifier)]) - - model.fit(X_train, y_train) - initial_type = [ - ("numfeat", FloatTensorType([None, 3])), - ("strfeat", StringTensorType([None, 2])), - ] - - X_train = X_train[:11] - model_onnx = convert_sklearn(model, initial_types=initial_type) - - dump_data_and_model( - X_train, model, model_onnx, - basename="SklearnPipelineColumnTransformerPipeliner") - - def test_pipeline_column_transformer_titanic(self): - - # fit - titanic_url = ( - "https://raw.githubusercontent.com/amueller/" - "scipy-2017-sklearn/091d371/notebooks/datasets/titanic3.csv") - try: - data = pandas.read_csv(titanic_url) - except HTTPError: - warnings.warn("Connectivity issue for '{}'.".format(titanic_url)) - return - X = data.drop("survived", axis=1) - y = data["survived"] - - # SimpleImputer on string is not available for string - # in ONNX-ML specifications. - # So we do it beforehand. - for cat in ["embarked", "sex", "pclass"]: - X[cat].fillna("missing", inplace=True) - - X_train, X_test, y_train, _ = train_test_split( - X, y, test_size=0.2) - - numeric_features = ["age", "fare"] - numeric_transformer = Pipeline(steps=[ - ("imputer", SimpleImputer(strategy="median")), - ("scaler", StandardScaler()), - ]) - - categorical_features = ["embarked", "sex", "pclass"] - categorical_transformer = Pipeline(steps=[ - # --- SimpleImputer on string is not available - # for string in ONNX-ML specifications. - # ('imputer', - # SimpleImputer(strategy='constant', fill_value='missing')), - ("onehot", OneHotEncoder(handle_unknown="ignore")) - ]) - - preprocessor = ColumnTransformer(transformers=[ - ("num", numeric_transformer, numeric_features), - ("cat", categorical_transformer, categorical_features), - ]) - - clf = Pipeline(steps=[ - ("preprocessor", preprocessor), - # ("classifier", LogisticRegression(solver="lbfgs")), - ]) - - # inputs - - def convert_dataframe_schema(df, drop=None): - inputs = [] - for k, v in zip(df.columns, df.dtypes): - if drop is not None and k in drop: - continue - if v == 'int64': - t = Int64TensorType([None, 1]) - elif v == "float64": - t = FloatTensorType([None, 1]) - else: - t = StringTensorType([None, 1]) - inputs.append((k, t)) - return inputs - - to_drop = { - "parch", - "sibsp", - "cabin", - "ticket", - "name", - "body", - "home.dest", - "boat", - } - - X_train = X_train.copy() - X_test = X_test.copy() - X_train['pclass'] = X_train['pclass'].astype(numpy.int64) - X_test['pclass'] = X_test['pclass'].astype(numpy.int64) - X_train = X_train.drop(to_drop, axis=1) - X_test = X_test.drop(to_drop, axis=1) - - clf.fit(X_train, y_train) - inputs = convert_dataframe_schema(X_train, to_drop) - model_onnx = convert_sklearn(clf, "pipeline_titanic", inputs) - - data = X_test[:5] - pred = clf.transform(data) - data_types = { - 'pclass': numpy.int64, - 'age': numpy.float32, - 'sex': numpy.str_, - 'fare': numpy.float32, - 'embarked': numpy.str_, - } - inputs = {k: data[k].values.astype(data_types[k]).reshape(-1, 1) - for k in data.columns} - sess = InferenceSession(model_onnx.SerializeToString()) - run = sess.run(None, inputs) - got = run[-1] - assert_almost_equal(pred, got, decimal=5) - - def test_column_transformer_weights(self): - model, X = fit_classification_model( - ColumnTransformer( - [('pca', PCA(n_components=5), slice(0, 10)), - ('svd', TruncatedSVD(n_components=5), slice(10, 100))], - transformer_weights={'pca': 2, 'svd': 3}), 3, n_features=100) - model_onnx = convert_sklearn( - model, - "column transformer weights", - [("input", FloatTensorType([None, X.shape[1]]))]) - self.assertIsNotNone(model_onnx) - dump_data_and_model( - X, model, model_onnx, - basename="SklearnColumnTransformerWeights-Dec4") - - def test_column_transformer_drop(self): - model, X = fit_classification_model( - ColumnTransformer( - [('pca', PCA(n_components=5), slice(0, 10)), - ('svd', TruncatedSVD(n_components=5), slice(80, 100))], - remainder='drop'), 3, n_features=100) - model_onnx = convert_sklearn( - model, - "column transformer drop", - [("input", FloatTensorType([None, X.shape[1]]))]) - self.assertIsNotNone(model_onnx) - dump_data_and_model( - X, model, model_onnx, - basename="SklearnColumnTransformerDrop") - - def test_column_transformer_passthrough(self): - model, X = fit_classification_model( - ColumnTransformer( - [('pca', PCA(n_components=5), slice(0, 10)), - ('svd', TruncatedSVD(n_components=5), slice(80, 100))], - transformer_weights={'pca': 2, 'svd': 3}, - remainder='passthrough'), 3, n_features=100) - model_onnx = convert_sklearn( - model, "column transformer passthrough", - [("input", FloatTensorType([None, X.shape[1]]))]) - self.assertIsNotNone(model_onnx) - dump_data_and_model( - X, model, model_onnx, - basename="SklearnColumnTransformerPassthrough") - - def test_column_transformer_passthrough_no_weights(self): - model, X = fit_classification_model( - ColumnTransformer( - [('pca', PCA(n_components=5), slice(0, 10)), - ('svd', TruncatedSVD(n_components=5), slice(70, 80))], - remainder='passthrough'), 3, n_features=100) - model_onnx = convert_sklearn( - model, "column transformer passthrough", - [("input", FloatTensorType([None, X.shape[1]]))]) - self.assertIsNotNone(model_onnx) - dump_data_and_model( - X, model, model_onnx, - basename="SklearnColumnTransformerPassthroughNoWeights") - - def test_pipeline_dataframe(self): - text = """ - fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur_dioxide,total_sulfur_dioxide,density,pH,sulphates,alcohol,quality,color - 7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,red - 7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5,red - 7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5,red - 11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6,red - """.replace(" ", "") - X_train = pandas.read_csv(StringIO(text)) - for c in X_train.columns: - if c != 'color': - X_train[c] = X_train[c].astype(numpy.float32) - numeric_features = [c for c in X_train if c != 'color'] - - pipe = Pipeline([ - ("prep", ColumnTransformer([ - ("color", Pipeline([ - ('one', OneHotEncoder()), - ('select', ColumnTransformer( - [('sel1', 'passthrough', [0])])) - ]), ['color']), - ("others", "passthrough", numeric_features) - ])), - ]) - - init_types = [ - ('fixed_acidity', FloatTensorType(shape=[None, 1])), - ('volatile_acidity', FloatTensorType(shape=[None, 1])), - ('citric_acid', FloatTensorType(shape=[None, 1])), - ('residual_sugar', FloatTensorType(shape=[None, 1])), - ('chlorides', FloatTensorType(shape=[None, 1])), - ('free_sulfur_dioxide', FloatTensorType(shape=[None, 1])), - ('total_sulfur_dioxide', FloatTensorType(shape=[None, 1])), - ('density', FloatTensorType(shape=[None, 1])), - ('pH', FloatTensorType(shape=[None, 1])), - ('sulphates', FloatTensorType(shape=[None, 1])), - ('alcohol', FloatTensorType(shape=[None, 1])), - ('quality', FloatTensorType(shape=[None, 1])), - ('color', StringTensorType(shape=[None, 1])) - ] - - pipe.fit(X_train) - model_onnx = convert_sklearn(pipe, initial_types=init_types) - oinf = InferenceSession(model_onnx.SerializeToString()) - - pred = pipe.transform(X_train) - inputs = {c: X_train[c].values for c in X_train.columns} - inputs = {c: v.reshape((v.shape[0], 1)) for c, v in inputs.items()} - onxp = oinf.run(None, inputs) - got = onxp[0] - assert_almost_equal(pred, got) - - -if __name__ == "__main__": - # TestSklearnPipeline().test_combine_inputs_floats_ints() - unittest.main() +""" +@brief test tree node (time=3s) +""" +import unittest +import warnings +from urllib.error import HTTPError +from io import StringIO +import numpy +from numpy.testing import assert_almost_equal +import pandas +from sklearn import __version__ as sklearn_version +from sklearn import datasets +from sklearn.compose import ColumnTransformer +from sklearn.decomposition import PCA, TruncatedSVD +from sklearn.impute import SimpleImputer +from sklearn.linear_model import LogisticRegression +from sklearn.model_selection import train_test_split +from sklearn.pipeline import Pipeline, FeatureUnion +from sklearn.preprocessing import ( + OneHotEncoder, StandardScaler, MinMaxScaler) +from sklearn.utils._testing import ignore_warnings +from pyquickhelper.pycode import ExtTestCase +from skl2onnx import convert_sklearn +from skl2onnx.common.data_types import ( + FloatTensorType, Int64TensorType, StringTensorType) +from mlprodict.testing.test_utils import ( + dump_data_and_model, fit_classification_model) +from mlprodict.tools.ort_wrapper import InferenceSession + + +class PipeConcatenateInput: + def __init__(self, pipe): + self.pipe = pipe + + def transform(self, inp): + if isinstance(inp, (numpy.ndarray, pandas.DataFrame)): + return self.pipe.transform(inp) + if isinstance(inp, dict): + keys = list(sorted(inp.keys())) + dim = inp[keys[0]].shape[0], len(keys) + x2 = numpy.zeros(dim) + for i in range(x2.shape[1]): + x2[:, i] = inp[keys[i]].ravel() + res = self.pipe.transform(x2) + return res + raise TypeError( + "Unable to predict with type {0}".format(type(inp))) + + +class TestSklearnPipeline(ExtTestCase): + + def test_pipeline(self): + data = numpy.array([[0, 0], [0, 0], [1, 1], [1, 1]], + dtype=numpy.float32) + scaler = StandardScaler() + scaler.fit(data) + model = Pipeline([("scaler1", scaler), ("scaler2", scaler)]) + + model_onnx = convert_sklearn(model, "pipeline", + [("input", FloatTensorType([None, 2]))]) + self.assertTrue(model_onnx is not None) + dump_data_and_model(data, model, model_onnx, + basename="SklearnPipelineScaler") + + def test_combine_inputs(self): + data = numpy.array( + [[0.0, 0.0], [0.0, 0.0], [1.0, 1.0], [1.0, 1.0]], + dtype=numpy.float32) + scaler = StandardScaler() + scaler.fit(data) + model = Pipeline([("scaler1", scaler), ("scaler2", scaler)]) + + model_onnx = convert_sklearn( + model, + "pipeline", + [ + ("input1", FloatTensorType([None, 1])), + ("input2", FloatTensorType([None, 1])), + ], + ) + self.assertTrue( + len(model_onnx.graph.node[-1].output) == 1) # pylint: disable=E1101 + self.assertTrue(model_onnx is not None) + data = { + "input1": data[:, 0].reshape((-1, 1)), + "input2": data[:, 1].reshape((-1, 1)), + } + dump_data_and_model( + data, PipeConcatenateInput(model), + model_onnx, basename="SklearnPipelineScaler11") + + def test_combine_inputs_union_in_pipeline(self): + + data = numpy.array( + [[0.0, 0.0], [0.0, 0.0], [1.0, 1.0], [1.0, 1.0]], + dtype=numpy.float32) + model = Pipeline([ + ("scaler1", StandardScaler()), + ( + "union", + FeatureUnion([ + ("scaler2", StandardScaler()), + ("scaler3", MinMaxScaler()), + ]), + ), + ]) + model.fit(data) + model_onnx = convert_sklearn( + model, + "pipeline", + [ + ("input1", FloatTensorType([None, 1])), + ("input2", FloatTensorType([None, 1])), + ], + ) + self.assertTrue( + len(model_onnx.graph.node[-1].output) == 1) # pylint: disable=E1101 + self.assertTrue(model_onnx is not None) + data = { + "input1": data[:, 0].reshape((-1, 1)), + "input2": data[:, 1].reshape((-1, 1)), + } + dump_data_and_model( + data, PipeConcatenateInput(model), + model_onnx, basename="SklearnPipelineScaler11Union") + + def test_combine_inputs_floats_ints(self): + data = [[0, 0.0], [0, 0.0], [1, 1.0], [1, 1.0]] + scaler = StandardScaler() + scaler.fit(data) + model = Pipeline([("scaler1", scaler), ("scaler2", scaler)]) + + model_onnx = convert_sklearn( + model, + "pipeline", + [ + # First input decides the output type. + ("input2", FloatTensorType([None, 1])), + ("input1", Int64TensorType([None, 1])), + ], + ) + self.assertTrue( + len(model_onnx.graph.node[-1].output) == 1) # pylint: disable=E1101 + self.assertTrue(model_onnx is not None) + data = numpy.array(data) + data = { + "input1": data[:, 0].reshape((-1, 1)).astype(numpy.int64), + "input2": data[:, 1].reshape((-1, 1)).astype(numpy.float32), + } + dump_data_and_model( + data, PipeConcatenateInput(model), + model_onnx, basename="SklearnPipelineScalerMixed") + + @ignore_warnings(category=RuntimeWarning) + def test_pipeline_column_transformer(self): + + iris = datasets.load_iris() + X = iris.data[:, :3] + y = iris.target + X_train = pandas.DataFrame(X, columns=["vA", "vB", "vC"]) + X_train["vcat"] = X_train["vA"].apply(lambda x: "cat1" + if x > 0.5 else "cat2") + X_train["vcat2"] = X_train["vB"].apply(lambda x: "cat3" + if x > 0.5 else "cat4") + y_train = y % 2 + numeric_features = [0, 1, 2] # ["vA", "vB", "vC"] + categorical_features = [3, 4] # ["vcat", "vcat2"] + + classifier = LogisticRegression( + C=0.01, class_weight=dict(zip([False, True], [0.2, 0.8])), + n_jobs=1, max_iter=10, solver="lbfgs", tol=1e-3) + + numeric_transformer = Pipeline(steps=[ + ("imputer", SimpleImputer(strategy="median")), + ("scaler", StandardScaler()), + ]) + + categorical_transformer = Pipeline(steps=[ + ( + "onehot", + OneHotEncoder(sparse=True, handle_unknown="ignore"), + ), + ( + "tsvd", + TruncatedSVD(n_components=1, algorithm="arpack", tol=1e-4), + ), + ]) + + preprocessor = ColumnTransformer(transformers=[ + ("num", numeric_transformer, numeric_features), + ("cat", categorical_transformer, categorical_features), + ]) + + model = Pipeline(steps=[("precprocessor", + preprocessor), ("classifier", classifier)]) + + model.fit(X_train, y_train) + initial_type = [ + ("numfeat", FloatTensorType([None, 3])), + ("strfeat", StringTensorType([None, 2])), + ] + + X_train = X_train[:11] + model_onnx = convert_sklearn(model, initial_types=initial_type) + + dump_data_and_model( + X_train, model, model_onnx, + basename="SklearnPipelineColumnTransformerPipeliner") + + def test_pipeline_column_transformer_titanic(self): + + # fit + titanic_url = ( + "https://raw.githubusercontent.com/amueller/" + "scipy-2017-sklearn/091d371/notebooks/datasets/titanic3.csv") + try: + data = pandas.read_csv(titanic_url) + except HTTPError: + warnings.warn("Connectivity issue for '{}'.".format(titanic_url)) + return + X = data.drop("survived", axis=1) + y = data["survived"] # pylint: disable=E1136 + + # SimpleImputer on string is not available for string + # in ONNX-ML specifications. + # So we do it beforehand. + for cat in ["embarked", "sex", "pclass"]: + X[cat].fillna("missing", inplace=True) + + X_train, X_test, y_train, _ = train_test_split( + X, y, test_size=0.2) + + numeric_features = ["age", "fare"] + numeric_transformer = Pipeline(steps=[ + ("imputer", SimpleImputer(strategy="median")), + ("scaler", StandardScaler()), + ]) + + categorical_features = ["embarked", "sex", "pclass"] + categorical_transformer = Pipeline(steps=[ + # --- SimpleImputer on string is not available + # for string in ONNX-ML specifications. + # ('imputer', + # SimpleImputer(strategy='constant', fill_value='missing')), + ("onehot", OneHotEncoder(handle_unknown="ignore")) + ]) + + preprocessor = ColumnTransformer(transformers=[ + ("num", numeric_transformer, numeric_features), + ("cat", categorical_transformer, categorical_features), + ]) + + clf = Pipeline(steps=[ + ("preprocessor", preprocessor), + # ("classifier", LogisticRegression(solver="lbfgs")), + ]) + + # inputs + + def convert_dataframe_schema(df, drop=None): + inputs = [] + for k, v in zip(df.columns, df.dtypes): + if drop is not None and k in drop: + continue + if v == 'int64': + t = Int64TensorType([None, 1]) + elif v == "float64": + t = FloatTensorType([None, 1]) + else: + t = StringTensorType([None, 1]) + inputs.append((k, t)) + return inputs + + to_drop = { + "parch", + "sibsp", + "cabin", + "ticket", + "name", + "body", + "home.dest", + "boat", + } + + X_train = X_train.copy() + X_test = X_test.copy() + X_train['pclass'] = X_train['pclass'].astype(numpy.int64) + X_test['pclass'] = X_test['pclass'].astype(numpy.int64) + X_train = X_train.drop(to_drop, axis=1) + X_test = X_test.drop(to_drop, axis=1) + + clf.fit(X_train, y_train) + inputs = convert_dataframe_schema(X_train, to_drop) + model_onnx = convert_sklearn(clf, "pipeline_titanic", inputs) + + data = X_test[:5] + pred = clf.transform(data) + data_types = { + 'pclass': numpy.int64, + 'age': numpy.float32, + 'sex': numpy.str_, + 'fare': numpy.float32, + 'embarked': numpy.str_, + } + inputs = {k: data[k].values.astype(data_types[k]).reshape(-1, 1) + for k in data.columns} + sess = InferenceSession(model_onnx.SerializeToString()) + run = sess.run(None, inputs) + got = run[-1] + assert_almost_equal(pred, got, decimal=5) + + def test_column_transformer_weights(self): + model, X = fit_classification_model( + ColumnTransformer( + [('pca', PCA(n_components=5), slice(0, 10)), + ('svd', TruncatedSVD(n_components=5), slice(10, 100))], + transformer_weights={'pca': 2, 'svd': 3}), 3, n_features=100) + model_onnx = convert_sklearn( + model, + "column transformer weights", + [("input", FloatTensorType([None, X.shape[1]]))]) + self.assertIsNotNone(model_onnx) + dump_data_and_model( + X, model, model_onnx, + basename="SklearnColumnTransformerWeights-Dec4") + + def test_column_transformer_drop(self): + model, X = fit_classification_model( + ColumnTransformer( + [('pca', PCA(n_components=5), slice(0, 10)), + ('svd', TruncatedSVD(n_components=5), slice(80, 100))], + remainder='drop'), 3, n_features=100) + model_onnx = convert_sklearn( + model, + "column transformer drop", + [("input", FloatTensorType([None, X.shape[1]]))]) + self.assertIsNotNone(model_onnx) + dump_data_and_model( + X, model, model_onnx, + basename="SklearnColumnTransformerDrop") + + def test_column_transformer_passthrough(self): + model, X = fit_classification_model( + ColumnTransformer( + [('pca', PCA(n_components=5), slice(0, 10)), + ('svd', TruncatedSVD(n_components=5), slice(80, 100))], + transformer_weights={'pca': 2, 'svd': 3}, + remainder='passthrough'), 3, n_features=100) + model_onnx = convert_sklearn( + model, "column transformer passthrough", + [("input", FloatTensorType([None, X.shape[1]]))]) + self.assertIsNotNone(model_onnx) + dump_data_and_model( + X, model, model_onnx, + basename="SklearnColumnTransformerPassthrough") + + def test_column_transformer_passthrough_no_weights(self): + model, X = fit_classification_model( + ColumnTransformer( + [('pca', PCA(n_components=5), slice(0, 10)), + ('svd', TruncatedSVD(n_components=5), slice(70, 80))], + remainder='passthrough'), 3, n_features=100) + model_onnx = convert_sklearn( + model, "column transformer passthrough", + [("input", FloatTensorType([None, X.shape[1]]))]) + self.assertIsNotNone(model_onnx) + dump_data_and_model( + X, model, model_onnx, + basename="SklearnColumnTransformerPassthroughNoWeights") + + def test_pipeline_dataframe(self): + text = """ + fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur_dioxide,total_sulfur_dioxide,density,pH,sulphates,alcohol,quality,color + 7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,red + 7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5,red + 7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5,red + 11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6,red + """.replace(" ", "") + X_train = pandas.read_csv(StringIO(text)) + for c in X_train.columns: + if c != 'color': + X_train[c] = X_train[c].astype( # pylint: disable=E1136,E1137 + numpy.float32) + numeric_features = [c for c in X_train if c != 'color'] + + pipe = Pipeline([ + ("prep", ColumnTransformer([ + ("color", Pipeline([ + ('one', OneHotEncoder()), + ('select', ColumnTransformer( + [('sel1', 'passthrough', [0])])) + ]), ['color']), + ("others", "passthrough", numeric_features) + ])), + ]) + + init_types = [ + ('fixed_acidity', FloatTensorType(shape=[None, 1])), + ('volatile_acidity', FloatTensorType(shape=[None, 1])), + ('citric_acid', FloatTensorType(shape=[None, 1])), + ('residual_sugar', FloatTensorType(shape=[None, 1])), + ('chlorides', FloatTensorType(shape=[None, 1])), + ('free_sulfur_dioxide', FloatTensorType(shape=[None, 1])), + ('total_sulfur_dioxide', FloatTensorType(shape=[None, 1])), + ('density', FloatTensorType(shape=[None, 1])), + ('pH', FloatTensorType(shape=[None, 1])), + ('sulphates', FloatTensorType(shape=[None, 1])), + ('alcohol', FloatTensorType(shape=[None, 1])), + ('quality', FloatTensorType(shape=[None, 1])), + ('color', StringTensorType(shape=[None, 1])) + ] + + pipe.fit(X_train) + model_onnx = convert_sklearn(pipe, initial_types=init_types) + oinf = InferenceSession(model_onnx.SerializeToString()) + + pred = pipe.transform(X_train) + inputs = { + c: X_train[c].values for c in X_train.columns} # pylint: disable=E1101,E1136 + inputs = {c: v.reshape((v.shape[0], 1)) for c, v in inputs.items()} + onxp = oinf.run(None, inputs) + got = onxp[0] + assert_almost_equal(pred, got) + + +if __name__ == "__main__": + # TestSklearnPipeline().test_combine_inputs_floats_ints() + unittest.main() diff --git a/_unittests/ut_onnx_conv/test_onnx_conv_dataframe.py b/_unittests/ut_onnx_conv/test_onnx_conv_dataframe.py index 0d64e5668..6023abdb5 100644 --- a/_unittests/ut_onnx_conv/test_onnx_conv_dataframe.py +++ b/_unittests/ut_onnx_conv/test_onnx_conv_dataframe.py @@ -1,120 +1,122 @@ -""" -@brief test log(time=2s) -""" -import unittest -from logging import getLogger -from io import StringIO -import numpy -import pandas -from pyquickhelper.pycode import ExtTestCase -from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder -from sklearn.pipeline import Pipeline -from sklearn.compose import ColumnTransformer -from skl2onnx.common.data_types import Int64TensorType -from mlprodict.onnx_conv import ( - to_onnx, guess_schema_from_data, get_inputs_from_data) -from mlprodict.onnxrt import OnnxInference - - -class TestOnnxConvDataframe(ExtTestCase): - - def setUp(self): - logger = getLogger('skl2onnx') - logger.disabled = True - - def test_pipeline_dataframe_case1(self): - self.case_test_pipeline_dataframe(1) - - def test_pipeline_dataframe_case2(self): - self.case_test_pipeline_dataframe(2) - - def test_pipeline_dataframe_case3(self): - self.case_test_pipeline_dataframe(3) - - def test_pipeline_dataframe_case4(self): - self.case_test_pipeline_dataframe(4) - - def test_pipeline_dataframe_case4_cat(self): - self.case_test_pipeline_dataframe(4, cat=True) - - def case_test_pipeline_dataframe(self, case, cat=False): - text = """ - fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur_dioxide,total_sulfur_dioxide,density,pH,sulphates,alcohol,quality,color - 7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,red - 7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5,red - 7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5,red - 11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6,white - """.replace(" ", "") - X_train = pandas.read_csv(StringIO(text)) - for c in X_train.columns: - if c != 'color': - X_train[c] = X_train[c].astype(numpy.float32) - numeric_features = [c for c in X_train if c != 'color'] - - if case == 1: - pipe = Pipeline([ - ("prep", ColumnTransformer([ - ("color", Pipeline([ - ('one', OneHotEncoder(sparse=False)), - ]), ['color']), - ("others", "passthrough", numeric_features) - ])), - ]) - elif case == 2: - pipe = Pipeline([ - ("prep", ColumnTransformer([ - ("color", Pipeline([ - ('one', OneHotEncoder(sparse=False)), - ('select', ColumnTransformer( - [('sel1', "passthrough", [0])])) - ]), ['color']), - ("others", "passthrough", numeric_features) - ])), - ]) - elif case == 3: - pipe = Pipeline([ - ("prep", ColumnTransformer([ - ("colorord", OrdinalEncoder(), ['color']), - ("others", "passthrough", numeric_features) - ])), - ]) - elif case == 4: - pipe = Pipeline([ - ("prep", ColumnTransformer([ - ("color", Pipeline([ - ('one', OneHotEncoder(sparse=False)), - ('select', ColumnTransformer( - [('sel1', "passthrough", [0])])) - ]), ['color']), - ("colorord", OrdinalEncoder(), ['color']), - ("others", "passthrough", numeric_features) - ])), - ]) - else: - raise NotImplementedError() - - if cat: - X_train['color'] = X_train['color'].astype('category') - schema = guess_schema_from_data(X_train) - if isinstance(schema[-1][-1], Int64TensorType): - raise AssertionError( - "Issue with type of last column %r: %r." % ( - schema[-1], X_train.dtypes[-1])) - - pipe.fit(X_train) - model_onnx = to_onnx(pipe, X_train) - try: - oinf = OnnxInference(model_onnx) - except RuntimeError as e: - raise RuntimeError("Fails for case={}\n{}".format( - case, e)) from e - - pred = pipe.transform(X_train) - inputs = get_inputs_from_data(X_train) - onxp = oinf.run(inputs) - got = onxp['transformed_column'] - self.assertEqualArray(pred, got) - - -if __name__ == "__main__": - unittest.main() +""" +@brief test log(time=2s) +""" +import unittest +from logging import getLogger +from io import StringIO +import numpy +import pandas +from pyquickhelper.pycode import ExtTestCase +from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder +from sklearn.pipeline import Pipeline +from sklearn.compose import ColumnTransformer +from skl2onnx.common.data_types import Int64TensorType +from mlprodict.onnx_conv import ( + to_onnx, guess_schema_from_data, get_inputs_from_data) +from mlprodict.onnxrt import OnnxInference + + +class TestOnnxConvDataframe(ExtTestCase): + + def setUp(self): + logger = getLogger('skl2onnx') + logger.disabled = True + + def test_pipeline_dataframe_case1(self): + self.case_test_pipeline_dataframe(1) + + def test_pipeline_dataframe_case2(self): + self.case_test_pipeline_dataframe(2) + + def test_pipeline_dataframe_case3(self): + self.case_test_pipeline_dataframe(3) + + def test_pipeline_dataframe_case4(self): + self.case_test_pipeline_dataframe(4) + + def test_pipeline_dataframe_case4_cat(self): + self.case_test_pipeline_dataframe(4, cat=True) + + def case_test_pipeline_dataframe(self, case, cat=False): + text = """ + fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur_dioxide,total_sulfur_dioxide,density,pH,sulphates,alcohol,quality,color + 7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,red + 7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5,red + 7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5,red + 11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6,white + """.replace(" ", "") + X_train = pandas.read_csv(StringIO(text)) + for c in X_train.columns: + if c != 'color': + X_train[c] = X_train[c].astype( # pylint: disable=E1136,E1137 + numpy.float32) + numeric_features = [c for c in X_train if c != 'color'] + + if case == 1: + pipe = Pipeline([ + ("prep", ColumnTransformer([ + ("color", Pipeline([ + ('one', OneHotEncoder(sparse=False)), + ]), ['color']), + ("others", "passthrough", numeric_features) + ])), + ]) + elif case == 2: + pipe = Pipeline([ + ("prep", ColumnTransformer([ + ("color", Pipeline([ + ('one', OneHotEncoder(sparse=False)), + ('select', ColumnTransformer( + [('sel1', "passthrough", [0])])) + ]), ['color']), + ("others", "passthrough", numeric_features) + ])), + ]) + elif case == 3: + pipe = Pipeline([ + ("prep", ColumnTransformer([ + ("colorord", OrdinalEncoder(), ['color']), + ("others", "passthrough", numeric_features) + ])), + ]) + elif case == 4: + pipe = Pipeline([ + ("prep", ColumnTransformer([ + ("color", Pipeline([ + ('one', OneHotEncoder(sparse=False)), + ('select', ColumnTransformer( + [('sel1', "passthrough", [0])])) + ]), ['color']), + ("colorord", OrdinalEncoder(), ['color']), + ("others", "passthrough", numeric_features) + ])), + ]) + else: + raise NotImplementedError() + + if cat: + X_train['color'] = X_train['color'].astype( # pylint: disable=E1136,E1137 + 'category') + schema = guess_schema_from_data(X_train) + if isinstance(schema[-1][-1], Int64TensorType): + raise AssertionError( + "Issue with type of last column %r: %r." % ( + schema[-1], X_train.dtypes[-1])) # pylint: disable=E1101 + + pipe.fit(X_train) + model_onnx = to_onnx(pipe, X_train) + try: + oinf = OnnxInference(model_onnx) + except RuntimeError as e: + raise RuntimeError("Fails for case={}\n{}".format( + case, e)) from e + + pred = pipe.transform(X_train) + inputs = get_inputs_from_data(X_train) + onxp = oinf.run(inputs) + got = onxp['transformed_column'] + self.assertEqualArray(pred, got) + + +if __name__ == "__main__": + unittest.main() diff --git a/_unittests/ut_onnx_conv/test_onnxrt_runtime_xgboost.py b/_unittests/ut_onnx_conv/test_onnxrt_runtime_xgboost.py index 104fa1e19..c0446daed 100644 --- a/_unittests/ut_onnx_conv/test_onnxrt_runtime_xgboost.py +++ b/_unittests/ut_onnx_conv/test_onnxrt_runtime_xgboost.py @@ -61,7 +61,7 @@ def setUp(self): @skipif_circleci('stuck') def test_onnxrt_python_xgbregressor(self): nb_tests = 0 - for objective in obj_classes: + for objective in obj_classes: # pylint: disable=C0206 for n_estimators in [1, 2]: with self.subTest(objective=objective, n_estimators=n_estimators): probs = [] diff --git a/_unittests/ut_onnxrt/test_onnxrt_python_runtime_ml.py b/_unittests/ut_onnxrt/test_onnxrt_python_runtime_ml.py index 6bbf79514..3fda34185 100644 --- a/_unittests/ut_onnxrt/test_onnxrt_python_runtime_ml.py +++ b/_unittests/ut_onnxrt/test_onnxrt_python_runtime_ml.py @@ -2,6 +2,7 @@ @brief test log(time=2s) """ import unittest +import warnings from logging import getLogger import numpy import pandas diff --git a/_unittests/ut_onnxrt/test_onnxrt_validate_type.py b/_unittests/ut_onnxrt/test_onnxrt_validate_type.py index 9ab88a1f3..f04de8e64 100644 --- a/_unittests/ut_onnxrt/test_onnxrt_validate_type.py +++ b/_unittests/ut_onnxrt/test_onnxrt_validate_type.py @@ -56,7 +56,7 @@ def filter_scenario(m, p, o, e, e2): logger = getLogger('skl2onnx') logger.disabled = True - subname = str(dtype).split('.')[-1].strip("'><") + subname = str(dtype).rsplit('.', maxsplit=1)[-1].strip("'><") temp = get_temp_folder( __file__, "temp_validate_sklearn_operators_" + subname) nb = 60 diff --git a/mlprodict/asv_benchmark/_create_asv_helper.py b/mlprodict/asv_benchmark/_create_asv_helper.py index d989d0a28..27a8a511d 100644 --- a/mlprodict/asv_benchmark/_create_asv_helper.py +++ b/mlprodict/asv_benchmark/_create_asv_helper.py @@ -1,540 +1,541 @@ -""" -@file Functions to creates a benchmark based on :epkg:`asv` -for many regressors and classifiers. -""" -import os -import textwrap -import hashlib -try: - from ..onnx_tools.optim.sklearn_helper import set_n_jobs -except (ValueError, ImportError): # pragma: no cover - from mlprodict.onnx_tools.optim.sklearn_helper import set_n_jobs - -# exec function does not import models but potentially -# requires all specific models used to defines scenarios -try: - from ..onnxrt.validate.validate_scenarios import * # pylint: disable=W0614,W0401 -except (ValueError, ImportError): # pragma: no cover - # Skips this step if used in a benchmark. - pass - - -default_asv_conf = { - "version": 1, - "project": "mlprodict", - "project_url": "http://www.xavierdupre.fr/app/mlprodict/helpsphinx/index.html", - "repo": "https://github.com/sdpython/mlprodict.git", - "repo_subdir": "", - "install_command": ["python -mpip install {wheel_file}"], - "uninstall_command": ["return-code=any python -mpip uninstall -y {project}"], - "build_command": [ - "python setup.py build", - "PIP_NO_BUILD_ISOLATION=false python -mpip wheel --no-deps --no-index -w {build_cache_dir} {build_dir}" - ], - "branches": ["master"], - "environment_type": "virtualenv", - "install_timeout": 600, - "show_commit_url": "https://github.com/sdpython/mlprodict/commit/", - # "pythons": ["__PYVER__"], - "matrix": { - "cython": [], - "jinja2": [], - "joblib": [], - "lightgbm": [], - "mlinsights": [], - "numpy": [], - "onnx": ["http://localhost:8067/simple/"], - "onnxruntime": ["http://localhost:8067/simple/"], - "pandas": [], - "Pillow": [], - "pybind11": [], - "pyquickhelper": [], - "scipy": [], - # "git+https://github.com/xadupre/onnxconverter-common.git@jenkins"], - "onnxconverter-common": ["http://localhost:8067/simple/"], - # "git+https://github.com/xadupre/sklearn-onnx.git@jenkins"], - "skl2onnx": ["http://localhost:8067/simple/"], - # "git+https://github.com/scikit-learn/scikit-learn.git"], - "scikit-learn": ["http://localhost:8067/simple/"], - "xgboost": [], - }, - "benchmark_dir": "benches", - "env_dir": "env", - "results_dir": "results", - "html_dir": "html", -} - -flask_helper = """ -''' -Local ASV files do no properly render in a browser, -it needs to be served through a server. -''' -import os.path -from flask import Flask, Response - -app = Flask(__name__) -app.config.from_object(__name__) - - -def root_dir(): - return os.path.join(os.path.abspath(os.path.dirname(__file__)), "..", "html") - - -def get_file(filename): # pragma: no cover - try: - src = os.path.join(root_dir(), filename) - with open(src, "r", encoding="utf-8", errors="ignore") as f: - return f.read() - except IOError as exc: - return str(exc) - - -@app.route('/', methods=['GET']) -def mainpage(): - content = get_file('index.html') - return Response(content, mimetype="text/html") - - -@app.route('/', defaults={'path': ''}) -@app.route('/') -def get_resource(path): # pragma: no cover - mimetypes = { - ".css": "text/css", - ".html": "text/html", - ".js": "application/javascript", - } - complete_path = os.path.join(root_dir(), path) - ext = os.path.splitext(path)[1] - mimetype = mimetypes.get(ext, "text/html") - content = get_file(complete_path) - return Response(content, mimetype=mimetype) - - -if __name__ == '__main__': # pragma: no cover - app.run( # ssl_context=('cert.pem', 'key.pem'), - port=8877, - # host="", - ) -""" - -pyspy_template = """ -import sys -sys.path.append(r"__PATH__") -from __PYFOLD__ import __CLASSNAME__ -import time -from datetime import datetime - - -def start(): - cl = __CLASSNAME__() - cl.setup_cache() - return cl - - -def profile0(iter, cl, runtime, N, nf, opset, dtype, optim): - begin = time.perf_counter() - for i in range(0, 100): - cl.time_predict(runtime, N, nf, opset, dtype, optim) - duration = time.perf_counter() - begin - iter = max(100, int(25 / duration * 100)) # 25 seconds - return iter - - -def setup_profile0(iter, cl, runtime, N, nf, opset, dtype, optim): - cl.setup(runtime, N, nf, opset, dtype, optim) - return profile0(iter, cl, runtime, N, nf, opset, dtype, optim) - - -def profile(iter, cl, runtime, N, nf, opset, dtype, optim): - for i in range(iter): - cl.time_predict(runtime, N, nf, opset, dtype, optim) - return iter - - -def setup_profile(iter, cl, runtime, N, nf, opset, dtype, optim): - cl.setup(runtime, N, nf, opset, dtype, optim) - return profile(iter, cl, runtime, N, nf, opset, dtype, optim) - - -cl = start() -iter = None -print(datetime.now(), "begin") -""" - - -def _sklearn_subfolder(model): - """ - Returns the list of subfolders for a model. - """ - mod = model.__module__ - if mod is not None and mod.startswith('mlinsights'): - return ['mlinsights', model.__name__] # pragma: no cover - spl = mod.split('.') - try: - pos = spl.index('sklearn') - except ValueError as e: # pragma: no cover - raise ValueError( - "Unable to find 'sklearn' in '{}'.".format(mod)) from e - res = spl[pos + 1: -1] - if len(res) == 0: - if spl[-1] == 'sklearn': - res = ['_externals'] - elif spl[0] == 'sklearn': - res = spl[pos + 1:] - else: - raise ValueError( # pragma: no cover - "Unable to guess subfolder for '{}'.".format(model.__class__)) - res.append(model.__name__) - return res - - -def _handle_init_files(model, flat, location, verbose, location_pyspy, fLOG): - "Returns created, location_model, prefix_import." - if flat: - return ([], location, ".", - (None if location_pyspy is None else location_pyspy)) - - created = [] - subf = _sklearn_subfolder(model) - subf = [_ for _ in subf if _[0] != '_' or _ == '_externals'] - location_model = os.path.join(location, *subf) - prefix_import = "." * (len(subf) + 1) - if not os.path.exists(location_model): - os.makedirs(location_model) - for fold in [location_model, os.path.dirname(location_model), - os.path.dirname(os.path.dirname(location_model))]: - init = os.path.join(fold, '__init__.py') - if not os.path.exists(init): - with open(init, 'w') as _: - pass - created.append(init) - if verbose > 1 and fLOG is not None: - fLOG("[create_asv_benchmark] create '{}'.".format(init)) - if location_pyspy is not None: - location_pyspy_model = os.path.join(location_pyspy, *subf) - if not os.path.exists(location_pyspy_model): - os.makedirs(location_pyspy_model) - else: - location_pyspy_model = None - - return created, location_model, prefix_import, location_pyspy_model - - -def _asv_class_name(model, scenario, optimisation, - extra, dofit, conv_options, problem, - shorten=True): - - def clean_str(val): - s = str(val) - r = "" - for c in s: - if c in ",-\n": - r += "_" - continue - if c in ": =.+()[]{}\"'<>~": - continue - r += c - for k, v in {'n_estimators': 'nest', - 'max_iter': 'mxit'}.items(): - r = r.replace(k, v) - return r - - def clean_str_list(val): - if val is None: - return "" # pragma: no cover - if isinstance(val, list): - return ".".join( # pragma: no cover - clean_str_list(v) for v in val if v) - return clean_str(val) - - els = ['bench', model.__name__, scenario, clean_str(problem)] - if not dofit: - els.append('nofit') - if extra: - if 'random_state' in extra and extra['random_state'] == 42: - extra2 = extra.copy() - del extra2['random_state'] - if extra2: - els.append(clean_str(extra2)) - else: - els.append(clean_str(extra)) - if optimisation: - els.append(clean_str_list(optimisation)) - if conv_options: - els.append(clean_str_list(conv_options)) - res = ".".join(els).replace("-", "_") - - if shorten: - rep = { - 'ConstantKernel': 'Cst', - 'DotProduct': 'Dot', - 'Exponentiation': 'Exp', - 'ExpSineSquared': 'ExpS2', - 'GaussianProcess': 'GaussProc', - 'GaussianMixture': 'GaussMixt', - 'HistGradientBoosting': 'HGB', - 'LinearRegression': 'LinReg', - 'LogisticRegression': 'LogReg', - 'MultiOutput': 'MultOut', - 'OrthogonalMatchingPursuit': 'OrthMatchPurs', - 'PairWiseKernel': 'PW', - 'Product': 'Prod', - 'RationalQuadratic': 'RQ', - 'WhiteKernel': 'WK', - 'length_scale': 'ls', - 'periodicity': 'pcy', - } - for k, v in rep.items(): - res = res.replace(k, v) - - rep = { - 'Classifier': 'Clas', - 'Regressor': 'Reg', - 'KNeighbors': 'KNN', - 'NearestNeighbors': 'kNN', - 'RadiusNeighbors': 'RadNN', - } - for k, v in rep.items(): - res = res.replace(k, v) - - if len(res) > 70: # shorten filename - m = hashlib.sha256() - m.update(res.encode('utf-8')) - sh = m.hexdigest() - if len(sh) > 6: - sh = sh[:6] - res = res[:70] + sh - return res - - -def _read_patterns(): - """ - Reads the testing pattern. - """ - # Reads the template - patterns = {} - for suffix in ['classifier', 'classifier_raw_scores', 'regressor', 'clustering', - 'outlier', 'trainable_transform', 'transform', - 'multi_classifier', 'transform_positive']: - template_name = os.path.join(os.path.dirname( - __file__), "template", "skl_model_%s.py" % suffix) - if not os.path.exists(template_name): - raise FileNotFoundError( # pragma: no cover - "Template '{}' was not found.".format(template_name)) - with open(template_name, "r", encoding="utf-8") as f: - content = f.read() - initial_content = '"""'.join(content.split('"""')[2:]) - patterns[suffix] = initial_content - return patterns - - -def _select_pattern_problem(prob, patterns): - """ - Selects a benchmark type based on the problem kind. - """ - if '-reg' in prob: - return patterns['regressor'] - if '-cl' in prob and '-dec' in prob: - return patterns['classifier_raw_scores'] - if '-cl' in prob: - return patterns['classifier'] - if 'cluster' in prob: - return patterns['clustering'] - if 'outlier' in prob: - return patterns['outlier'] - if 'num+y-tr' in prob: - return patterns['trainable_transform'] - if 'num-tr-pos' in prob: - return patterns['transform_positive'] - if 'num-tr' in prob: - return patterns['transform'] - if 'm-label' in prob: - return patterns['multi_classifier'] - raise ValueError( # pragma: no cover - "Unable to guess the right pattern for '{}'.".format(prob)) - - -def _display_code_lines(code): - rows = ["%03d %s" % (i + 1, line) - for i, line in enumerate(code.split("\n"))] - return "\n".join(rows) - - -def _format_dict(opts, indent): - """ - Formats a dictionary as code. - """ - rows = [] - for k, v in sorted(opts.items()): - rows.append('%s=%r' % (k, v)) - content = ', '.join(rows) - st1 = "\n".join(textwrap.wrap(content)) - return textwrap.indent(st1, prefix=' ' * indent) - - -def _additional_imports(model_name): - """ - Adds additional imports for experimental models. - """ - if model_name == 'IterativeImputer': - return ["from sklearn.experimental import enable_iterative_imputer # pylint: disable=W0611"] - if model_name in ('HistGradientBoostingClassifier', 'HistGradientBoostingClassifier'): - return ["from sklearn.experimental import enable_hist_gradient_boosting # pylint: disable=W0611"] - return None - - -def add_model_import_init( - class_content, model, optimisation=None, - extra=None, conv_options=None): - """ - Modifies a template such as @see cl TemplateBenchmarkClassifier - with code associated to the model *model*. - - @param class_content template (as a string) - @param model model class - @param optimisation model optimisation - @param extra addition parameter to the constructor - @param conv_options options for the conversion to ONNX - @returm modified template - """ - add_imports = [] - add_methods = [] - add_params = ["par_modelname = '%s'" % model.__name__, - "par_extra = %r" % extra] - - # additional methods and imports - if optimisation is not None: - add_imports.append( - 'from mlprodict.onnx_tools.optim import onnx_optimisations') - if optimisation == 'onnx': - add_methods.append(textwrap.dedent(''' - def _optimize_onnx(self, onx): - return onnx_optimisations(onx)''')) - add_params.append('par_optimonnx = True') - elif isinstance(optimisation, dict): - add_methods.append(textwrap.dedent(''' - def _optimize_onnx(self, onx): - return onnx_optimisations(onx, self.par_optims)''')) - add_params.append('par_optims = {}'.format( - _format_dict(optimisation, indent=4))) - else: - raise ValueError( # pragma: no cover - "Unable to interpret optimisation {}.".format(optimisation)) - - # look for import place - lines = class_content.split('\n') - keep = None - for pos, line in enumerate(lines): - if "# Import specific to this model." in line: - keep = pos - break - if keep is None: - raise RuntimeError( # pragma: no cover - "Unable to locate where to insert import in\n{}\n".format( - class_content)) - - # imports - loc_class = model.__module__ - sub = loc_class.split('.') - if 'sklearn' not in sub: - mod = loc_class - else: - skl = sub.index('sklearn') - if skl == 0: - if sub[-1].startswith("_"): - mod = '.'.join(sub[skl:-1]) - else: - mod = '.'.join(sub[skl:]) - else: - mod = '.'.join(sub[:-1]) - - exp_imports = _additional_imports(model.__name__) - if exp_imports: - add_imports.extend(exp_imports) - imp_inst = ( - "try:\n from {0} import {1}\nexcept ImportError:\n {1} = None" - "".format(mod, model.__name__)) - add_imports.append(imp_inst) - add_imports.append("# __IMPORTS__") - lines[keep + 1] = "\n".join(add_imports) - content = "\n".join(lines) - - # _create_model - content = content.split('def _create_model(self):')[0].strip(' \n') - lines = [content, "", " def _create_model(self):"] - if extra is not None and len(extra) > 0: - lines.append(" return {}(".format(model.__name__)) - lines.append(_format_dict(set_n_jobs(model, extra), 12)) - lines.append(" )") - else: - lines.append(" return {}()".format(model.__name__)) - lines.append("") - - # methods - for meth in add_methods: - lines.append(textwrap.indent(meth, ' ')) - lines.append('') - - # end - return "\n".join(lines), add_params - - -def find_missing_sklearn_imports(pieces): - """ - Finds in :epkg:`scikit-learn` the missing pieces. - - @param pieces list of names in scikit-learn - @return list of corresponding imports - """ - res = {} - for piece in pieces: - mod = find_sklearn_module(piece) - if mod not in res: - res[mod] = [] - res[mod].append(piece) - - lines = [] - for k, v in res.items(): - lines.append("from {} import {}".format( - k, ", ".join(sorted(v)))) - return lines - - -def find_sklearn_module(piece): - """ - Finds the corresponding modulee for an element of :epkg:`scikit-learn`. - - @param piece name to import - @return module name - - The implementation is not intelligence and should - be improved. It is a kind of white list. - """ - glo = globals() - if piece in {'LinearRegression', 'LogisticRegression', - 'SGDClassifier'}: - import sklearn.linear_model - glo[piece] = getattr(sklearn.linear_model, piece) - return "sklearn.linear_model" - if piece in {'DecisionTreeRegressor', 'DecisionTreeClassifier'}: - import sklearn.tree - glo[piece] = getattr(sklearn.tree, piece) - return "sklearn.tree" - if piece in {'ExpSineSquared', 'DotProduct', 'RationalQuadratic', 'RBF'}: - import sklearn.gaussian_process.kernels - glo[piece] = getattr(sklearn.gaussian_process.kernels, piece) - return "sklearn.gaussian_process.kernels" - if piece in {'LinearSVC', 'LinearSVR', 'NuSVR', 'SVR', 'SVC', 'NuSVC'}: # pragma: no cover - import sklearn.svm - glo[piece] = getattr(sklearn.svm, piece) - return "sklearn.svm" - if piece in {'KMeans'}: # pragma: no cover - import sklearn.cluster - glo[piece] = getattr(sklearn.cluster, piece) - return "sklearn.cluster" - if piece in {'OneVsRestClassifier', 'OneVsOneClassifier'}: # pragma: no cover - import sklearn.multiclass - glo[piece] = getattr(sklearn.multiclass, piece) - return "sklearn.multiclass" - raise ValueError( # pragma: no cover - "Unable to find module to import for '{}'.".format(piece)) +""" +@file Functions to creates a benchmark based on :epkg:`asv` +for many regressors and classifiers. +""" +import os +import textwrap +import hashlib +try: + from ..onnx_tools.optim.sklearn_helper import set_n_jobs +except (ValueError, ImportError): # pragma: no cover + from mlprodict.onnx_tools.optim.sklearn_helper import set_n_jobs + +# exec function does not import models but potentially +# requires all specific models used to defines scenarios +try: + from ..onnxrt.validate.validate_scenarios import * # pylint: disable=W0614,W0401 +except (ValueError, ImportError): # pragma: no cover + # Skips this step if used in a benchmark. + pass + + +default_asv_conf = { + "version": 1, + "project": "mlprodict", + "project_url": "http://www.xavierdupre.fr/app/mlprodict/helpsphinx/index.html", + "repo": "https://github.com/sdpython/mlprodict.git", + "repo_subdir": "", + "install_command": ["python -mpip install {wheel_file}"], + "uninstall_command": ["return-code=any python -mpip uninstall -y {project}"], + "build_command": [ + "python setup.py build", + "PIP_NO_BUILD_ISOLATION=false python -mpip wheel --no-deps --no-index -w {build_cache_dir} {build_dir}" + ], + "branches": ["master"], + "environment_type": "virtualenv", + "install_timeout": 600, + "show_commit_url": "https://github.com/sdpython/mlprodict/commit/", + # "pythons": ["__PYVER__"], + "matrix": { + "cython": [], + "jinja2": [], + "joblib": [], + "lightgbm": [], + "mlinsights": [], + "numpy": [], + "onnx": ["http://localhost:8067/simple/"], + "onnxruntime": ["http://localhost:8067/simple/"], + "pandas": [], + "Pillow": [], + "pybind11": [], + "pyquickhelper": [], + "scipy": [], + # "git+https://github.com/xadupre/onnxconverter-common.git@jenkins"], + "onnxconverter-common": ["http://localhost:8067/simple/"], + # "git+https://github.com/xadupre/sklearn-onnx.git@jenkins"], + "skl2onnx": ["http://localhost:8067/simple/"], + # "git+https://github.com/scikit-learn/scikit-learn.git"], + "scikit-learn": ["http://localhost:8067/simple/"], + "xgboost": [], + }, + "benchmark_dir": "benches", + "env_dir": "env", + "results_dir": "results", + "html_dir": "html", +} + +flask_helper = """ +''' +Local ASV files do no properly render in a browser, +it needs to be served through a server. +''' +import os.path +from flask import Flask, Response + +app = Flask(__name__) +app.config.from_object(__name__) + + +def root_dir(): + return os.path.join(os.path.abspath(os.path.dirname(__file__)), "..", "html") + + +def get_file(filename): # pragma: no cover + try: + src = os.path.join(root_dir(), filename) + with open(src, "r", encoding="utf-8", errors="ignore") as f: + return f.read() + except IOError as exc: + return str(exc) + + +@app.route('/', methods=['GET']) +def mainpage(): + content = get_file('index.html') + return Response(content, mimetype="text/html") + + +@app.route('/', defaults={'path': ''}) +@app.route('/') +def get_resource(path): # pragma: no cover + mimetypes = { + ".css": "text/css", + ".html": "text/html", + ".js": "application/javascript", + } + complete_path = os.path.join(root_dir(), path) + ext = os.path.splitext(path)[1] + mimetype = mimetypes.get(ext, "text/html") + content = get_file(complete_path) + return Response(content, mimetype=mimetype) + + +if __name__ == '__main__': # pragma: no cover + app.run( # ssl_context=('cert.pem', 'key.pem'), + port=8877, + # host="", + ) +""" + +pyspy_template = """ +import sys +sys.path.append(r"__PATH__") +from __PYFOLD__ import __CLASSNAME__ +import time +from datetime import datetime + + +def start(): + cl = __CLASSNAME__() + cl.setup_cache() + return cl + + +def profile0(iter, cl, runtime, N, nf, opset, dtype, optim): + begin = time.perf_counter() + for i in range(0, 100): + cl.time_predict(runtime, N, nf, opset, dtype, optim) + duration = time.perf_counter() - begin + iter = max(100, int(25 / duration * 100)) # 25 seconds + return iter + + +def setup_profile0(iter, cl, runtime, N, nf, opset, dtype, optim): + cl.setup(runtime, N, nf, opset, dtype, optim) + return profile0(iter, cl, runtime, N, nf, opset, dtype, optim) + + +def profile(iter, cl, runtime, N, nf, opset, dtype, optim): + for i in range(iter): + cl.time_predict(runtime, N, nf, opset, dtype, optim) + return iter + + +def setup_profile(iter, cl, runtime, N, nf, opset, dtype, optim): + cl.setup(runtime, N, nf, opset, dtype, optim) + return profile(iter, cl, runtime, N, nf, opset, dtype, optim) + + +cl = start() +iter = None +print(datetime.now(), "begin") +""" + + +def _sklearn_subfolder(model): + """ + Returns the list of subfolders for a model. + """ + mod = model.__module__ + if mod is not None and mod.startswith('mlinsights'): + return ['mlinsights', model.__name__] # pragma: no cover + spl = mod.split('.') + try: + pos = spl.index('sklearn') + except ValueError as e: # pragma: no cover + raise ValueError( + "Unable to find 'sklearn' in '{}'.".format(mod)) from e + res = spl[pos + 1: -1] + if len(res) == 0: + if spl[-1] == 'sklearn': + res = ['_externals'] + elif spl[0] == 'sklearn': + res = spl[pos + 1:] + else: + raise ValueError( # pragma: no cover + "Unable to guess subfolder for '{}'.".format(model.__class__)) + res.append(model.__name__) + return res + + +def _handle_init_files(model, flat, location, verbose, location_pyspy, fLOG): + "Returns created, location_model, prefix_import." + if flat: + return ([], location, ".", + (None if location_pyspy is None else location_pyspy)) + + created = [] + subf = _sklearn_subfolder(model) + subf = [_ for _ in subf if _[0] != '_' or _ == '_externals'] + location_model = os.path.join(location, *subf) + prefix_import = "." * (len(subf) + 1) + if not os.path.exists(location_model): + os.makedirs(location_model) + for fold in [location_model, os.path.dirname(location_model), + os.path.dirname(os.path.dirname(location_model))]: + init = os.path.join(fold, '__init__.py') + if not os.path.exists(init): + with open(init, 'w') as _: + pass + created.append(init) + if verbose > 1 and fLOG is not None: + fLOG("[create_asv_benchmark] create '{}'.".format(init)) + if location_pyspy is not None: + location_pyspy_model = os.path.join(location_pyspy, *subf) + if not os.path.exists(location_pyspy_model): + os.makedirs(location_pyspy_model) + else: + location_pyspy_model = None + + return created, location_model, prefix_import, location_pyspy_model + + +def _asv_class_name(model, scenario, optimisation, + extra, dofit, conv_options, problem, + shorten=True): + + def clean_str(val): + s = str(val) + r = "" + for c in s: + if c in ",-\n": + r += "_" + continue + if c in ": =.+()[]{}\"'<>~": + continue + r += c + for k, v in {'n_estimators': 'nest', + 'max_iter': 'mxit'}.items(): + r = r.replace(k, v) + return r + + def clean_str_list(val): + if val is None: + return "" # pragma: no cover + if isinstance(val, list): + return ".".join( # pragma: no cover + clean_str_list(v) for v in val if v) + return clean_str(val) + + els = ['bench', model.__name__, scenario, clean_str(problem)] + if not dofit: + els.append('nofit') + if extra: + if 'random_state' in extra and extra['random_state'] == 42: + extra2 = extra.copy() + del extra2['random_state'] + if extra2: + els.append(clean_str(extra2)) + else: + els.append(clean_str(extra)) + if optimisation: + els.append(clean_str_list(optimisation)) + if conv_options: + els.append(clean_str_list(conv_options)) + res = ".".join(els).replace("-", "_") + + if shorten: + rep = { + 'ConstantKernel': 'Cst', + 'DotProduct': 'Dot', + 'Exponentiation': 'Exp', + 'ExpSineSquared': 'ExpS2', + 'GaussianProcess': 'GaussProc', + 'GaussianMixture': 'GaussMixt', + 'HistGradientBoosting': 'HGB', + 'LinearRegression': 'LinReg', + 'LogisticRegression': 'LogReg', + 'MultiOutput': 'MultOut', + 'OrthogonalMatchingPursuit': 'OrthMatchPurs', + 'PairWiseKernel': 'PW', + 'Product': 'Prod', + 'RationalQuadratic': 'RQ', + 'WhiteKernel': 'WK', + 'length_scale': 'ls', + 'periodicity': 'pcy', + } + for k, v in rep.items(): + res = res.replace(k, v) + + rep = { + 'Classifier': 'Clas', + 'Regressor': 'Reg', + 'KNeighbors': 'KNN', + 'NearestNeighbors': 'kNN', + 'RadiusNeighbors': 'RadNN', + } + for k, v in rep.items(): + res = res.replace(k, v) + + if len(res) > 70: # shorten filename + m = hashlib.sha256() + m.update(res.encode('utf-8')) + sh = m.hexdigest() + if len(sh) > 6: + sh = sh[:6] + res = res[:70] + sh + return res + + +def _read_patterns(): + """ + Reads the testing pattern. + """ + # Reads the template + patterns = {} + for suffix in ['classifier', 'classifier_raw_scores', 'regressor', 'clustering', + 'outlier', 'trainable_transform', 'transform', + 'multi_classifier', 'transform_positive']: + template_name = os.path.join(os.path.dirname( + __file__), "template", "skl_model_%s.py" % suffix) + if not os.path.exists(template_name): + raise FileNotFoundError( # pragma: no cover + "Template '{}' was not found.".format(template_name)) + with open(template_name, "r", encoding="utf-8") as f: + content = f.read() + initial_content = '"""'.join(content.split('"""')[2:]) + patterns[suffix] = initial_content + return patterns + + +def _select_pattern_problem(prob, patterns): + """ + Selects a benchmark type based on the problem kind. + """ + if '-reg' in prob: + return patterns['regressor'] + if '-cl' in prob and '-dec' in prob: + return patterns['classifier_raw_scores'] + if '-cl' in prob: + return patterns['classifier'] + if 'cluster' in prob: + return patterns['clustering'] + if 'outlier' in prob: + return patterns['outlier'] + if 'num+y-tr' in prob: + return patterns['trainable_transform'] + if 'num-tr-pos' in prob: + return patterns['transform_positive'] + if 'num-tr' in prob: + return patterns['transform'] + if 'm-label' in prob: + return patterns['multi_classifier'] + raise ValueError( # pragma: no cover + "Unable to guess the right pattern for '{}'.".format(prob)) + + +def _display_code_lines(code): + rows = ["%03d %s" % (i + 1, line) + for i, line in enumerate(code.split("\n"))] + return "\n".join(rows) + + +def _format_dict(opts, indent): + """ + Formats a dictionary as code. + """ + rows = [] + for k, v in sorted(opts.items()): + rows.append('%s=%r' % (k, v)) + content = ', '.join(rows) + st1 = "\n".join(textwrap.wrap(content)) + return textwrap.indent(st1, prefix=' ' * indent) + + +def _additional_imports(model_name): + """ + Adds additional imports for experimental models. + """ + if model_name == 'IterativeImputer': + return ["from sklearn.experimental import enable_iterative_imputer # pylint: disable=W0611"] + if model_name in ('HistGradientBoostingClassifier', 'HistGradientBoostingClassifier'): + return ["from sklearn.experimental import enable_hist_gradient_boosting # pylint: disable=W0611"] + return None + + +def add_model_import_init( + class_content, model, optimisation=None, + extra=None, conv_options=None): + """ + Modifies a template such as @see cl TemplateBenchmarkClassifier + with code associated to the model *model*. + + @param class_content template (as a string) + @param model model class + @param optimisation model optimisation + @param extra addition parameter to the constructor + @param conv_options options for the conversion to ONNX + @returm modified template + """ + add_imports = [] + add_methods = [] + add_params = ["par_modelname = '%s'" % model.__name__, + "par_extra = %r" % extra] + + # additional methods and imports + if optimisation is not None: + add_imports.append( + 'from mlprodict.onnx_tools.optim import onnx_optimisations') + if optimisation == 'onnx': + add_methods.append(textwrap.dedent(''' + def _optimize_onnx(self, onx): + return onnx_optimisations(onx)''')) + add_params.append('par_optimonnx = True') + elif isinstance(optimisation, dict): + add_methods.append(textwrap.dedent(''' + def _optimize_onnx(self, onx): + return onnx_optimisations(onx, self.par_optims)''')) + add_params.append('par_optims = {}'.format( + _format_dict(optimisation, indent=4))) + else: + raise ValueError( # pragma: no cover + "Unable to interpret optimisation {}.".format(optimisation)) + + # look for import place + lines = class_content.split('\n') + keep = None + for pos, line in enumerate(lines): + if "# Import specific to this model." in line: + keep = pos + break + if keep is None: + raise RuntimeError( # pragma: no cover + "Unable to locate where to insert import in\n{}\n".format( + class_content)) + + # imports + loc_class = model.__module__ + sub = loc_class.split('.') + if 'sklearn' not in sub: + mod = loc_class + else: + skl = sub.index('sklearn') + if skl == 0: + if sub[-1].startswith("_"): + mod = '.'.join(sub[skl:-1]) + else: + mod = '.'.join(sub[skl:]) + else: + mod = '.'.join(sub[:-1]) + + exp_imports = _additional_imports(model.__name__) + if exp_imports: + add_imports.extend(exp_imports) + imp_inst = ( + "try:\n from {0} import {1}\nexcept ImportError:\n {1} = None" + "".format(mod, model.__name__)) + add_imports.append(imp_inst) + add_imports.append("# __IMPORTS__") + lines[keep + 1] = "\n".join(add_imports) + content = "\n".join(lines) + + # _create_model + content = content.split('def _create_model(self):', + maxsplit=1)[0].strip(' \n') + lines = [content, "", " def _create_model(self):"] + if extra is not None and len(extra) > 0: + lines.append(" return {}(".format(model.__name__)) + lines.append(_format_dict(set_n_jobs(model, extra), 12)) + lines.append(" )") + else: + lines.append(" return {}()".format(model.__name__)) + lines.append("") + + # methods + for meth in add_methods: + lines.append(textwrap.indent(meth, ' ')) + lines.append('') + + # end + return "\n".join(lines), add_params + + +def find_missing_sklearn_imports(pieces): + """ + Finds in :epkg:`scikit-learn` the missing pieces. + + @param pieces list of names in scikit-learn + @return list of corresponding imports + """ + res = {} + for piece in pieces: + mod = find_sklearn_module(piece) + if mod not in res: + res[mod] = [] + res[mod].append(piece) + + lines = [] + for k, v in res.items(): + lines.append("from {} import {}".format( + k, ", ".join(sorted(v)))) + return lines + + +def find_sklearn_module(piece): + """ + Finds the corresponding modulee for an element of :epkg:`scikit-learn`. + + @param piece name to import + @return module name + + The implementation is not intelligence and should + be improved. It is a kind of white list. + """ + glo = globals() + if piece in {'LinearRegression', 'LogisticRegression', + 'SGDClassifier'}: + import sklearn.linear_model + glo[piece] = getattr(sklearn.linear_model, piece) + return "sklearn.linear_model" + if piece in {'DecisionTreeRegressor', 'DecisionTreeClassifier'}: + import sklearn.tree + glo[piece] = getattr(sklearn.tree, piece) + return "sklearn.tree" + if piece in {'ExpSineSquared', 'DotProduct', 'RationalQuadratic', 'RBF'}: + import sklearn.gaussian_process.kernels + glo[piece] = getattr(sklearn.gaussian_process.kernels, piece) + return "sklearn.gaussian_process.kernels" + if piece in {'LinearSVC', 'LinearSVR', 'NuSVR', 'SVR', 'SVC', 'NuSVC'}: # pragma: no cover + import sklearn.svm + glo[piece] = getattr(sklearn.svm, piece) + return "sklearn.svm" + if piece in {'KMeans'}: # pragma: no cover + import sklearn.cluster + glo[piece] = getattr(sklearn.cluster, piece) + return "sklearn.cluster" + if piece in {'OneVsRestClassifier', 'OneVsOneClassifier'}: # pragma: no cover + import sklearn.multiclass + glo[piece] = getattr(sklearn.multiclass, piece) + return "sklearn.multiclass" + raise ValueError( # pragma: no cover + "Unable to find module to import for '{}'.".format(piece)) diff --git a/mlprodict/cli/convert_validate.py b/mlprodict/cli/convert_validate.py index 2891c7eac..4c9068c31 100644 --- a/mlprodict/cli/convert_validate.py +++ b/mlprodict/cli/convert_validate.py @@ -160,7 +160,7 @@ def convert_validate(pkl, data=None, schema=None, schema = [ # pragma: no cover ('X', tensor_type([None, df.shape[1]]))] if len(schema) == 1: - df = df.values + df = df.values # pylint: disable=E1101 if verbose > 0: fLOG("[convert_validate] data schema={}".format(schema)) diff --git a/mlprodict/npy/onnx_numpy_annotation.py b/mlprodict/npy/onnx_numpy_annotation.py index 056ad94dd..803ae79b8 100644 --- a/mlprodict/npy/onnx_numpy_annotation.py +++ b/mlprodict/npy/onnx_numpy_annotation.py @@ -80,7 +80,7 @@ class ShapeType: def __init__(self, params): self.__args__ = params - def __class_getitem__(cls, params): # pylint: disable=W0221 + def __class_getitem__(cls, params): # pylint: disable=W0221,W0237 "Overwrites this method." if not isinstance(params, tuple): params = (params,) # pragma: no cover diff --git a/mlprodict/onnx_tools/optim/onnx_optimisation.py b/mlprodict/onnx_tools/optim/onnx_optimisation.py index 745aa009a..1a5079e0d 100644 --- a/mlprodict/onnx_tools/optim/onnx_optimisation.py +++ b/mlprodict/onnx_tools/optim/onnx_optimisation.py @@ -1,42 +1,43 @@ -""" -@file -@brief Optimisations of :epkg:`ONNX` graphs. -""" -from ._onnx_optimisation_common import _apply_optimisation_on_graph -from .onnx_optimisation_identity import onnx_remove_node_identity -from .onnx_optimisation_redundant import onnx_remove_node_redundant -from .onnx_optimisation_unused import onnx_remove_node_unused - - -def onnx_remove_node(onnx_model, recursive=True, debug_info=None, **options): - """ - Removes as many nodes as possible without changing - the outcome. It applies @see fn onnx_remove_node_identity, - then @see fn onnx_remove_node_redundant. - - @param onnx_model onnx model - @param recursive looks into subgraphs - @param debug_info debug information (private) - @param options additional options - @return new onnx _model - """ - if debug_info is None: - debug_info = [str(type(onnx_model)).split('.')[-1].strip("'>")] - else: - debug_info = debug_info + \ - [str(type(onnx_model)).split('.')[-1].strip("'>")] - - if hasattr(onnx_model, 'graph'): - return _apply_optimisation_on_graph( - onnx_remove_node, onnx_model, - recursive=recursive, debug_info=debug_info, - **options) - - graph = onnx_model - graph = onnx_remove_node_unused( - graph, recursive=recursive, debug_info=debug_info, **options) - graph = onnx_remove_node_identity( - graph, recursive=recursive, debug_info=debug_info, **options) - graph = onnx_remove_node_redundant( - graph, recursive=recursive, debug_info=debug_info, **options) - return graph +""" +@file +@brief Optimisations of :epkg:`ONNX` graphs. +""" +from ._onnx_optimisation_common import _apply_optimisation_on_graph +from .onnx_optimisation_identity import onnx_remove_node_identity +from .onnx_optimisation_redundant import onnx_remove_node_redundant +from .onnx_optimisation_unused import onnx_remove_node_unused + + +def onnx_remove_node(onnx_model, recursive=True, debug_info=None, **options): + """ + Removes as many nodes as possible without changing + the outcome. It applies @see fn onnx_remove_node_identity, + then @see fn onnx_remove_node_redundant. + + @param onnx_model onnx model + @param recursive looks into subgraphs + @param debug_info debug information (private) + @param options additional options + @return new onnx _model + """ + if debug_info is None: + debug_info = [str(type(onnx_model)).rsplit( + '.', maxsplit=1)[-1].strip("'>")] + else: + debug_info = (debug_info + + [str(type(onnx_model)).rsplit('.', maxsplit=1)[-1].strip("'>")]) + + if hasattr(onnx_model, 'graph'): + return _apply_optimisation_on_graph( + onnx_remove_node, onnx_model, + recursive=recursive, debug_info=debug_info, + **options) + + graph = onnx_model + graph = onnx_remove_node_unused( + graph, recursive=recursive, debug_info=debug_info, **options) + graph = onnx_remove_node_identity( + graph, recursive=recursive, debug_info=debug_info, **options) + graph = onnx_remove_node_redundant( + graph, recursive=recursive, debug_info=debug_info, **options) + return graph diff --git a/mlprodict/onnx_tools/optim/onnx_optimisation_identity.py b/mlprodict/onnx_tools/optim/onnx_optimisation_identity.py index 4d52957bc..f721b0aaa 100644 --- a/mlprodict/onnx_tools/optim/onnx_optimisation_identity.py +++ b/mlprodict/onnx_tools/optim/onnx_optimisation_identity.py @@ -1,119 +1,120 @@ -""" -@file -@brief Optimisation of :epkg:`ONNX` graphs. -""" -from onnx.helper import make_graph -from ._onnx_optimisation_common import ( # pylint: disable=E0611 - _rename_node_input, - _rename_node_output, - _apply_optimisation_on_graph, - _apply_remove_node_fct_node -) - - -def onnx_remove_node_identity(onnx_model, recursive=True, debug_info=None, **options): - """ - Removes as many *Identity* nodes as possible. - The function looks into every node and subgraphs if - *recursive* is True for identity node. Unless such a - node directy connects one input to one output, it will - be removed and every other node gets its inputs or - outputs accordingly renamed. - - @param onnx_model onnx model - @param recursive looks into subgraphs - @param debug_info debug information (private) - @param options additional options (unused) - @return new onnx _model - """ - if debug_info is None: - debug_info = [str(type(onnx_model)).split('.')[-1].strip("'>")] - else: - debug_info = debug_info + \ - [str(type(onnx_model)).split('.')[-1].strip("'>")] - - if hasattr(onnx_model, 'graph'): - return _apply_optimisation_on_graph( - onnx_remove_node_identity, onnx_model, - recursive=recursive, debug_info=debug_info, **options) - - graph = onnx_model - - inputs = set(i.name for i in graph.input) - outputs = set(o.name for o in graph.output) - - def retrieve_idnodes(graph, existing_nodes): - idnodes = [] - for i, exnode in enumerate(existing_nodes): - if exnode is None: - continue - if exnode.op_type == 'Identity': - input = exnode.input[0] - output = exnode.output[0] - idnodes.append((i, exnode, input, output)) - return idnodes - - nodes = list(graph.node) - rem = 1 - while rem > 0: - rem = 0 - idnodes = retrieve_idnodes(graph, nodes) - restart = False - for i, _, inp, out in idnodes: - if restart: - break # pragma: no cover - if nodes[i] is None: - # Already removed. - continue # pragma: no cover - if inp in inputs and out in outputs: - # Cannot be removed. - continue - if not restart and out not in outputs: - # We cannot change an output name. - for j in range(len(nodes)): # pylint: disable=C0200 - if nodes[j] is None: - continue - if out in nodes[j].input: - nodes[j] = _rename_node_input(nodes[j], out, inp) - rem += 1 - if nodes[j].op_type == 'Identity': - restart = True # pragma: no cover - nodes[i] = None - rem += 1 - continue - if not restart and inp not in inputs and inp not in outputs: - # We cannot change an input name or an output name. - for j in range(len(nodes)): # pylint: disable=C0200 - if nodes[j] is None: - continue - if inp in nodes[j].output: - nodes[j] = _rename_node_output(nodes[j], inp, out) - rem += 1 - if nodes[j].op_type == 'Identity': - restart = True # pragma: no cover - if inp in nodes[j].input: - nodes[j] = _rename_node_input(nodes[j], inp, out) - rem += 1 - if nodes[j].op_type == 'Identity': - restart = True - nodes[i] = None - rem += 1 - - if recursive: - # Handles subgraphs. - for i in range(len(nodes)): # pylint: disable=C0200 - node = nodes[i] - if node is None or not (node.attribute): # pylint: disable=C0325 - continue - nodes[i] = _apply_remove_node_fct_node( - onnx_remove_node_identity, - node, recursive=True, debug_info=debug_info + [node.name]) - - # Finally create the new graph. - nodes = list(filter(lambda n: n is not None, nodes)) - graph = make_graph(nodes, onnx_model.name, - onnx_model.input, onnx_model.output, - onnx_model.initializer) - - graph.value_info.extend(onnx_model.value_info) # pylint: disable=E1101 - return graph +""" +@file +@brief Optimisation of :epkg:`ONNX` graphs. +""" +from onnx.helper import make_graph +from ._onnx_optimisation_common import ( # pylint: disable=E0611 + _rename_node_input, + _rename_node_output, + _apply_optimisation_on_graph, + _apply_remove_node_fct_node +) + + +def onnx_remove_node_identity(onnx_model, recursive=True, debug_info=None, **options): + """ + Removes as many *Identity* nodes as possible. + The function looks into every node and subgraphs if + *recursive* is True for identity node. Unless such a + node directy connects one input to one output, it will + be removed and every other node gets its inputs or + outputs accordingly renamed. + + @param onnx_model onnx model + @param recursive looks into subgraphs + @param debug_info debug information (private) + @param options additional options (unused) + @return new onnx _model + """ + if debug_info is None: + debug_info = [str(type(onnx_model)).rsplit( + '.', maxsplit=1)[-1].strip("'>")] + else: + debug_info = (debug_info + + [str(type(onnx_model)).rsplit('.', maxsplit=1)[-1].strip("'>")]) + + if hasattr(onnx_model, 'graph'): + return _apply_optimisation_on_graph( + onnx_remove_node_identity, onnx_model, + recursive=recursive, debug_info=debug_info, **options) + + graph = onnx_model + + inputs = set(i.name for i in graph.input) + outputs = set(o.name for o in graph.output) + + def retrieve_idnodes(graph, existing_nodes): + idnodes = [] + for i, exnode in enumerate(existing_nodes): + if exnode is None: + continue + if exnode.op_type == 'Identity': + input = exnode.input[0] + output = exnode.output[0] + idnodes.append((i, exnode, input, output)) + return idnodes + + nodes = list(graph.node) + rem = 1 + while rem > 0: + rem = 0 + idnodes = retrieve_idnodes(graph, nodes) + restart = False + for i, _, inp, out in idnodes: + if restart: + break # pragma: no cover + if nodes[i] is None: + # Already removed. + continue # pragma: no cover + if inp in inputs and out in outputs: + # Cannot be removed. + continue + if not restart and out not in outputs: + # We cannot change an output name. + for j in range(len(nodes)): # pylint: disable=C0200 + if nodes[j] is None: + continue + if out in nodes[j].input: + nodes[j] = _rename_node_input(nodes[j], out, inp) + rem += 1 + if nodes[j].op_type == 'Identity': + restart = True # pragma: no cover + nodes[i] = None + rem += 1 + continue + if not restart and inp not in inputs and inp not in outputs: + # We cannot change an input name or an output name. + for j in range(len(nodes)): # pylint: disable=C0200 + if nodes[j] is None: + continue + if inp in nodes[j].output: + nodes[j] = _rename_node_output(nodes[j], inp, out) + rem += 1 + if nodes[j].op_type == 'Identity': + restart = True # pragma: no cover + if inp in nodes[j].input: + nodes[j] = _rename_node_input(nodes[j], inp, out) + rem += 1 + if nodes[j].op_type == 'Identity': + restart = True + nodes[i] = None + rem += 1 + + if recursive: + # Handles subgraphs. + for i in range(len(nodes)): # pylint: disable=C0200 + node = nodes[i] + if node is None or not (node.attribute): # pylint: disable=C0325 + continue + nodes[i] = _apply_remove_node_fct_node( + onnx_remove_node_identity, + node, recursive=True, debug_info=debug_info + [node.name]) + + # Finally create the new graph. + nodes = list(filter(lambda n: n is not None, nodes)) + graph = make_graph(nodes, onnx_model.name, + onnx_model.input, onnx_model.output, + onnx_model.initializer) + + graph.value_info.extend(onnx_model.value_info) # pylint: disable=E1101 + return graph diff --git a/mlprodict/onnx_tools/optim/onnx_optimisation_redundant.py b/mlprodict/onnx_tools/optim/onnx_optimisation_redundant.py index d47c7fce1..4c91fe92c 100644 --- a/mlprodict/onnx_tools/optim/onnx_optimisation_redundant.py +++ b/mlprodict/onnx_tools/optim/onnx_optimisation_redundant.py @@ -1,174 +1,175 @@ -""" -@file -@brief Optimisation of :epkg:`ONNX` graphs. -""" -import copy -import hashlib -from onnx.helper import make_graph -from ._onnx_optimisation_common import ( # pylint: disable=E0611 - _rename_node_input, - _rename_node_output, - _apply_optimisation_on_graph, - _apply_remove_node_fct_node -) - - -def _hash_obj_content(obj, max_size=1000): - """ - Hash the content of an object. - """ - m = hashlib.sha256() - if hasattr(obj, 'op_type'): - # An operator. - m.update(obj.op_type.encode('ascii')) - m.update(len(obj.output).to_bytes(8, byteorder='big')) - for i in obj.input: - m.update(i.encode('ascii')) - if hasattr(obj, 'attribute'): - for att in obj.attribute: - m.update(att.name.encode('ascii')) - m.update(_hash_obj_content(att)) - else: - # An initializer. - obj = copy.deepcopy(obj) - obj.name = "" - obj.doc_string = "" - m.update(obj.SerializeToString()) - - content = m.digest() - if len(content) > max_size: - content = content[:max_size] - return content - - -def onnx_remove_node_redundant(onnx_model, recursive=True, debug_info=None, - max_hash_size=1000, **options): - """ - Removes redundant part of the graph. A redundant part is - a set of nodes which takes the same inputs and produces - the same outputs. It first starts by looking into duplicated - initializers, then looks into nodes taking the same inputs - and sharing the same type and parameters. - - @param onnx_model onnx model - @param recursive looks into subgraphs - @param debug_info debug information (private) - @param max_hash_size limit the size of a hash used to detect - identical subgraphs - @param options additional options (unused) - @return new onnx _model - """ - if debug_info is None: - debug_info = [str(type(onnx_model)).split('.')[-1].strip("'>")] - else: - debug_info = debug_info + \ - [str(type(onnx_model)).split('.')[-1].strip("'>")] - - if hasattr(onnx_model, 'graph'): - return _apply_optimisation_on_graph( - onnx_remove_node_redundant, onnx_model, - recursive=recursive, debug_info=debug_info, - max_hash_size=max_hash_size, **options) - - def _enumerate_rename_list_nodes_inputs(nodes, rename): - for i, node in enumerate(nodes): - if node is None: - yield False, i, None - continue - if any(set(node.input) & set(rename)): - yield True, i, _rename_node_input(node, rename) - continue - yield False, i, node - - graph = onnx_model - - # Detects duplicated initializers. - hashes = {} - names = [] - rename = {} - for init in graph.initializer: - hs = _hash_obj_content(init, max_size=max_hash_size) - if hs in hashes: - # Already seen. - rename[init.name] = hashes[hs] # pragma: no cover - else: - # New. - hashes[hs] = init.name - names.append(init.name) - - new_inits = [init for init in graph.initializer if init.name in set(names)] - - # Renames node inputs. - new_nodes = [] - new_nodes = list(graph.node) - new_nodes = list( - _[2] for _ in _enumerate_rename_list_nodes_inputs(new_nodes, rename)) - - # Detects duplicated operators. - graph_outputs = set(o.name for o in graph.output) - node_hashes = {} - changed = 1 - replace = {} - while changed > 0: - changed = 0 - nnodes = len(new_nodes) - for i in range(nnodes): - if i in replace: - # Already removed. - continue - node = new_nodes[i] - hash = _hash_obj_content(node, max_size=max_hash_size) - if hash in node_hashes: - ni = node_hashes[hash] - if ni == i: - continue - replace[i] = ni - changed += 1 - - # Specifies what to rename. - # One exception: the output is one of the graph output. - rep = new_nodes[ni] - for old, nn in zip(node.output, rep.output): - if old in graph_outputs: - rename[nn] = old - new_nodes[ni] = _rename_node_output( - new_nodes[ni], nn, old) - else: - rename[old] = nn - - # Renames inputs. - new_new_nodes = [] - renew_index = set() - for changed, ci, node in _enumerate_rename_list_nodes_inputs(new_nodes, rename): - if changed: - renew_index.add(ci) - new_new_nodes.append(node) - new_nodes = new_new_nodes - - # Renews hashes. - renew_hash = set( - k for k, v in node_hashes.items() if v in renew_index) - for hs in renew_hash: - del node_hashes[hs] - new_nodes[i] = None - else: - node_hashes[hash] = i - - if recursive: - # Handles subgraphs. - for i in range(len(new_nodes)): # pylint: disable=C0200 - node = new_nodes[i] - if node is None or not (node.attribute): # pylint: disable=C0325 - continue - new_nodes[i] = _apply_remove_node_fct_node( - onnx_remove_node_redundant, - node, recursive=True, debug_info=debug_info + [node.name]) - - # Finally create the new graph. - nodes = list(filter(lambda n: n is not None, new_nodes)) - graph = make_graph(nodes, onnx_model.name, - onnx_model.input, onnx_model.output, - new_inits) - - graph.value_info.extend(onnx_model.value_info) # pylint: disable=E1101 - return graph +""" +@file +@brief Optimisation of :epkg:`ONNX` graphs. +""" +import copy +import hashlib +from onnx.helper import make_graph +from ._onnx_optimisation_common import ( # pylint: disable=E0611 + _rename_node_input, + _rename_node_output, + _apply_optimisation_on_graph, + _apply_remove_node_fct_node +) + + +def _hash_obj_content(obj, max_size=1000): + """ + Hash the content of an object. + """ + m = hashlib.sha256() + if hasattr(obj, 'op_type'): + # An operator. + m.update(obj.op_type.encode('ascii')) + m.update(len(obj.output).to_bytes(8, byteorder='big')) + for i in obj.input: + m.update(i.encode('ascii')) + if hasattr(obj, 'attribute'): + for att in obj.attribute: + m.update(att.name.encode('ascii')) + m.update(_hash_obj_content(att)) + else: + # An initializer. + obj = copy.deepcopy(obj) + obj.name = "" + obj.doc_string = "" + m.update(obj.SerializeToString()) + + content = m.digest() + if len(content) > max_size: + content = content[:max_size] + return content + + +def onnx_remove_node_redundant(onnx_model, recursive=True, debug_info=None, + max_hash_size=1000, **options): + """ + Removes redundant part of the graph. A redundant part is + a set of nodes which takes the same inputs and produces + the same outputs. It first starts by looking into duplicated + initializers, then looks into nodes taking the same inputs + and sharing the same type and parameters. + + @param onnx_model onnx model + @param recursive looks into subgraphs + @param debug_info debug information (private) + @param max_hash_size limit the size of a hash used to detect + identical subgraphs + @param options additional options (unused) + @return new onnx _model + """ + if debug_info is None: + debug_info = [str(type(onnx_model)).rsplit( + '.', maxsplit=1)[-1].strip("'>")] + else: + debug_info = (debug_info + + [str(type(onnx_model)).rsplit('.', maxsplit=1)[-1].strip("'>")]) + + if hasattr(onnx_model, 'graph'): + return _apply_optimisation_on_graph( + onnx_remove_node_redundant, onnx_model, + recursive=recursive, debug_info=debug_info, + max_hash_size=max_hash_size, **options) + + def _enumerate_rename_list_nodes_inputs(nodes, rename): + for i, node in enumerate(nodes): + if node is None: + yield False, i, None + continue + if any(set(node.input) & set(rename)): + yield True, i, _rename_node_input(node, rename) + continue + yield False, i, node + + graph = onnx_model + + # Detects duplicated initializers. + hashes = {} + names = [] + rename = {} + for init in graph.initializer: + hs = _hash_obj_content(init, max_size=max_hash_size) + if hs in hashes: + # Already seen. + rename[init.name] = hashes[hs] # pragma: no cover + else: + # New. + hashes[hs] = init.name + names.append(init.name) + + new_inits = [init for init in graph.initializer if init.name in set(names)] + + # Renames node inputs. + new_nodes = [] + new_nodes = list(graph.node) + new_nodes = list( + _[2] for _ in _enumerate_rename_list_nodes_inputs(new_nodes, rename)) + + # Detects duplicated operators. + graph_outputs = set(o.name for o in graph.output) + node_hashes = {} + changed = 1 + replace = {} + while changed > 0: + changed = 0 + nnodes = len(new_nodes) + for i in range(nnodes): + if i in replace: + # Already removed. + continue + node = new_nodes[i] + hash = _hash_obj_content(node, max_size=max_hash_size) + if hash in node_hashes: + ni = node_hashes[hash] + if ni == i: + continue + replace[i] = ni + changed += 1 + + # Specifies what to rename. + # One exception: the output is one of the graph output. + rep = new_nodes[ni] + for old, nn in zip(node.output, rep.output): + if old in graph_outputs: + rename[nn] = old + new_nodes[ni] = _rename_node_output( + new_nodes[ni], nn, old) + else: + rename[old] = nn + + # Renames inputs. + new_new_nodes = [] + renew_index = set() + for changed, ci, node in _enumerate_rename_list_nodes_inputs(new_nodes, rename): + if changed: + renew_index.add(ci) + new_new_nodes.append(node) + new_nodes = new_new_nodes + + # Renews hashes. + renew_hash = set( + k for k, v in node_hashes.items() if v in renew_index) + for hs in renew_hash: + del node_hashes[hs] + new_nodes[i] = None + else: + node_hashes[hash] = i + + if recursive: + # Handles subgraphs. + for i in range(len(new_nodes)): # pylint: disable=C0200 + node = new_nodes[i] + if node is None or not (node.attribute): # pylint: disable=C0325 + continue + new_nodes[i] = _apply_remove_node_fct_node( + onnx_remove_node_redundant, + node, recursive=True, debug_info=debug_info + [node.name]) + + # Finally create the new graph. + nodes = list(filter(lambda n: n is not None, new_nodes)) + graph = make_graph(nodes, onnx_model.name, + onnx_model.input, onnx_model.output, + new_inits) + + graph.value_info.extend(onnx_model.value_info) # pylint: disable=E1101 + return graph diff --git a/mlprodict/onnx_tools/optim/onnx_optimisation_unused.py b/mlprodict/onnx_tools/optim/onnx_optimisation_unused.py index 8f6f2159a..8dd2452b8 100644 --- a/mlprodict/onnx_tools/optim/onnx_optimisation_unused.py +++ b/mlprodict/onnx_tools/optim/onnx_optimisation_unused.py @@ -1,81 +1,82 @@ -""" -@file -@brief Optimisation of :epkg:`ONNX` graphs. -""" -from onnx.helper import make_graph -from ._onnx_optimisation_common import ( # pylint: disable=E0611 - _apply_optimisation_on_graph, _apply_remove_node_fct_node) - - -def onnx_remove_node_unused(onnx_model, recursive=True, debug_info=None, **options): - """ - Removes unused nodes of the graph. An unused node - is not involved in the output computation. - - @param onnx_model onnx model - @param recursive looks into subgraphs - @param debug_info debug information (private) - @param options unused - @return new onnx _model - """ - if debug_info is None: - debug_info = [str(type(onnx_model)).split('.')[-1].strip("'>")] - else: - debug_info = debug_info + \ - [str(type(onnx_model)).split('.')[-1].strip("'>")] - - if hasattr(onnx_model, 'graph'): - return _apply_optimisation_on_graph( - onnx_remove_node_unused, onnx_model, - recursive=recursive, debug_info=debug_info, - **options) - - graph = onnx_model - data = {} - valid = {} - edges = {} - - for init in graph.initializer: - data[init.name, 0] = init - - for node in graph.node: - data[node.name, 1] = node - for inp in node.input: - data[inp, 0] = node - edges[(inp, 0), (node.name, 1)] = node - for out in node.output: - data[out, 0] = node - edges[(node.name, 1), (out, 0)] = node - - for out in graph.output: - valid[out.name, 0] = True - - modif = 1 - while modif > 0: - modif = 0 - for e1, e2 in edges: # pylint: disable=E1141 - if valid.get(e2, False) and not valid.get(e1, False): - valid[e1] = True - modif += 1 - - new_nodes = [n for n in graph.node if (n.name, 1) in valid] - new_inits = [n for n in graph.initializer if (n.name, 0) in valid] - - if recursive: - # Handles subgraphs. - for i in range(len(new_nodes)): # pylint: disable=C0200 - node = new_nodes[i] - if node is None or not (node.attribute): # pylint: disable=C0325 - continue - new_nodes[i] = _apply_remove_node_fct_node( - onnx_remove_node_unused, - node, recursive=True, debug_info=debug_info + [node.name]) - - # Finally create the new graph. - nodes = list(filter(lambda n: n is not None, new_nodes)) - graph = make_graph(nodes, onnx_model.name, - onnx_model.input, onnx_model.output, - new_inits) - - graph.value_info.extend(onnx_model.value_info) # pylint: disable=E1101 - return graph +""" +@file +@brief Optimisation of :epkg:`ONNX` graphs. +""" +from onnx.helper import make_graph +from ._onnx_optimisation_common import ( # pylint: disable=E0611 + _apply_optimisation_on_graph, _apply_remove_node_fct_node) + + +def onnx_remove_node_unused(onnx_model, recursive=True, debug_info=None, **options): + """ + Removes unused nodes of the graph. An unused node + is not involved in the output computation. + + @param onnx_model onnx model + @param recursive looks into subgraphs + @param debug_info debug information (private) + @param options unused + @return new onnx _model + """ + if debug_info is None: + debug_info = [str(type(onnx_model)).rsplit( + '.', maxsplit=1)[-1].strip("'>")] + else: + debug_info = (debug_info + + [str(type(onnx_model)).rsplit('.', maxsplit=1)[-1].strip("'>")]) + + if hasattr(onnx_model, 'graph'): + return _apply_optimisation_on_graph( + onnx_remove_node_unused, onnx_model, + recursive=recursive, debug_info=debug_info, + **options) + + graph = onnx_model + data = {} + valid = {} + edges = {} + + for init in graph.initializer: + data[init.name, 0] = init + + for node in graph.node: + data[node.name, 1] = node + for inp in node.input: + data[inp, 0] = node + edges[(inp, 0), (node.name, 1)] = node + for out in node.output: + data[out, 0] = node + edges[(node.name, 1), (out, 0)] = node + + for out in graph.output: + valid[out.name, 0] = True + + modif = 1 + while modif > 0: + modif = 0 + for e1, e2 in edges: # pylint: disable=E1141 + if valid.get(e2, False) and not valid.get(e1, False): + valid[e1] = True + modif += 1 + + new_nodes = [n for n in graph.node if (n.name, 1) in valid] + new_inits = [n for n in graph.initializer if (n.name, 0) in valid] + + if recursive: + # Handles subgraphs. + for i in range(len(new_nodes)): # pylint: disable=C0200 + node = new_nodes[i] + if node is None or not (node.attribute): # pylint: disable=C0325 + continue + new_nodes[i] = _apply_remove_node_fct_node( + onnx_remove_node_unused, + node, recursive=True, debug_info=debug_info + [node.name]) + + # Finally create the new graph. + nodes = list(filter(lambda n: n is not None, new_nodes)) + graph = make_graph(nodes, onnx_model.name, + onnx_model.input, onnx_model.output, + new_inits) + + graph.value_info.extend(onnx_model.value_info) # pylint: disable=E1101 + return graph diff --git a/mlprodict/onnxrt/onnx_inference.py b/mlprodict/onnxrt/onnx_inference.py index 7b71743d3..0c741992e 100644 --- a/mlprodict/onnxrt/onnx_inference.py +++ b/mlprodict/onnxrt/onnx_inference.py @@ -726,7 +726,8 @@ def dispsimple(arr): if k not in keys and k not in printed: printed.add(k) name = list( - name for name in self._global_index if self._global_index[name] == k) + name for name in self._global_index # pylint: disable=C0206 + if self._global_index[name] == k) if isinstance(values[k], (numpy.ndarray, coo_matrix)): name = name[0] mini = numpy_min(values[k]) diff --git a/mlprodict/onnxrt/ops_cpu/op_cdist.py b/mlprodict/onnxrt/ops_cpu/op_cdist.py index d9cd896e6..6137c86aa 100644 --- a/mlprodict/onnxrt/ops_cpu/op_cdist.py +++ b/mlprodict/onnxrt/ops_cpu/op_cdist.py @@ -35,7 +35,7 @@ def _find_custom_operator_schema(self, op_name): raise RuntimeError( # pragma: no cover "Unable to find a schema for operator '{}'.".format(op_name)) - def _infer_shapes(self, a, b): # pylint: disable=W0221 + def _infer_shapes(self, a, b): # pylint: disable=W0221,W0237 """ Returns the same for the labels and the probabilities. """ diff --git a/mlprodict/onnxrt/ops_cpu/op_solve.py b/mlprodict/onnxrt/ops_cpu/op_solve.py index 81f1994a4..142216dac 100644 --- a/mlprodict/onnxrt/ops_cpu/op_solve.py +++ b/mlprodict/onnxrt/ops_cpu/op_solve.py @@ -31,10 +31,10 @@ def _run(self, a, b): # pylint: disable=W0221 transposed=self.transposed), ) return (solve(a, b, lower=self.lower, transposed=self.transposed), ) - def _infer_shapes(self, a, b): # pylint: disable=W0221 + def _infer_shapes(self, a, b): # pylint: disable=W0221,W0237 return (b, ) - def _infer_types(self, a, b): # pylint: disable=W0221 + def _infer_types(self, a, b): # pylint: disable=W0221,W0237 return (b, ) def to_python(self, inputs): diff --git a/mlprodict/sklapi/onnx_transformer.py b/mlprodict/sklapi/onnx_transformer.py index 71c1298e2..c84d0f255 100644 --- a/mlprodict/sklapi/onnx_transformer.py +++ b/mlprodict/sklapi/onnx_transformer.py @@ -169,7 +169,7 @@ def transform(self, X, y=None, **inputs): if len(self.inputs_) == 1: rt_inputs[self.inputs_[0]] = numpy.array(X) else: - for i in range(len(self.inputs_)): + for i in range(len(self.inputs_)): # pylint: disable=C0200 rt_inputs[self.inputs_[i]] = [row[i] for row in X] for k, v in inputs.items(): diff --git a/mlprodict/testing/einsum/einsum_impl_classes.py b/mlprodict/testing/einsum/einsum_impl_classes.py index 019a23ea2..062b1c2f6 100644 --- a/mlprodict/testing/einsum/einsum_impl_classes.py +++ b/mlprodict/testing/einsum/einsum_impl_classes.py @@ -1334,7 +1334,7 @@ def _replace_node_sequence(self, added, deleted): self._nodes[id(added)] = added for op in forward[key]: new_inputs = list(op.inputs) - for i in range(len(op.inputs)): + for i in range(len(op.inputs)): # pylint: disable=C0200 if id(op.inputs[i]) == key: new_inputs[i] = added op.inputs = tuple(new_inputs) @@ -1348,7 +1348,7 @@ def _replace_node_sequence(self, added, deleted): inp = inps[0] for op in forward[key]: new_inputs = list(op.inputs) - for i in range(len(op.inputs)): + for i in range(len(op.inputs)): # pylint: disable=C0200 if id(op.inputs[i]) == key: new_inputs[i] = inp op.inputs = tuple(new_inputs) diff --git a/mlprodict/testing/test_utils/quantized_tensor.py b/mlprodict/testing/test_utils/quantized_tensor.py index 7bff36211..3dec58aba 100644 --- a/mlprodict/testing/test_utils/quantized_tensor.py +++ b/mlprodict/testing/test_utils/quantized_tensor.py @@ -135,7 +135,7 @@ def test_qlinear_conv(x: QuantizedTensor, x_shape, 'y_scale': y.scale_, 'y_zero_point': y.zero_point_, 'b': b.quantized_} - for k in inputs: + for k in inputs: # pylint: disable=C0206 v = inputs[k] if len(v.shape) == 0: inputs[k] = numpy.array([v], dtype=v.dtype) diff --git a/mlprodict/tools/filename_helper.py b/mlprodict/tools/filename_helper.py index be54e35ad..cc7ce008c 100644 --- a/mlprodict/tools/filename_helper.py +++ b/mlprodict/tools/filename_helper.py @@ -75,7 +75,7 @@ def extract_information_from_filename(name): else: res['opt'] = res.get('opt', '') + '_' + v - for k in res: + for k in res: # pylint: disable=C0206 if isinstance(res[k], str): res[k] = res[k].strip('_') From 0bc1f4c61a4523f95abffb25d2d8f587002c104f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?xavier=20dupr=C3=A9?= Date: Thu, 1 Jul 2021 11:06:00 +0200 Subject: [PATCH 3/3] fix type issue --- _unittests/ut_onnxrt/test_onnxrt_validate_bug.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/_unittests/ut_onnxrt/test_onnxrt_validate_bug.py b/_unittests/ut_onnxrt/test_onnxrt_validate_bug.py index 36d5da2e4..e48a61f81 100644 --- a/_unittests/ut_onnxrt/test_onnxrt_validate_bug.py +++ b/_unittests/ut_onnxrt/test_onnxrt_validate_bug.py @@ -60,7 +60,7 @@ def test_dict_vectorizer_rfr(self): x = {k: numpy.float32(v) for k, v in x.items()} oinf = OnnxInference(model_onnx, runtime='python') - res3 = oinf.run({input_name: [x]}) # , verbose=10, fLOG=print) + res3 = oinf.run({input_name: numpy.array([x])}) # , verbose=10, fLOG=print) self.assertEqualFloat(res[0][0, 0], res2["variable1"][0, 0]) self.assertEqualFloat(res[0][0, 0], res3["variable1"][0])