diff --git a/_unittests/ut_onnx_conv/test_onnx_conv_dataframe.py b/_unittests/ut_onnx_conv/test_onnx_conv_dataframe.py index 710c5c05e..0d64e5668 100644 --- a/_unittests/ut_onnx_conv/test_onnx_conv_dataframe.py +++ b/_unittests/ut_onnx_conv/test_onnx_conv_dataframe.py @@ -10,7 +10,9 @@ from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder from sklearn.pipeline import Pipeline from sklearn.compose import ColumnTransformer -from mlprodict.onnx_conv import to_onnx +from skl2onnx.common.data_types import Int64TensorType +from mlprodict.onnx_conv import ( + to_onnx, guess_schema_from_data, get_inputs_from_data) from mlprodict.onnxrt import OnnxInference @@ -32,7 +34,10 @@ def test_pipeline_dataframe_case3(self): def test_pipeline_dataframe_case4(self): self.case_test_pipeline_dataframe(4) - def case_test_pipeline_dataframe(self, case): + def test_pipeline_dataframe_case4_cat(self): + self.case_test_pipeline_dataframe(4, cat=True) + + def case_test_pipeline_dataframe(self, case, cat=False): text = """ fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur_dioxide,total_sulfur_dioxide,density,pH,sulphates,alcohol,quality,color 7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,red @@ -88,8 +93,15 @@ def case_test_pipeline_dataframe(self, case): else: raise NotImplementedError() - pipe.fit(X_train) + if cat: + X_train['color'] = X_train['color'].astype('category') + schema = guess_schema_from_data(X_train) + if isinstance(schema[-1][-1], Int64TensorType): + raise AssertionError( + "Issue with type of last column %r: %r." 
% ( + schema[-1], X_train.dtypes[-1])) + pipe.fit(X_train) model_onnx = to_onnx(pipe, X_train) try: oinf = OnnxInference(model_onnx) @@ -98,8 +110,7 @@ def case_test_pipeline_dataframe(self, case): case, e)) from e pred = pipe.transform(X_train) - inputs = {c: X_train[c].values for c in X_train.columns} - inputs = {c: v.reshape((v.shape[0], 1)) for c, v in inputs.items()} + inputs = get_inputs_from_data(X_train) onxp = oinf.run(inputs) got = onxp['transformed_column'] self.assertEqualArray(pred, got) diff --git a/mlprodict/onnx_conv/__init__.py b/mlprodict/onnx_conv/__init__.py index 68010b750..d9434cd28 100644 --- a/mlprodict/onnx_conv/__init__.py +++ b/mlprodict/onnx_conv/__init__.py @@ -6,4 +6,6 @@ import onnx from .register import register_converters, register_scorers from .register_rewritten_converters import register_rewritten_operators -from .convert import to_onnx, guess_schema_from_data, guess_schema_from_model +from .convert import ( + to_onnx, guess_schema_from_data, guess_schema_from_model, + get_inputs_from_data) diff --git a/mlprodict/onnx_conv/convert.py b/mlprodict/onnx_conv/convert.py index a5b34b2ee..b0081c869 100644 --- a/mlprodict/onnx_conv/convert.py +++ b/mlprodict/onnx_conv/convert.py @@ -14,9 +14,10 @@ from sklearn.metrics.scorer import _PredictScorer from sklearn import __all__ as sklearn__all__, __version__ as sklearn_version from skl2onnx.common.data_types import ( - FloatTensorType, DoubleTensorType, DataType, guess_numpy_type) -from skl2onnx.algebra.onnx_operator_mixin import OnnxOperatorMixin + FloatTensorType, DoubleTensorType, DataType, guess_numpy_type, + StringTensorType, Int64TensorType) from skl2onnx import convert_sklearn +from skl2onnx.algebra.onnx_operator_mixin import OnnxOperatorMixin from skl2onnx.algebra.type_helper import _guess_type from .register_rewritten_converters import register_rewritten_operators from .register import register_converters @@ -101,7 +102,10 @@ def guess_initial_types(X, initial_types): if 
def get_inputs_from_data(X, schema=None):
    """
    Produces input data for *onnx* runtime.

    @param      X       data, a *numpy* array or a *pandas* dataframe
    @param      schema  list of pairs *(name, type)*, one per input,
                        if None, schema is guessed with
                        @see fn guess_schema_from_data
    @return             input data, a dictionary ``{name: array}``

    An array is returned as a single input cast to the type of the
    unique schema entry. Every column of a dataframe becomes one input
    reshaped into *(-1, 1)*, schema entries and columns are paired
    by position. A ``RuntimeError`` is raised if the schema does not
    match the data or holds an unsupported type, a ``TypeError``
    if *X* is neither an array nor a dataframe.
    """
    def _cast_data(values, ct):
        # The tensor types are only looked up here, when a cast is
        # really needed. ``str`` replaces ``numpy.str``: the alias was
        # deprecated in numpy 1.20 and removed in numpy 1.24, both
        # designate the same dtype.
        conversions = (
            (FloatTensorType, numpy.float32),
            (DoubleTensorType, numpy.float64),
            (StringTensorType, str),
            (Int64TensorType, numpy.int64),
        )
        for tensor_type, dtype in conversions:
            if isinstance(ct, tensor_type):
                return values.astype(dtype)
        raise RuntimeError(
            "Unexpected column type {} for type {}."
            "".format(ct, type(values)))

    if schema is None:
        schema = guess_schema_from_data(X)

    if isinstance(X, numpy.ndarray):
        # An array maps to exactly one onnx input.
        if len(schema) != 1:
            raise RuntimeError(  # pragma: no cover
                "More than one column but input is an array.")
        return {schema[0][0]: _cast_data(X, schema[0][1])}

    if isinstance(X, pandas.DataFrame):
        # One onnx input per column, stored as a 2D array
        # with a single column.
        if len(schema) != X.shape[1]:
            raise RuntimeError(  # pragma: no cover
                "Mismatch between onnx columns {} and DataFrame columns {}"
                "".format(len(schema), X.shape[1]))
        return {sch[0]: _cast_data(X[c].values, sch[1]).reshape((-1, 1))
                for sch, c in zip(schema, X.columns)}

    raise TypeError(
        "Unexpected type {}, expecting an array or a dataframe."
        "".format(type(X)))