Skip to content
This repository was archived by the owner on Jan 13, 2024. It is now read-only.
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 16 additions & 5 deletions _unittests/ut_onnx_conv/test_onnx_conv_dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,9 @@
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from mlprodict.onnx_conv import to_onnx
from skl2onnx.common.data_types import Int64TensorType
from mlprodict.onnx_conv import (
to_onnx, guess_schema_from_data, get_inputs_from_data)
from mlprodict.onnxrt import OnnxInference


Expand All @@ -32,7 +34,10 @@ def test_pipeline_dataframe_case3(self):
def test_pipeline_dataframe_case4(self):
    """Calls ``case_test_pipeline_dataframe`` with ``case=4``."""
    self.case_test_pipeline_dataframe(4)

def case_test_pipeline_dataframe(self, case):
def test_pipeline_dataframe_case4_cat(self):
    """Calls ``case_test_pipeline_dataframe`` with ``case=4`` and ``cat=True``."""
    self.case_test_pipeline_dataframe(4, cat=True)

def case_test_pipeline_dataframe(self, case, cat=False):
text = """
fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur_dioxide,total_sulfur_dioxide,density,pH,sulphates,alcohol,quality,color
7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,red
Expand Down Expand Up @@ -88,8 +93,15 @@ def case_test_pipeline_dataframe(self, case):
else:
raise NotImplementedError()

pipe.fit(X_train)
if cat:
X_train['color'] = X_train['color'].astype('category')
schema = guess_schema_from_data(X_train)
if isinstance(schema[-1][-1], Int64TensorType):
raise AssertionError(
"Issue with type of last column %r: %r." % (
schema[-1], X_train.dtypes[-1]))

pipe.fit(X_train)
model_onnx = to_onnx(pipe, X_train)
try:
oinf = OnnxInference(model_onnx)
Expand All @@ -98,8 +110,7 @@ def case_test_pipeline_dataframe(self, case):
case, e)) from e

pred = pipe.transform(X_train)
inputs = {c: X_train[c].values for c in X_train.columns}
inputs = {c: v.reshape((v.shape[0], 1)) for c, v in inputs.items()}
inputs = get_inputs_from_data(X_train)
onxp = oinf.run(inputs)
got = onxp['transformed_column']
self.assertEqualArray(pred, got)
Expand Down
4 changes: 3 additions & 1 deletion mlprodict/onnx_conv/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,4 +6,6 @@
import onnx
from .register import register_converters, register_scorers
from .register_rewritten_converters import register_rewritten_operators
from .convert import to_onnx, guess_schema_from_data, guess_schema_from_model
from .convert import (
to_onnx, guess_schema_from_data, guess_schema_from_model,
get_inputs_from_data)
52 changes: 49 additions & 3 deletions mlprodict/onnx_conv/convert.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,9 +14,10 @@
from sklearn.metrics.scorer import _PredictScorer
from sklearn import __all__ as sklearn__all__, __version__ as sklearn_version
from skl2onnx.common.data_types import (
FloatTensorType, DoubleTensorType, DataType, guess_numpy_type)
from skl2onnx.algebra.onnx_operator_mixin import OnnxOperatorMixin
FloatTensorType, DoubleTensorType, DataType, guess_numpy_type,
StringTensorType, Int64TensorType)
from skl2onnx import convert_sklearn
from skl2onnx.algebra.onnx_operator_mixin import OnnxOperatorMixin
from skl2onnx.algebra.type_helper import _guess_type
from .register_rewritten_converters import register_rewritten_operators
from .register import register_converters
Expand Down Expand Up @@ -101,7 +102,10 @@ def guess_initial_types(X, initial_types):
if isinstance(X, pandas.DataFrame):
initial_types = []
for c in X.columns:
g = _guess_type(X[c].values)
if isinstance(X[c].values[0], (str, numpy.str)):
g = StringTensorType()
else:
g = _guess_type(X[c].values)
g.shape = [None, 1]
initial_types.append((c, g))
else:
Expand Down Expand Up @@ -148,6 +152,48 @@ def guess_schema_from_data(X, tensor_type=None, schema=None):
return [('X', unique[0]([None, sum(_[1].shape[1] for _ in init)]))]


def get_inputs_from_data(X, schema=None):
"""
Produces input data for *onnx* runtime.

@param X data
@param schema schema if None, schema is guessed with
@see fn guess_schema_from_data
@return input data
"""
def _cast_data(X, ct):
if isinstance(ct, FloatTensorType):
return X.astype(numpy.float32)
if isinstance(ct, DoubleTensorType):
return X.astype(numpy.float64)
if isinstance(ct, StringTensorType):
return X.astype(numpy.str)
if isinstance(ct, Int64TensorType):
return X.astype(numpy.int64)
raise RuntimeError(
"Unexpected column type {} for type {}."
"".format(ct, type(X)))

if schema is None:
schema = guess_schema_from_data(X)
if isinstance(X, numpy.ndarray):
if len(schema) != 1:
raise RuntimeError( # pragma: no cover
"More than one column but input is an array.")
return {schema[0][0]: _cast_data(X, schema[0][1])}
elif isinstance(X, pandas.DataFrame):
if len(schema) != X.shape[1]:
raise RuntimeError( # pragma: no cover
"Mismatch between onnx columns {} and DataFrame columns {}"
"".format(len(schema), X.shape[1]))
return {sch[0]: _cast_data(X[c].values, sch[1]).reshape((-1, 1))
for sch, c in zip(schema, X.columns)}
else:
raise TypeError(
"Unexpected type {}, expecting an array or a dataframe."
"".format(type(X)))


def guess_schema_from_model(model, tensor_type=None, schema=None):
"""
Guesses initial types from a model.
Expand Down