# Discrepencies with ONNX

The notebook shows one example where the conversion leads with discrepencies if default options are used. It converts a pipeline with two steps, a scaler followed by a tree.

In [1]:
from jyquickhelper import add_notebook_menu
add_notebook_menu()

In [2]:
%matplotlib inline

## Data and first model

We take a random datasets with mostly integers.

In [3]:
import math
import numpy
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split

X, y = make_regression(10000, 10)
X_train, X_test, y_train, y_test = train_test_split(X, y)

Xi_train, yi_train = X_train.copy(), y_train.copy()
Xi_test, yi_test = X_test.copy(), y_test.copy()
for i in range(X.shape[1]):
    Xi_train[:, i] = (Xi_train[:, i] * math.pi * 2 ** i).astype(numpy.int64)
    Xi_test[:, i] = (Xi_test[:, i] * math.pi * 2 ** i).astype(numpy.int64)

In [4]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeRegressor

max_depth = 5

model = Pipeline([
    ('scaler', StandardScaler()),
    ('dt', DecisionTreeRegressor(max_depth=max_depth))
])

model.fit(Xi_train, yi_train)

Pipeline(steps=[('scaler', StandardScaler()),
                ('dt', DecisionTreeRegressor(max_depth=5))])

In [5]:
model.predict(Xi_test[:5])

array([-71.38934146, 252.04760893, 114.78668953, 135.29831645,
       135.29831645])

Other models:

In [6]:
model2 = Pipeline([
    ('scaler', StandardScaler()),
    ('dt', DecisionTreeRegressor(max_depth=max_depth))
])
model3 = Pipeline([
    ('scaler', StandardScaler()),
    ('dt', DecisionTreeRegressor(max_depth=max_depth-1))
])


models = [
    ('bug', Xi_test.astype(numpy.float32), model),
    ('no scaler', Xi_test.astype(numpy.float32), 
     DecisionTreeRegressor(max_depth=max_depth).fit(Xi_train, yi_train)),
    ('float', X_test.astype(numpy.float32),
     model2.fit(X_train, y_train)),
    ('max_depth-1', X_test.astype(numpy.float32),
     model3.fit(X_train, y_train))
]

## Conversion to ONNX

In [7]:
import numpy
from mlprodict.onnx_conv import to_onnx

onx = to_onnx(model, X_train[:1].astype(numpy.float32))

In [8]:
from mlprodict.onnxrt import OnnxInference

oinfpy = OnnxInference(onx, runtime="python_compiled")
print(oinfpy)

OnnxInference(...)
    def compiled_run(dict_inputs):
        # inputs
        X = dict_inputs['X']
        (variable1, ) = n0_scaler(X)
        (variable, ) = n1_treeensembleregressor(variable1)
        return {
            'variable': variable,
        }


In [9]:
import pandas

X32 = Xi_test.astype(numpy.float32)
y_skl = model.predict(X32)

obs = [dict(runtime='sklearn', diff=0)]
for runtime in ['python', 'python_compiled', 'onnxruntime1']:
    oinf = OnnxInference(onx, runtime=runtime)
    y_onx = oinf.run({'X': X32})['variable']
    delta = numpy.abs(y_skl - y_onx.ravel())
    am = delta.argmax()
    obs.append(dict(runtime=runtime, diff=delta.max()))
    obs[-1]['v[%d]' % am] = y_onx.ravel()[am]
    obs[0]['v[%d]' % am] = y_skl.ravel()[am]

pandas.DataFrame(obs)

Unnamed: 0,runtime,diff,v[144]
0,sklearn,0.0,300.412797
1,python,1.4e-05,300.412811
2,python_compiled,1.4e-05,300.412811
3,onnxruntime1,1.4e-05,300.412811


The pipeline shows huge discrepencies. They appear for a pipeline *StandardScaler* + *DecisionTreeRegressor* applied in integer features. They disappear if floats are used, or if the scaler is removed. The bug also disappear if the tree is not big enough (max_depth=4 instread of 5).

In [10]:
obs = [dict(runtime='sklearn', diff=0, name='sklearn')]
for name, x32, mod in models:
    for runtime in ['python', 'python_compiled', 'onnxruntime1']:
        lonx = to_onnx(mod, x32[:1])
        loinf = OnnxInference(lonx, runtime=runtime)
        y_skl = mod.predict(X32)
        y_onx = loinf.run({'X': X32})['variable']
        delta = numpy.abs(y_skl - y_onx.ravel())
        am = delta.argmax()
        obs.append(dict(runtime=runtime, diff=delta.max(), name=name))
        obs[-1]['v[%d]' % am] = y_onx.ravel()[am]
        obs[0]['v[%d]' % am] = y_skl.ravel()[am]

df = pandas.DataFrame(obs)
df

Unnamed: 0,runtime,diff,name,v[144],v[5],v[52]
0,sklearn,0.0,sklearn,300.412797,-300.872881,188.019374
1,python,1.4e-05,bug,300.412811,,
2,python_compiled,1.4e-05,bug,300.412811,,
3,onnxruntime1,1.4e-05,bug,300.412811,,
4,python,1.4e-05,no scaler,300.412811,,
5,python_compiled,1.4e-05,no scaler,300.412811,,
6,onnxruntime1,1.4e-05,no scaler,300.412811,,
7,python,1.3e-05,float,,-300.872894,
8,python_compiled,1.3e-05,float,,-300.872894,
9,onnxruntime1,1.3e-05,float,,-300.872894,


In [11]:
df.pivot("runtime", "name", "diff")

name,bug,float,max_depth-1,no scaler,sklearn
runtime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
onnxruntime1,1.4e-05,1.3e-05,4e-06,1.4e-05,
python,1.4e-05,1.3e-05,4e-06,1.4e-05,
python_compiled,1.4e-05,1.3e-05,4e-06,1.4e-05,
sklearn,,,,,0.0


## Other way to converter

ONNX does not support double for TreeEnsembleRegressor but that a new operator TreeEnsembleRegressorDouble was implemented into *mlprodict*. We need to update the conversion.

In [12]:
%load_ext mlprodict

In [13]:
onx32 = to_onnx(model, X_train[:1].astype(numpy.float32))
onx64 = to_onnx(model, X_train[:1].astype(numpy.float64), 
                dtype=numpy.float64, rewrite_ops=True)
%onnxview onx64

In [14]:
X32 = Xi_test.astype(numpy.float32)
X64 = Xi_test.astype(numpy.float64)

obs = [dict(runtime='sklearn', diff=0)]
for runtime in ['python', 'python_compiled', 'onnxruntime1']:
    for name, onx, xr in [('float', onx32, X32), ('double', onx64, X64)]:
        try:
            oinf = OnnxInference(onx, runtime=runtime)
        except Exception as e:
            obs.append(dict(runtime=runtime, error=str(e), real=name))
            continue
        y_skl = model.predict(xr)
        y_onx = oinf.run({'X': xr})['variable']
        delta = numpy.abs(y_skl - y_onx.ravel())
        am = delta.argmax()
        obs.append(dict(runtime=runtime, diff=delta.max(), real=name))
        obs[-1]['v[%d]' % am] = y_onx.ravel()[am]
        obs[0]['v[%d]' % am] = y_skl.ravel()[am]

pandas.DataFrame(obs)

Unnamed: 0,runtime,diff,v[144],v[0],real,error
0,sklearn,0.0,300.412797,-71.389341,,
1,python,1.4e-05,300.412811,,float,
2,python,0.0,,-71.389341,double,
3,python_compiled,1.4e-05,300.412811,,float,
4,python_compiled,0.0,,-71.389341,double,
5,onnxruntime1,1.4e-05,300.412811,,float,
6,onnxruntime1,,,,double,Unable to create InferenceSession due to '[ONN...


We see that the use of double removes the discrepencies.

## OnnxPipeline

Another way to reduce the number of discrepencies is to use a pipeline which converts every steps into ONNX before training the next one. That way, every steps is either trained on the inputs, either trained on the outputs produced by ONNX. Let's see how it works.

In [15]:
from mlprodict.sklapi import OnnxPipeline

model_onx = OnnxPipeline([
    ('scaler', StandardScaler()),
    ('dt', DecisionTreeRegressor(max_depth=max_depth))
])
model_onx.fit(Xi_train, yi_train)



OnnxPipeline(steps=[('scaler',
                     OnnxTransformer(onnx_bytes=b'\x08\x06\x12\x08skl2onnx\x1a\x051.7.0"\x07ai.onnx(\x002\x00:\xf6\x01\n\xa6\x01\n\x01X\x12\x08variable\x1a\x06Scaler"\x06Scaler*=\n\x06offset=[\xd6j\xbd=\xc6\x01a\xbc=\xe4\xefq==\xd6\xea\x98<=\x85|\x10\xbf=\xe1\xe6\x87\xbf=l\x9d\x85?=\xd8\xcb\t?=\xeb\x07\xb2\xc0={\x14\x92A\xa0\x01\x06*<\n\x0...xbc,>=\x80.\xaa==\xa6\x1c&==g\x99\xa3<=\xcd\xb7"<=\x02\xdc\xa2;=\xc6\x9d";=\xec8\xa3:=P3":\xa0\x01\x06:\nai.onnx.ml\x12\x1emlprodict_ONNX(StandardScaler)Z\x11\n\x01X\x12\x0c\n\n\x08\x01\x12\x06\n\x00\n\x02\x08\nb\x18\n\x08variable\x12\x0c\n\n\x08\x01\x12\x06\n\x00\n\x02\x08\nB\x0e\n\nai.onnx.ml\x10\x01')),
                    ('dt', DecisionTreeRegressor(max_depth=5))])

We see that the first steps was replaced by an object *OnnxTransformer* which wraps an ONNX file into a transformer following the *scikit-learn* API. The initial steps is still available.

In [16]:
model_onx.raw_steps_

[('scaler', StandardScaler()), ('dt', DecisionTreeRegressor(max_depth=5))]

In [17]:
models = [
    ('bug', Xi_test.astype(numpy.float32), model),
    ('OnnxPipeline', Xi_test.astype(numpy.float32), model_onx),
]

In [18]:
obs = [dict(runtime='sklearn', diff=0, name='sklearn')]
for name, x32, mod in models:
    for runtime in ['python', 'python_compiled', 'onnxruntime1']:
        lonx = to_onnx(mod, x32[:1])
        loinf = OnnxInference(lonx, runtime=runtime)
        y_skl = mod.predict(X32)
        y_onx = loinf.run({'X': X32})['variable']
        delta = numpy.abs(y_skl - y_onx.ravel())
        am = delta.argmax()
        obs.append(dict(runtime=runtime, diff=delta.max(), name=name))
        obs[-1]['v[%d]' % am] = y_onx.ravel()[am]
        obs[0]['v[%d]' % am] = y_skl.ravel()[am]

df = pandas.DataFrame(obs)
df

Unnamed: 0,runtime,diff,name,v[144]
0,sklearn,0.0,sklearn,300.412797
1,python,1.4e-05,bug,300.412811
2,python_compiled,1.4e-05,bug,300.412811
3,onnxruntime1,1.4e-05,bug,300.412811
4,python,1.4e-05,OnnxPipeline,300.412811
5,python_compiled,1.4e-05,OnnxPipeline,300.412811
6,onnxruntime1,1.4e-05,OnnxPipeline,300.412811


Training the next steps based on ONNX outputs is better. This is not completely satisfactory... Let's check the accuracy.

In [19]:
model.score(Xi_test, yi_test), model_onx.score(Xi_test, yi_test)

(0.5230197935582286, 0.5230197935582286)

Pretty close.