Skip to content
This repository has been archived by the owner on Jan 13, 2024. It is now read-only.

Commit

Permalink
Fixes #112, fix the number of features for kmeans when validating the runtime
Browse files Browse the repository at this point in the history
  • Loading branch information
sdpython committed Mar 20, 2020
1 parent 8fad5e5 commit ec96f3e
Show file tree
Hide file tree
Showing 10 changed files with 261 additions and 53 deletions.
46 changes: 46 additions & 0 deletions _unittests/ut_cli/test_cli_validate_runtime.py
@@ -0,0 +1,46 @@
"""
@brief test tree node (time=30s)
"""
import os
import unittest
import pandas
from pyquickhelper.loghelper import BufferedPrint
from pyquickhelper.pycode import ExtTestCase, get_temp_folder
from mlprodict.onnxrt.validate.validate_summary import (
merge_benchmark, summary_report)
from mlprodict.__main__ import main


class TestCliValidateRuntime(ExtTestCase):
    """End-to-end test of the ``validate_runtime`` command line on KMeans."""

    def test_cli_validate_kmeans(self):
        """Runs the CLI, then merges and summarizes the produced benchmark."""
        # Output locations inside a fresh temporary folder.
        temp_dir = get_temp_folder(__file__, "temp_validate_runtime_kmeans")
        raw_csv = os.path.join(temp_dir, "raw.csv")
        sum_csv = os.path.join(temp_dir, "sum.csv")
        graph_png = os.path.join(temp_dir, 'gr.png')
        log_buffer = BufferedPrint()
        cli_args = ["validate_runtime", "--n_features", "4,50", "-nu", "3",
                    "-re", "3", "-o", "11", "-op", "11", "-v", "2", "--out_raw",
                    raw_csv, "--out_summary", sum_csv, "-b", "1",
                    "--runtime", "python_compiled,onnxruntime1",
                    "--models", "KMeans", "--out_graph", graph_png, "--dtype", "32"]
        main(args=cli_args, fLOG=log_buffer.fprint)
        captured = str(log_buffer)
        self.assertIn('KMeans', captured)
        for produced in (raw_csv, sum_csv, graph_png):
            self.assertExists(produced)
        # Merge the raw benchmark with a copy of itself, one side
        # being declared as the baseline runtime.
        raw_df = pandas.read_csv(raw_csv)
        merged = merge_benchmark({'r1-': raw_df, 'r2-': raw_df.copy()},
                                 baseline='r1-onnxruntime1')
        baseline_cols = list(
            sorted(c for c in merged.columns if c.endswith('-base')))
        summary = summary_report(merged, add_cols=baseline_cols)
        self.assertEqual(merged.shape[0], summary.shape[0])
        self.assertIn('N=10-base', summary.columns)
        excel_path = os.path.join(temp_dir, "merged.xlsx")
        summary.to_excel(excel_path, index=False)


if __name__ == "__main__":
unittest.main()
9 changes: 9 additions & 0 deletions _unittests/ut_onnxrt/data/data112LRSW-20200319.csv

Large diffs are not rendered by default.

33 changes: 33 additions & 0 deletions _unittests/ut_onnxrt/data/dataGITLRSW-20200319.csv

Large diffs are not rendered by default.

6 changes: 6 additions & 0 deletions _unittests/ut_onnxrt/test_benchmark_tools.py
Expand Up @@ -38,6 +38,12 @@ def common_test_benchmark_merge(self, f1, f2):
def test_benchmark_merge(self):
self.common_test_benchmark_merge("data.csv", "data2.csv")

def test_benchmark_merge_fail(self):
self.assertRaise(
lambda: self.common_test_benchmark_merge(
"data112LRSW-20200319.csv", "dataGITLRSW-20200319.csv"),
ValueError)


if __name__ == "__main__":
unittest.main()
61 changes: 61 additions & 0 deletions _unittests/ut_onnxrt/test_rt_valid_model_kmeans.py
@@ -0,0 +1,61 @@
"""
@brief test log(time=3s)
"""
import unittest
from logging import getLogger
from pandas import DataFrame
from pyquickhelper.loghelper import fLOG
from pyquickhelper.pycode import ExtTestCase
from pyquickhelper.pandashelper import df2rst
from sklearn.exceptions import ConvergenceWarning
try:
from sklearn.utils._testing import ignore_warnings
except ImportError:
from sklearn.utils.testing import ignore_warnings
from skl2onnx import __version__ as skl2onnx_version
from mlprodict.onnxrt.validate import enumerate_validated_operator_opsets, summary_report
from mlprodict.onnxrt.doc.doc_write_helper import split_columns_subsets


class TestRtValidateKMeans(ExtTestCase):
    """Validates the python runtime on KMeans and checks the summary report."""

    @ignore_warnings(category=(UserWarning, ConvergenceWarning, RuntimeWarning))
    def test_rt_KMeans_python(self):
        """Runs the opset-11 validation for KMeans and renders the report as RST."""
        fLOG(__file__, self._testMethodName, OutputPrint=__name__ == "__main__")
        # Silences skl2onnx conversion logs during the validation loop.
        logger = getLogger('skl2onnx')
        logger.disabled = True
        verbose = 2 if __name__ == "__main__" else 0

        debug = False
        buffer = []

        def myprint(*args, **kwargs):
            # Captures validation log lines instead of printing them.
            buffer.append(" ".join(map(str, args)))

        rows = list(enumerate_validated_operator_opsets(
            verbose, models={"KMeans"}, opset_min=11,
            opset_max=11, fLOG=myprint,
            runtime='python', debug=debug))
        self.assertGreater(len(rows), 1)
        self.assertIn('skl_nop', rows[-1])
        # Every observation key seen across all rows.
        keys = set()
        for row in rows:
            keys.update(set(row))
        self.assertIn('onx_size', keys)
        piv = summary_report(DataFrame(rows))
        opset = [c for c in piv.columns if 'opset' in c]
        self.assertTrue('opset11' in opset or 'opset10' in opset)
        self.assertGreater(len(buffer), 1 if debug else 0)
        common, subsets = split_columns_subsets(piv)
        try:
            conv = df2rst(piv, split_col_common=common,  # pylint: disable=E1123
                          split_col_subsets=subsets)
            self.assertIn('| KMeans |', conv)
        except TypeError as e:
            # Older pyquickhelper versions do not support these arguments;
            # in that case the RST rendering part of the test is skipped.
            if "got an unexpected keyword argument 'split_col_common'" in str(e):
                return
            # Bare ``raise`` (was ``raise e``): re-raises the active exception
            # without adding an extra frame to the traceback.
            raise


if __name__ == "__main__":
unittest.main()
2 changes: 1 addition & 1 deletion mlprodict/__init__.py
Expand Up @@ -4,7 +4,7 @@
@brief Ways to speed up predictions for a machine learned model.
"""

# Package version, bumped together with the fix for issue #112
# (number of features for kmeans when validating the runtime).
# The stale duplicated assignment ``__version__ = "0.3.1029"`` (diff
# residue of the previous release) is removed: only one assignment
# must remain, otherwise the first is silently dead code.
__version__ = "0.3.1037"
__author__ = "Xavier Dupré"


Expand Down
106 changes: 59 additions & 47 deletions mlprodict/onnxrt/validate/validate.py
Expand Up @@ -27,26 +27,12 @@
from .validate_helper import (
_dispsimple, sklearn_operators,
_measure_time, _shape_exc, dump_into_folder,
default_time_kwargs
default_time_kwargs, RuntimeBadResultsError,
_dictionary2str
)
from .validate_benchmark import benchmark_fct


class RuntimeBadResultsError(RuntimeError):
    """
    Raised when the results computed by a runtime are too different from
    the ones produced by :epkg:`scikit-learn`.

    NOTE(review): this definition also appears in ``validate_helper.py``
    in this commit; this copy is the removed side of the diff.
    """

    def __init__(self, msg, obs):
        """
        @param msg message to display
        @param obs observations, kept on the exception as ``self.obs``
            so the caller can inspect the diverging results
        """
        RuntimeError.__init__(self, msg)
        self.obs = obs


def _get_problem_data(prob, n_features):
data_problem = _problems[prob](n_features=n_features)
if len(data_problem) == 6:
Expand All @@ -57,6 +43,10 @@ def _get_problem_data(prob, n_features):
else:
raise RuntimeError(
"Unable to interpret problem '{}'.".format(prob))
if X_.shape[1] != n_features and n_features is not None:
raise RuntimeError("Problem '{}' with n_features={} returned {} features"
"(func={}).".format(prob, n_features, X_.shape[1],
_problems[prob]))
if y_ is None:
(X_train, X_test, Xort_train, # pylint: disable=W0612
Xort_test) = train_test_split(
Expand Down Expand Up @@ -216,13 +206,6 @@ def _retrieve_problems_extra(model, verbose, fLOG, extended_list):
return problems, extras


def _dictionary2str(di):
el = []
for k in sorted(di):
el.append('{}={}'.format(k, di[k]))
return '/'.join(el)


def _merge_options(all_conv_options, aoptions):
if aoptions is None:
return copy.deepcopy(all_conv_options)
Expand Down Expand Up @@ -753,7 +736,12 @@ def fct_batch(se=sess, xo=Xort_test, it=init_types): # pylint: disable=W0102
if debug and len(debug_exc) == 2:
raise debug_exc[0] # pragma: no cover
if debug and verbose >= 2:
fLOG(pprint.pformat(obs_op))
if verbose >= 3:
fLOG(pprint.pformat(obs_op))
else:
obs_op_log = {k: v for k,
v in obs_op.items() if 'lambda-' not in k}
fLOG(pprint.pformat(obs_op_log))
if verbose >= 2 and fLOG is not None:
fLOG("[enumerate_compatible_opset-R] next...")
if dump_all:
Expand All @@ -766,6 +754,36 @@ def fct_batch(se=sess, xo=Xort_test, it=init_types): # pylint: disable=W0102
return obs_op


def _enumerate_validated_operator_opsets_ops(extended_list, models, skip_models):
    """
    Returns the list of scikit-learn operators to validate.

    @param extended_list extended list of converters if True
    @param models restricts the selection to this set of model
        names (strings), or None for all of them
    @param skip_models model names to exclude, or None
    @return list of operator descriptions (dictionaries with a ``'name'`` key)
    @raise ValueError if *models* is not a set of strings or if it
        filters out every available operator
    """
    ops = [_ for _ in sklearn_operators(extended=extended_list)]

    if models is not None:
        if not all(map(lambda m: isinstance(m, str), models)):
            raise ValueError("models must be a set of strings.")
        ops_ = [_ for _ in ops if _['name'] in models]
        # Bug fix: the emptiness check must apply to the *filtered* list
        # ``ops_`` — ``ops`` is never empty here, so the original test
        # ``len(ops) == 0`` could never fire and an unknown model name
        # silently produced an empty selection.
        if len(ops_) == 0:
            raise ValueError("Parameter models is wrong: {}\n{}".format(
                models, ops[0]))
        ops = ops_
    if skip_models is not None:
        ops = [m for m in ops if m['name'] not in skip_models]
    return ops


def _enumerate_validated_operator_opsets_version(runtime):
    """
    Collects the versions of the packages involved in the validation
    as a dictionary ``{'v_<package>': version}``.
    """
    from numpy import __version__ as numpy_version
    from onnx import __version__ as onnx_version
    from scipy import __version__ as scipy_version
    from skl2onnx import __version__ as skl2onnx_version
    versions_map = {
        'v_numpy': numpy_version,
        'v_onnx': onnx_version,
        'v_scipy': scipy_version,
        'v_skl2onnx': skl2onnx_version,
        'v_sklearn': sklearn_version,
        'v_onnxruntime': ort_version,
    }
    if "onnxruntime" in runtime:
        # Overrides with the installed onnxruntime version when
        # that runtime is part of the requested ones.
        from onnxruntime import __version__ as onnxrt_version
        versions_map['v_onnxruntime'] = onnxrt_version
    return versions_map


def enumerate_validated_operator_opsets(verbose=0, opset_min=-1, opset_max=-1,
check_runtime=True, debug=False, runtime='python',
models=None, dump_folder=None, store_models=False,
Expand Down Expand Up @@ -837,18 +855,8 @@ def enumerate_validated_operator_opsets(verbose=0, opset_min=-1, opset_max=-1,
"""
register_converters()
register_rewritten_operators()
ops = [_ for _ in sklearn_operators(extended=extended_list)]

if models is not None:
if not all(map(lambda m: isinstance(m, str), models)):
raise ValueError("models must be a set of strings.")
ops_ = [_ for _ in ops if _['name'] in models]
if len(ops) == 0:
raise ValueError("Parameter models is wrong: {}\n{}".format(
models, ops[0]))
ops = ops_
if skip_models is not None:
ops = [m for m in ops if m['name'] not in skip_models]
ops = _enumerate_validated_operator_opsets_ops(
extended_list, models, skip_models)

if verbose > 0:

Expand Down Expand Up @@ -880,16 +888,7 @@ def iterate_tqdm():
loop = ops

if versions:
from numpy import __version__ as numpy_version
from onnx import __version__ as onnx_version
from scipy import __version__ as scipy_version
from skl2onnx import __version__ as skl2onnx_version
add_versions = {'v_numpy': numpy_version, 'v_onnx': onnx_version,
'v_scipy': scipy_version, 'v_skl2onnx': skl2onnx_version,
'v_sklearn': sklearn_version, 'v_onnxruntime': ort_version}
if "onnxruntime" in runtime:
from onnxruntime import __version__ as onnxrt_version
add_versions['v_onnxruntime'] = onnxrt_version
add_version = _enumerate_validated_operator_opsets_version(runtime)
else:
add_versions = {}

Expand All @@ -904,6 +903,8 @@ def iterate_tqdm():
for row in loop:

model = row['cl']
if verbose > 1:
fLOG("[enumerate_validated_operator_opsets] - model='{}'".format(model))

for obs in enumerate_compatible_opset(
model, opset_min=opset_min, opset_max=opset_max,
Expand All @@ -917,8 +918,19 @@ def iterate_tqdm():
n_features=n_features, skip_long_test=skip_long_test,
filter_scenario=filter_scenario):

for mandkey in ('inst', 'method_name', 'problem',
'scenario'):
if mandkey not in obs:
raise ValueError("Missing key '{}' in\n{}".format(
mandkey, pprint.pformat(obs)))
if verbose > 1:
fLOG(" ", obs)
fLOG('[enumerate_validated_operator_opsets] - OBS')
if verbose > 2:
fLOG(" ", obs)
else:
obs_log = {k: v for k,
v in obs.items() if 'lambda-' not in k}
fLOG(" ", obs_log)
elif verbose > 0 and "_0problem_exc" in obs:
fLOG(" ???", obs)

Expand Down
22 changes: 22 additions & 0 deletions mlprodict/onnxrt/validate/validate_helper.py
Expand Up @@ -15,6 +15,28 @@
from sklearn import __all__ as sklearn__all__, __version__ as sklearn_version


class RuntimeBadResultsError(RuntimeError):
    """
    Raised when the results are too different from
    :epkg:`scikit-learn`.
    """

    def __init__(self, msg, obs):
        """
        @param msg message to display
        @param obs observations, kept as attribute ``obs``
        """
        super().__init__(msg)
        self.obs = obs


def _dictionary2str(di):
el = []
for k in sorted(di):
el.append('{}={}'.format(k, di[k]))
return '/'.join(el)


def modules_list():
"""
Returns modules and versions currently used.
Expand Down
1 change: 1 addition & 0 deletions mlprodict/onnxrt/validate/validate_problems.py
Expand Up @@ -336,6 +336,7 @@ def _problem_for_clustering_scores(dtype=numpy.float32, n_features=None):
state = numpy.random.RandomState(seed=34) # pylint: disable=E1101
rnd = state.randn(*X.shape) / 3
X += rnd
X = _modify_dimension(X, n_features)
return (X, None, [('X', X[:1].astype(dtype))],
'transform', 1, X.astype(dtype))

Expand Down

0 comments on commit ec96f3e

Please sign in to comment.