fix a few things, add a command line

sdpython · Jul 12, 2019 · d526045 · d526045
1 parent 68060ef
commit d526045
Show file tree

Hide file tree

Showing 6 changed files with 267 additions and 3 deletions.
diff --git a/.gitignore b/.gitignore
@@ -287,3 +287,4 @@ _doc/sphinxdoc/source/*.xlsx
 *.xlsx
 _doc/sphinxdoc/source/api/onnxrt_ops.rst
 _doc/notebooks/model_onnx.onnx
+*.onnx
diff --git a/_unittests/ut_cli/test_cli_convert_validate.py b/_unittests/ut_cli/test_cli_convert_validate.py
@@ -0,0 +1,93 @@
+"""
+@brief      test tree node (time=4s)
+"""
+import os
+import unittest
+import pickle
+import pandas
+from sklearn.datasets import load_iris
+from sklearn.model_selection import train_test_split
+from sklearn.linear_model import LogisticRegression
+from pyquickhelper.loghelper import BufferedPrint
+from pyquickhelper.pycode import ExtTestCase, get_temp_folder
+from mlprodict.__main__ import main
+from mlprodict.cli import convert_validate
+
+
+class TestCliConvertValidate(ExtTestCase):
+
+    def test_cli_convert_validate(self):
+        st = BufferedPrint()
+        main(args=["convert_validate", "--help"], fLOG=st.fprint)
+        res = str(st)
+        self.assertIn("verbose", res)
+
+    def test_convert_validate(self):
+        iris = load_iris()
+        X, y = iris.data, iris.target
+        X_train, X_test, y_train, _ = train_test_split(X, y, random_state=11)
+        clr = LogisticRegression()
+        clr.fit(X_train, y_train)
+
+        temp = get_temp_folder(__file__, "temp_convert_validate")
+        data = os.path.join(temp, "data.csv")
+        pandas.DataFrame(X_test).to_csv(data, index=False)
+        pkl = os.path.join(temp, "model.pkl")
+        with open(pkl, "wb") as f:
+            pickle.dump(clr, f)
+
+        res = convert_validate(pkl=pkl, data=data, verbose=0,
+                               method="predict,predict_proba",
+                               name="output_label,output_probability")
+        self.assertIsInstance(res, dict)
+        self.assertLess(res['metrics'][0], 1e-5)
+        self.assertLess(res['metrics'][1], 1e-5)
+
+        self.assertRaise(lambda: convert_validate(pkl=pkl, data=data, verbose=0,
+                                                  method="predict,predict_proba",
+                                                  name="output_label"), ValueError)
+        self.assertRaise(lambda: convert_validate(pkl=pkl, data=data, verbose=0,
+                                                  method="predict,predict_probas",
+                                                  name="output_label,output_probability"), AttributeError)
+        self.assertRaise(lambda: convert_validate(pkl=pkl, data=data, verbose=0,
+                                                  method="predict,predict_proba",
+                                                  name="output_label,output_probabilities"), KeyError)
+
+        res = convert_validate(pkl=pkl, data=data, verbose=0,
+                               method="predict,predict_proba",
+                               name="output_label,output_probability",
+                               noshape=True)
+        self.assertIsInstance(res, dict)
+        self.assertLess(res['metrics'][0], 1e-5)
+        self.assertLess(res['metrics'][1], 1e-5)
+
+    def test_cli_convert_validater_run(self):
+        iris = load_iris()
+        X, y = iris.data, iris.target
+        X_train, X_test, y_train, _ = train_test_split(X, y, random_state=11)
+        clr = LogisticRegression()
+        clr.fit(X_train, y_train)
+
+        temp = get_temp_folder(__file__, "temp_cli_convert_validate")
+        data = os.path.join(temp, "data.csv")
+        pandas.DataFrame(X_test).to_csv(data, index=False)
+        pkl = os.path.join(temp, "model.pkl")
+        with open(pkl, "wb") as f:
+            pickle.dump(clr, f)
+
+        res = convert_validate(pkl=pkl, data=data, verbose=0,
+                               method="predict,predict_proba",
+                               name="output_label,output_probability")
+        st = BufferedPrint()
+        args = ["convert_validate", "--pkl", pkl, '--data', data,
+                '--method', "predict,predict_proba",
+                '--name', "output_label,output_probability",
+                '--verbose', '1']
+        main(args, fLOG=st.fprint)
+        res = str(st)
+        self.assertIn(
+            "[convert_validate] compute predictions with method 'predict_proba'", res)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/mlprodict/__main__.py b/mlprodict/__main__.py
@@ -17,10 +17,13 @@ def main(args, fLOG=print):
     """
     try:
         from .cli.validate import validate_runtime
+        from .cli.convert_validate import convert_validate
     except ImportError:
         from mlprodict.cli.validate import validate_runtime
+        from mlprodict.cli.convert_validate import convert_validate
 
-    fcts = dict(validate_runtime=validate_runtime)
+    fcts = dict(validate_runtime=validate_runtime,
+                convert_validate=convert_validate)
     from pyquickhelper.cli import cli_main_helper
     return cli_main_helper(fcts, args=args, fLOG=fLOG)
 

diff --git a/mlprodict/cli/__init__.py b/mlprodict/cli/__init__.py
@@ -2,5 +2,5 @@
 @file
 @brief Shortcut to *cli*.
 """
-
+from .convert_validate import convert_validate
 from .validate import validate_runtime
diff --git a/mlprodict/cli/convert_validate.py b/mlprodict/cli/convert_validate.py
@@ -0,0 +1,167 @@
+"""
+@file
+@brief Command line about validation of prediction runtime.
+"""
+import os
+import pickle
+from logging import getLogger
+import warnings
+import numpy
+from pandas import read_csv
+from skl2onnx.common.data_types import FloatTensorType
+from skl2onnx import to_onnx
+from ..onnxrt import OnnxInference
+from ..onnxrt.validate_difference import measure_relative_difference
+
+
+def convert_validate(pkl, data, method="predict",
+                     name='Y', outonnx="model.onnx",
+                     runtime='python', metric="l1med",
+                     noshape=False, fLOG=print, verbose=1):
+    """
+    Converts a model stored in *pkl* file and measure the differences
+    between the model and the ONNX predictions.
+
+    :param pkl: pickle file
+    :param data: data file, loaded with pandas,
+        converted to a single array
+    :param method: method to call
+    :param name: output name
+    :param outonnx: produced ONNX model
+    :param runtime: runtime to use to compute predictions,
+        'python', 'onnxruntime1' or 'onnxruntime2'
+    :param metric: the metric 'l1med' is given by function
+        :func:`measure_relative_difference
+        <mlprodict.onnxrt.validate_difference.measure_relative_difference>`
+    :param noshape: run the conversion with no shape information
+    :param verbose: verbose level
+    :param fLOG: logging function
+    :return: a dictionary with the results
+
+    .. cmdref::
+        :title: Convert and compare an ONNX file
+        :cmd: -m mlprodict convert_validate --help
+        :lid: l-cmd-convert_validate
+
+        The command converts and validates a :epkg:`scikit-learn` model.
+        An example to check the prediction of a logistic regression.
+
+        ::
+
+            import os
+            import pickle
+            import pandas
+            from sklearn.datasets import load_iris
+            from sklearn.model_selection import train_test_split
+            from sklearn.linear_model import LogisticRegression
+            from mlprodict.__main__ import main
+            from mlprodict.cli import convert_validate
+
+            iris = load_iris()
+            X, y = iris.data, iris.target
+            X_train, X_test, y_train, _ = train_test_split(X, y, random_state=11)
+            clr = LogisticRegression()
+            clr.fit(X_train, y_train)
+
+            pandas.DataFrame(X_test).to_csv("data.csv", index=False)
+            with open("model.pkl", "wb") as f:
+                pickle.dump(clr, f)
+
+        And the command line to check the predictions
+        using a command line.
+
+        ::
+
+            convert_validate --pkl model.pkl --data data.csv
+                             --method predict,predict_proba
+                             --name output_label,output_probability
+                             --verbose 1
+    """
+    if fLOG is None:
+        verbose = 0
+    if verbose == 0:
+        logger = getLogger('skl2onnx')
+        logger.disabled = True
+    if not os.path.exists(pkl):
+        raise FileNotFoundError("Unable to find model '{}'.".format(pkl))
+    if not os.path.exists(data):
+        raise FileNotFoundError("Unable to find data '{}'.".format(data))
+    if os.path.exists(outonnx):
+        warnings.warn("File '{}' will be overwritten.".format(outonnx))
+    if verbose > 0:
+        fLOG("[convert_validate] load model '{}'".format(pkl))
+    with open(pkl, "rb") as f:
+        model = pickle.load(f)
+    if verbose > 0:
+        fLOG("[convert_validate] load data '{}'".format(data))
+    df = read_csv(data)
+    if verbose > 0:
+        fLOG("[convert_validate] convert data into matrix")
+    numerical = df.values.astype(numpy.float32)
+    if noshape:
+        if verbose > 0:
+            fLOG("[convert_validate] convert the model with no shape information")
+        onx = to_onnx(model, initial_types=[
+                      ('X', FloatTensorType(['dim1', 'dim2']))])
+    else:
+        if verbose > 0:
+            fLOG("[convert_validate] convert the model with shapes")
+        onx = to_onnx(model, numerical)
+    if verbose > 0:
+        fLOG("[convert_validate] saves to '{}'".format(outonnx))
+    memory = onx.SerializeToString()
+    with open(outonnx, 'wb') as f:
+        f.write(memory)
+
+    if verbose > 0:
+        fLOG("[convert_validate] creates OnnxInference session")
+    sess = OnnxInference(onx, runtime=runtime)
+
+    if verbose > 0:
+        fLOG("[convert_validate] compute prediction from model")
+
+    if ',' in method:
+        methods = method.split(',')
+    else:
+        methods = [method]
+    if ',' in name:
+        names = name.split(',')
+    else:
+        names = [name]
+
+    if len(names) != len(methods):
+        raise ValueError("Number of methods and outputs do not match: {}, {}".format(
+            names, methods))
+
+    if metric != 'l1med':
+        raise ValueError("Unknown metric '{}'".format(metric))
+
+    if verbose > 0:
+        fLOG("[convert_validate] compute predictions from ONNX with name '{}'".format(
+            name))
+    ort_preds = sess.run({'X': numerical})
+
+    metrics = []
+    out_skl_preds = []
+    out_ort_preds = []
+    for method_, name_ in zip(methods, names):
+        if verbose > 0:
+            fLOG("[convert_validate] compute predictions with method '{}'".format(
+                method_))
+        meth = getattr(model, method_)
+        skl_pred = meth(numerical)
+        out_skl_preds.append(skl_pred)
+
+        if name_ not in ort_preds:
+            raise KeyError("Unable to find output name '{}' in {}".format(
+                name_, list(sorted(ort_preds))))
+
+        ort_pred = ort_preds[name_]
+        out_ort_preds.append(ort_pred)
+        diff = measure_relative_difference(skl_pred, ort_pred)
+        if verbose > 0:
+            fLOG("[convert_validate] {}={}".format(metric, diff))
+        metrics.append(diff)
+
+    return dict(skl_pred=out_skl_preds, ort_pred=out_ort_preds,
+                metrics=metrics, onnx=memory)
diff --git a/mlprodict/cli/validate.py b/mlprodict/cli/validate.py
@@ -52,7 +52,7 @@ def validate_runtime(verbose=1, opset_min=9, opset_max="",
     :param fLOG: logging function
 
     .. cmdref::
-        :title: Valide a runtime against scikit-learn
+        :title: Validate a runtime against scikit-learn
         :cmd: -m mlprodict validate_runtime --help
         :lid: l-cmd-validate_runtime