From 8744e22021325adf9058020139268cd878916abe Mon Sep 17 00:00:00 2001 From: xadupre Date: Mon, 28 Nov 2022 17:03:57 +0100 Subject: [PATCH 01/10] Update plot_benchmark_eager_mode.py --- _doc/examples/plot_benchmark_eager_mode.py | 149 ++++++++++++++++++--- 1 file changed, 130 insertions(+), 19 deletions(-) diff --git a/_doc/examples/plot_benchmark_eager_mode.py b/_doc/examples/plot_benchmark_eager_mode.py index ab01136..5e07be9 100644 --- a/_doc/examples/plot_benchmark_eager_mode.py +++ b/_doc/examples/plot_benchmark_eager_mode.py @@ -20,7 +20,8 @@ It is possible to do the same with :epkg:`onnxruntime`. This example compares the performance of a couple of scenarios. This work is close to what is done in example -:ref:`benchmark-ort-api`. +:ref:`benchmark-ort-api`. The example compares the performance +of a couple of methods for CPU and GPU. .. contents:: :local: @@ -46,10 +47,16 @@ make_model, make_node, make_graph, make_tensor_value_info) from onnxruntime import ( - get_all_providers, InferenceSession, __version__ as ort_version) + get_all_providers, InferenceSession, __version__ as ort_version, + RunOptions) from onnxruntime.capi._pybind_state import ( # pylint: disable=E0611 OrtDevice as C_OrtDevice, OrtMemType, OrtValue as C_OrtValue) +try: + from onnxruntime.capi._pybind_state import OrtValueVector +except ImportError: + # You need onnxruntime>=1.14 + OrtValueVector = None from mlprodict.testing.experimental_c_impl.experimental_c import code_optimisation ############################################ @@ -143,8 +150,48 @@ def f_ort_ov(X): Z = sess_add2._sess.run_with_ort_values({'X': X}, ['Z'], None)[0] return Z +####################################### +# onnxruntime >= 1.14 introduces a vector of OrtValues +# to bypass the building of a dictionary. + + +if OrtValueVector is not None: + + run_options = RunOptions() + devices = [C_OrtDevice(C_OrtDevice.cpu(), OrtMemType.DEFAULT, 0)] + + def f_ort_vect_ov_eager(X): + "ort-vect-ov-eager" + vect_in = OrtValueVector() + vect_in.push_back(X) + vect_out = OrtValueVector() + sess_add._sess.run_with_ortvaluevector( + run_options, ["X"], vect_in, ["Z"], vect_out, devices) + vect_out2 = OrtValueVector() + sess_add._sess.run_with_ortvaluevector( + run_options, ["X"], vect_out, ["Z"], vect_out2, devices) + assert len(vect_out2) == 1 + return vect_out2[0] + + def f_ort_vect_ov(X): + "ort-vect-ov" + vect_in = OrtValueVector() + vect_in.push_back(X) + vect_out = OrtValueVector() + sess_add2._sess.run_with_ortvaluevector( + run_options, ["X"], vect_in, ["Z"], vect_out, devices) + assert len(vect_out) == 1 + return vect_out[0] + +else: + f_ort_vect_ov_eager = None + f_ort_vect_ov = None + +######################################### +# If GPU is available. if sess_add_gpu is not None: + # def f_ort_ov_eager_gpu(X): "ort-ov-eager-gpu" @@ -157,10 +204,46 @@ def f_ort_ov_gpu(X): Z = sess_add2_gpu._sess.run_with_ort_values({'X': X}, ['Z'], None)[0] return Z + if OrtValueVector is not None: + + run_options = RunOptions() + devices = [C_OrtDevice(C_OrtDevice.cuda(), OrtMemType.DEFAULT, 0)] + + def f_ort_vect_ov_eager_gpu(X): + "ort-vect-ov-eager-gpu" + vect_in = OrtValueVector() + vect_in.push_back(X) + vect_out = OrtValueVector() + sess_add._sess.run_with_ortvaluevector( + run_options, ["X"], vect_in, ["Z"], vect_out, devices) + vect_out2 = OrtValueVector() + sess_add._sess.run_with_ortvaluevector( + run_options, ["X"], vect_out, ["Z"], vect_out2, devices) + assert len(vect_out2) == 1 + return vect_out2[0] + + def f_ort_vect_ov_gpu(X): + "ort-vect-ov-gpu" + vect_in = OrtValueVector() + vect_in.push_back(X) + vect_out = OrtValueVector() + sess_add2._sess.run_with_ortvaluevector( + run_options, ["X"], vect_in, ["Z"], vect_out, devices) + assert len(vect_out) == 1 + return vect_out[0] + + else: + f_ort_vect_ov_eager = None + f_ort_vect_ov = None + else: f_ort_ov_eager_gpu = None f_ort_ov_gpu = None + +####################################### +# Let's now check all these functions produces the same results. + X = numpy.random.rand(10, CST.shape[1]).astype(CST.dtype) device = C_OrtDevice(C_OrtDevice.cpu(), OrtMemType.DEFAULT, 0) @@ -173,6 +256,13 @@ def f_ort_ov_gpu(X): f_ort_ov_eager(Xov), f_ort_ov(Xov), ] + +if OrtValueVector is not None: + Ys.extend([ + f_ort_vect_ov_eager(Xov), + f_ort_vect_ov(Xov), + ]) + if sess_add_gpu is not None: device_gpu = C_OrtDevice(C_OrtDevice.cuda(), OrtMemType.DEFAULT, 0) try: @@ -181,6 +271,11 @@ def f_ort_ov_gpu(X): f_ort_ov_eager_gpu(Xov_gpu), f_ort_ov_gpu(Xov_gpu), ]) + if OrtValueVector is not None: + Ys.extend([ + f_ort_vect_ov_eager_gpu(Xov), + f_ort_vect_ov_gpu(Xov), + ]) except RuntimeError: # cuda is not available sess_add_gpu = None @@ -205,6 +300,7 @@ def f_ort_ov_gpu(X): def benchmark(repeat=100): fcts = [f_numpy, f_ort_eager, f_ort, f_ort_ov_eager, f_ort_ov, + f_ort_vect_ov_eager, f_ort_vect_ov, f_ort_ov_eager_gpu, f_ort_ov_gpu] data = [] for N in tqdm([1, 2, 5, 10, 20, 50, 100, 200, 500, @@ -251,27 +347,42 @@ def benchmark(repeat=100): # ++++++ def make_graph(df): - fig, ax = plt.subplots(2, 3, figsize=(12, 8)) + + def subgraph(row, cols, title): + if "numpy" not in cols: + cols.append("numpy") + piv = piv_all[cols].copy() + piv.plot(ax=ax[row, 0], title=title, logy=True, logx=True) + piv2 = piv / piv.index.values.reshape((-1, 1)) + piv2.plot(ax=ax[row, 1], title=f"Time(s) per execution / N", logx=True) + piv3 = piv / piv["numpy"].values.reshape((-1, 1)) + piv3.plot(ax=ax[row, 2], title="Ratio against numpy (lower is better", + logy=True, logx=True) + for j in range(0, 3): + ax[row, j].legend(fontsize="x-small") + + fig, ax = plt.subplots(3, 3, figsize=(12, 8)) + fig.suptitle("Time execution Eager Add + Add") piv_all = df.pivot(index="N", columns="name", values="time") + print(piv_all.columns) + + # no gpu, no vect + subgraph(0, [c for c in piv_all.columns + if "-gpu" not in c and "-vect" not in c], + title="CPU") # no gpu - piv = piv_all[[c for c in piv_all.columns if "gpu" not in c]].copy() - piv.plot(ax=ax[0, 0], title="Time(s) per execution", logy=True, logx=True) - piv2 = piv / piv.index.values.reshape((-1, 1)) - piv2.plot(ax=ax[0, 1], title="Time(s) per execution / N", logx=True) - piv3 = piv / piv["numpy"].values.reshape((-1, 1)) - piv3.plot(ax=ax[0, 2], title="Ratio against numpy (lower is better)", - logy=True, logx=True) - - # ort value - piv = piv_all[[c for c in piv_all.columns if "ov" in c or "numpy" in c]].copy() - piv.plot(ax=ax[1, 0], title="Time(s) per execution", logy=True, logx=True) - piv2 = piv / piv.index.values.reshape((-1, 1)) - piv2.plot(ax=ax[1, 1], title="Time(s) per execution / N", logx=True) - piv3 = piv / piv["numpy"].values.reshape((-1, 1)) - piv3.plot(ax=ax[1, 2], title="Ratio against numpy (lower is better)", - logy=True, logx=True) + subgraph(1, [c for c in piv_all.columns + if "-gpu" not in c and "-ov" in c], + title="CPU, OrtValue and OrtValueVector") + + # gpu + cols = [c for c in piv_all.columns if "-gpu" in c and "-ov" in c] + subgraph(2, cols, + title="GPU, OrtValue and OrtValueVector") + fig.savefig("eager_mode_cpu.png" if len(cols) == 0 + else "eager_mode_gpu.png", dpi=250) return fig, ax From 432e13e805ed69a45d676fce9cac186cacb3f934 Mon Sep 17 00:00:00 2001 From: Xavier Dupre Date: Mon, 28 Nov 2022 16:28:04 +0000 Subject: [PATCH 02/10] fix example --- _doc/examples/plot_benchmark_eager_mode.py | 70 ++++++++++++---------- 1 file changed, 38 insertions(+), 32 deletions(-) diff --git a/_doc/examples/plot_benchmark_eager_mode.py b/_doc/examples/plot_benchmark_eager_mode.py index 5e07be9..4e87355 100644 --- a/_doc/examples/plot_benchmark_eager_mode.py +++ b/_doc/examples/plot_benchmark_eager_mode.py @@ -157,6 +157,7 @@ def f_ort_ov(X): if OrtValueVector is not None: + vect_out = OrtValueVector() run_options = RunOptions() devices = [C_OrtDevice(C_OrtDevice.cpu(), OrtMemType.DEFAULT, 0)] @@ -164,20 +165,19 @@ def f_ort_vect_ov_eager(X): "ort-vect-ov-eager" vect_in = OrtValueVector() vect_in.push_back(X) - vect_out = OrtValueVector() + temp_vect_out = OrtValueVector() sess_add._sess.run_with_ortvaluevector( - run_options, ["X"], vect_in, ["Z"], vect_out, devices) - vect_out2 = OrtValueVector() + run_options, ["X"], vect_in, ["Z"], temp_vect_out, devices) + assert len(temp_vect_out) == 1 sess_add._sess.run_with_ortvaluevector( - run_options, ["X"], vect_out, ["Z"], vect_out2, devices) - assert len(vect_out2) == 1 - return vect_out2[0] + run_options, ["X"], temp_vect_out, ["Z"], vect_out, devices) + assert len(vect_out) == 1 + return vect_out[0] def f_ort_vect_ov(X): "ort-vect-ov" vect_in = OrtValueVector() vect_in.push_back(X) - vect_out = OrtValueVector() sess_add2._sess.run_with_ortvaluevector( run_options, ["X"], vect_in, ["Z"], vect_out, devices) assert len(vect_out) == 1 @@ -213,32 +213,32 @@ def f_ort_vect_ov_eager_gpu(X): "ort-vect-ov-eager-gpu" vect_in = OrtValueVector() vect_in.push_back(X) - vect_out = OrtValueVector() + temp_vect_out = OrtValueVector() sess_add._sess.run_with_ortvaluevector( - run_options, ["X"], vect_in, ["Z"], vect_out, devices) - vect_out2 = OrtValueVector() + run_options, ["X"], vect_in, ["Z"], temp_vect_out, devices) sess_add._sess.run_with_ortvaluevector( - run_options, ["X"], vect_out, ["Z"], vect_out2, devices) - assert len(vect_out2) == 1 - return vect_out2[0] + run_options, ["X"], temp_vect_out, ["Z"], vect_out, devices) + assert len(vect_out) == 1 + return vect_out[0] def f_ort_vect_ov_gpu(X): "ort-vect-ov-gpu" vect_in = OrtValueVector() vect_in.push_back(X) - vect_out = OrtValueVector() sess_add2._sess.run_with_ortvaluevector( run_options, ["X"], vect_in, ["Z"], vect_out, devices) assert len(vect_out) == 1 return vect_out[0] else: - f_ort_vect_ov_eager = None - f_ort_vect_ov = None + f_ort_vect_ov_eager_gpu = None + f_ort_vect_ov_gpu = None else: f_ort_ov_eager_gpu = None f_ort_ov_gpu = None + f_ort_vect_ov_eager_gpu = None + f_ort_vect_ov_gpu = None ####################################### @@ -250,17 +250,17 @@ def f_ort_vect_ov_gpu(X): Xov = C_OrtValue.ortvalue_from_numpy(X, device) Ys = [ - f_numpy(X), - f_ort_eager(X), - f_ort(X), - f_ort_ov_eager(Xov), - f_ort_ov(Xov), + (f_numpy, X), + (f_ort_eager, X), + (f_ort, X), + (f_ort_ov_eager, Xov), + (f_ort_ov, Xov), ] if OrtValueVector is not None: Ys.extend([ - f_ort_vect_ov_eager(Xov), - f_ort_vect_ov(Xov), + (f_ort_vect_ov_eager, Xov), + (f_ort_vect_ov, Xov), ]) if sess_add_gpu is not None: @@ -268,13 +268,13 @@ def f_ort_vect_ov_gpu(X): try: Xov_gpu = C_OrtValue.ortvalue_from_numpy(X, device_gpu) Ys.extend([ - f_ort_ov_eager_gpu(Xov_gpu), - f_ort_ov_gpu(Xov_gpu), + (f_ort_ov_eager_gpu, Xov_gpu), + (f_ort_ov_gpu, Xov_gpu), ]) if OrtValueVector is not None: Ys.extend([ - f_ort_vect_ov_eager_gpu(Xov), - f_ort_vect_ov_gpu(Xov), + (f_ort_vect_ov_eager_gpu, Xov_gpu), + (f_ort_vect_ov_gpu, Xov_gpu), ]) except RuntimeError: # cuda is not available @@ -283,12 +283,17 @@ def f_ort_vect_ov_gpu(X): f_ort_ov_eager_gpu = None f_ort_ov_gpu = None -for i in range(1, len(Ys)): +results = [] +for fct, x in Ys: + print(f"check function {fct.__name__!r} and input type {x.__class__.__name__!r}") + results.append(fct(x)) + +for i in range(1, len(results)): try: - assert_allclose(Ys[0], Ys[i]) + assert_allclose(results[0], results[i]) except TypeError: # OrtValue - assert_allclose(Ys[0], Ys[i].numpy()) + assert_allclose(results[0], results[i].numpy()) ########################################## # All outputs are the same. @@ -301,7 +306,8 @@ def f_ort_vect_ov_gpu(X): def benchmark(repeat=100): fcts = [f_numpy, f_ort_eager, f_ort, f_ort_ov_eager, f_ort_ov, f_ort_vect_ov_eager, f_ort_vect_ov, - f_ort_ov_eager_gpu, f_ort_ov_gpu] + f_ort_ov_eager_gpu, f_ort_ov_gpu, + f_ort_vect_ov_gpu, f_ort_vect_ov_eager_gpu] data = [] for N in tqdm([1, 2, 5, 10, 20, 50, 100, 200, 500, 1000, 2000, 5000, 10000, 20000]): @@ -315,6 +321,7 @@ def benchmark(repeat=100): for f in fcts: if f is None: continue + print(N, f) obs = {'name': f.__doc__, "N": N} if "-gpu" in f.__doc__: begin = time.perf_counter() @@ -365,7 +372,6 @@ def subgraph(row, cols, title): fig.suptitle("Time execution Eager Add + Add") piv_all = df.pivot(index="N", columns="name", values="time") - print(piv_all.columns) # no gpu, no vect subgraph(0, [c for c in piv_all.columns From 299e11797a398ad2715a5071ff005db0b71a2c11 Mon Sep 17 00:00:00 2001 From: Xavier Dupre Date: Mon, 28 Nov 2022 16:30:28 +0000 Subject: [PATCH 03/10] fix example --- _doc/examples/plot_benchmark_eager_mode.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/_doc/examples/plot_benchmark_eager_mode.py b/_doc/examples/plot_benchmark_eager_mode.py index 4e87355..961ca8a 100644 --- a/_doc/examples/plot_benchmark_eager_mode.py +++ b/_doc/examples/plot_benchmark_eager_mode.py @@ -157,7 +157,6 @@ def f_ort_ov(X): if OrtValueVector is not None: - vect_out = OrtValueVector() run_options = RunOptions() devices = [C_OrtDevice(C_OrtDevice.cpu(), OrtMemType.DEFAULT, 0)] @@ -165,6 +164,7 @@ def f_ort_vect_ov_eager(X): "ort-vect-ov-eager" vect_in = OrtValueVector() vect_in.push_back(X) + vect_out = OrtValueVector() temp_vect_out = OrtValueVector() sess_add._sess.run_with_ortvaluevector( run_options, ["X"], vect_in, ["Z"], temp_vect_out, devices) @@ -178,6 +178,7 @@ def f_ort_vect_ov(X): "ort-vect-ov" vect_in = OrtValueVector() vect_in.push_back(X) + vect_out = OrtValueVector() sess_add2._sess.run_with_ortvaluevector( run_options, ["X"], vect_in, ["Z"], vect_out, devices) assert len(vect_out) == 1 @@ -213,6 +214,7 @@ def f_ort_vect_ov_eager_gpu(X): "ort-vect-ov-eager-gpu" vect_in = OrtValueVector() vect_in.push_back(X) + vect_out = OrtValueVector() temp_vect_out = OrtValueVector() sess_add._sess.run_with_ortvaluevector( run_options, ["X"], vect_in, ["Z"], temp_vect_out, devices) @@ -225,6 +227,8 @@ def f_ort_vect_ov_gpu(X): "ort-vect-ov-gpu" vect_in = OrtValueVector() vect_in.push_back(X) + vect_out = OrtValueVector() + # crashes on the next line sess_add2._sess.run_with_ortvaluevector( run_options, ["X"], vect_in, ["Z"], vect_out, devices) assert len(vect_out) == 1 @@ -273,8 +277,8 @@ def f_ort_vect_ov_gpu(X): ]) if OrtValueVector is not None: Ys.extend([ - (f_ort_vect_ov_eager_gpu, Xov_gpu), (f_ort_vect_ov_gpu, Xov_gpu), + (f_ort_vect_ov_eager_gpu, Xov_gpu), ]) except RuntimeError: # cuda is not available From 6e6cea656bed363245bc50248654a61c5eb428c8 Mon Sep 17 00:00:00 2001 From: xadupre Date: Mon, 28 Nov 2022 18:27:32 +0100 Subject: [PATCH 04/10] lint --- _doc/examples/plot_benchmark_eager_mode.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/_doc/examples/plot_benchmark_eager_mode.py b/_doc/examples/plot_benchmark_eager_mode.py index 961ca8a..ed2c8f4 100644 --- a/_doc/examples/plot_benchmark_eager_mode.py +++ b/_doc/examples/plot_benchmark_eager_mode.py @@ -289,7 +289,8 @@ def f_ort_vect_ov_gpu(X): results = [] for fct, x in Ys: - print(f"check function {fct.__name__!r} and input type {x.__class__.__name__!r}") + print( + f"check function {fct.__name__!r} and input type {x.__class__.__name__!r}") results.append(fct(x)) for i in range(1, len(results)): From bee3822639b0b394f6f2128c77b1924e931564ef Mon Sep 17 00:00:00 2001 From: Xavier Dupre Date: Mon, 28 Nov 2022 18:06:15 +0000 Subject: [PATCH 05/10] example --- _doc/examples/plot_benchmark_eager_mode.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/_doc/examples/plot_benchmark_eager_mode.py b/_doc/examples/plot_benchmark_eager_mode.py index ed2c8f4..b9af949 100644 --- a/_doc/examples/plot_benchmark_eager_mode.py +++ b/_doc/examples/plot_benchmark_eager_mode.py @@ -368,13 +368,13 @@ def subgraph(row, cols, title): piv2 = piv / piv.index.values.reshape((-1, 1)) piv2.plot(ax=ax[row, 1], title=f"Time(s) per execution / N", logx=True) piv3 = piv / piv["numpy"].values.reshape((-1, 1)) - piv3.plot(ax=ax[row, 2], title="Ratio against numpy (lower is better", + piv3.plot(ax=ax[row, 2], title="Ratio against numpy", logy=True, logx=True) for j in range(0, 3): ax[row, j].legend(fontsize="x-small") fig, ax = plt.subplots(3, 3, figsize=(12, 8)) - fig.suptitle("Time execution Eager Add + Add") + fig.suptitle("Time execution Eager Add + Add - lower is better") piv_all = df.pivot(index="N", columns="name", values="time") From a91c3d15389e7aa4ccb0477e9ac9216e9daeb903 Mon Sep 17 00:00:00 2001 From: xadupre Date: Tue, 29 Nov 2022 08:57:12 +0100 Subject: [PATCH 06/10] Update index.rst --- .../source/tutorials/tutorial_parallel/index.rst | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/_doc/sphinxdoc/source/tutorials/tutorial_parallel/index.rst b/_doc/sphinxdoc/source/tutorials/tutorial_parallel/index.rst index 57a3f61..9965c36 100644 --- a/_doc/sphinxdoc/source/tutorials/tutorial_parallel/index.rst +++ b/_doc/sphinxdoc/source/tutorials/tutorial_parallel/index.rst @@ -26,19 +26,18 @@ The tutorial was tested with following version: .. runpython:: :showcode: + import sys import numpy import scipy import onnx import onnxruntime import onnxcustom + import sklearn import torch print("python {}".format(sys.version_info)) - mods = [numpy, scipy, sklearn, lightgbm, xgboost, - onnx, onnxmltools, onnxruntime, onnxcustom, - onnxconverter_common, - skl2onnx, mlprodict, pyquickhelper, - torch] + mods = [numpy, scipy, sklearn, onnx, + onnxruntime, onnxcustom, torch] mods = [(m.__name__, m.__version__) for m in mods] mx = max(len(_[0]) for _ in mods) + 1 for name, vers in sorted(mods): From 1f6302a66a3f5817a288efa8cd760eaf184d99d3 Mon Sep 17 00:00:00 2001 From: Xavier Dupre Date: Tue, 29 Nov 2022 08:49:24 +0000 Subject: [PATCH 07/10] final update --- .../data/plot_benchmark_eager_mode.csv | 155 ++++++++++++++++++ _doc/examples/plot_benchmark_eager_mode.py | 17 +- 2 files changed, 167 insertions(+), 5 deletions(-) create mode 100644 _doc/examples/data/plot_benchmark_eager_mode.csv diff --git a/_doc/examples/data/plot_benchmark_eager_mode.csv b/_doc/examples/data/plot_benchmark_eager_mode.csv new file mode 100644 index 0000000..93985cd --- /dev/null +++ b/_doc/examples/data/plot_benchmark_eager_mode.csv @@ -0,0 +1,155 @@ +name,N,time +numpy,1,1.5889492351561784e-06 +ort-eager,1,1.8746450077742338e-05 +ort,1,9.613720467314124e-06 +ort-ov-eager,1,1.5422540018334983e-05 +ort-ov,1,8.95573990419507e-06 +ort-vect-ov-eager,1,1.614851993508637e-05 +ort-vect-ov,1,1.1713659623637796e-05 +ort-ov-eager-gpu,1,8.771141991019249e-05 +ort-ov-gpu,1,4.501666990108788e-05 +ort-vect-ov-gpu,1,3.254704992286861e-05 +ort-vect-ov-eager-gpu,1,4.816057975403964e-05 +numpy,2,3.187909023836255e-06 +ort-eager,2,1.9348430214449763e-05 +ort,2,1.0068699484691024e-05 +ort-ov-eager,2,1.546354033052921e-05 +ort-ov,2,9.566720109432936e-06 +ort-vect-ov-eager,2,1.6886509256437422e-05 +ort-vect-ov,2,1.400659093633294e-05 +ort-ov-eager-gpu,2,8.901738096028566e-05 +ort-ov-gpu,2,4.58646600600332e-05 +ort-vect-ov-gpu,2,3.3805009443312884e-05 +ort-vect-ov-eager-gpu,2,4.979153978638351e-05 +numpy,5,3.660890506580472e-06 +ort-eager,5,2.0106409210711717e-05 +ort,5,1.145766000263393e-05 +ort-ov-eager,5,1.6180520178750156e-05 +ort-ov,5,9.363730205222964e-06 +ort-vect-ov-eager,5,1.7814469756558538e-05 +ort-vect-ov,5,1.1610660003498196e-05 +ort-ov-eager-gpu,5,8.837740053422749e-05 +ort-ov-gpu,5,4.6353639336302874e-05 +ort-vect-ov-gpu,5,3.280502976849675e-05 +ort-vect-ov-eager-gpu,5,4.807559074833989e-05 +numpy,10,4.793859552592039e-06 +ort-eager,10,1.981140929274261e-05 +ort,10,1.1489660246297716e-05 +ort-ov-eager,10,1.75704900175333e-05 +ort-ov,10,1.0141700040549039e-05 +ort-vect-ov-eager,10,1.800447003915906e-05 +ort-vect-ov,10,1.2603630311787129e-05 +ort-ov-eager-gpu,10,9.04553395230323e-05 +ort-ov-gpu,10,4.520967020653188e-05 +ort-vect-ov-gpu,10,3.2696030102670194e-05 +ort-vect-ov-eager-gpu,10,6.418311037123203e-05 +numpy,20,5.4798403289169074e-06 +ort-eager,20,2.366730011999607e-05 +ort,20,1.4248579973354935e-05 +ort-ov-eager,20,1.8401460256427526e-05 +ort-ov,20,1.2100640451535583e-05 +ort-vect-ov-eager,20,1.9405430648475886e-05 +ort-vect-ov,20,1.3894590083509683e-05 +ort-ov-eager-gpu,20,9.494220954366028e-05 +ort-ov-gpu,20,4.660362959839404e-05 +ort-vect-ov-gpu,20,3.3001030096784234e-05 +ort-vect-ov-eager-gpu,20,4.779958981089294e-05 +numpy,50,9.599719196558e-06 +ort-eager,50,2.8269169852137566e-05 +ort,50,1.7217489657923578e-05 +ort-ov-eager,50,2.1917350823059676e-05 +ort-ov,50,1.4976559905335307e-05 +ort-vect-ov-eager,50,2.3295320570468902e-05 +ort-vect-ov,50,1.7603479791432618e-05 +ort-ov-eager-gpu,50,0.0001088908000383526 +ort-ov-gpu,50,4.983353079296649e-05 +ort-vect-ov-gpu,50,3.4302989952266217e-05 +ort-vect-ov-eager-gpu,50,4.907056107185781e-05 +numpy,100,1.5986530343070626e-05 +ort-eager,100,3.6064939340576526e-05 +ort,100,2.423427999019623e-05 +ort-ov-eager,100,2.781018032692373e-05 +ort-ov,100,2.0978389075025915e-05 +ort-vect-ov-eager,100,2.973912050947547e-05 +ort-vect-ov,100,2.3074320051819085e-05 +ort-ov-eager-gpu,100,0.0001250813202932477 +ort-ov-gpu,100,5.6498339399695394e-05 +ort-vect-ov-gpu,100,3.4142000367864965e-05 +ort-vect-ov-eager-gpu,100,4.825657932087779e-05 +numpy,200,2.8386160265654327e-05 +ort-eager,200,5.3042439976707104e-05 +ort,200,3.7272900808602575e-05 +ort-ov-eager,200,4.0436809649690984e-05 +ort-ov,200,3.121608984656632e-05 +ort-vect-ov-eager,200,4.1092789033427834e-05 +ort-vect-ov,200,3.298502997495234e-05 +ort-ov-eager-gpu,200,0.0001585603307466954 +ort-ov-gpu,200,6.756402086466551e-05 +ort-vect-ov-gpu,200,3.387400065548718e-05 +ort-vect-ov-eager-gpu,200,4.802458919584751e-05 +numpy,500,6.516507943160832e-05 +ort-eager,500,0.00010077803977765143 +ort,500,7.354783010669053e-05 +ort-ov-eager,500,6.849798955954611e-05 +ort-ov,500,5.758829996921122e-05 +ort-vect-ov-eager,500,6.897097919136285e-05 +ort-vect-ov,500,5.866626976057887e-05 +ort-ov-eager-gpu,500,0.00024823170038871465 +ort-ov-gpu,500,9.84691095072776e-05 +ort-vect-ov-gpu,500,3.38930101133883e-05 +ort-vect-ov-eager-gpu,500,5.276144947856665e-05 +numpy,1000,0.00012337637948803604 +ort-eager,1000,0.00014440876082517207 +ort,1000,9.04593407176435e-05 +ort-ov-eager,1000,7.919566938653588e-05 +ort-ov,1000,0.00010468191932886839 +ort-vect-ov-eager,1000,0.00018712750053964555 +ort-vect-ov,1000,5.026051891036332e-05 +ort-ov-eager-gpu,1000,0.000405196089996025 +ort-ov-gpu,1000,0.00014712668024003506 +ort-vect-ov-gpu,1000,3.8283870089799166e-05 +ort-vect-ov-eager-gpu,1000,5.308343912474811e-05 +numpy,2000,0.0002513446100056171 +ort-eager,2000,0.00024444581009447575 +ort,2000,0.00018896444933488965 +ort-ov-eager,2000,8.169659995473921e-05 +ort-ov,2000,5.474239005707204e-05 +ort-vect-ov-eager,2000,0.00011809753021225334 +ort-vect-ov,2000,0.00013922089943662285 +ort-ov-eager-gpu,2000,0.0006366402900312096 +ort-ov-gpu,2000,0.00018454457982443274 +ort-vect-ov-gpu,2000,2.582623972557485e-05 +ort-vect-ov-eager-gpu,2000,3.8639859994873404e-05 +numpy,5000,0.0006005673401523381 +ort-eager,5000,0.0008884638792369515 +ort,5000,0.00045122973038814963 +ort-ov-eager,5000,0.00013310309033840896 +ort-ov,5000,0.00014269580016843976 +ort-vect-ov-eager,5000,0.00034576083067804574 +ort-vect-ov,5000,0.0002888415101915598 +ort-ov-eager-gpu,5000,0.0011237199604511262 +ort-ov-gpu,5000,0.00042649045935831963 +ort-vect-ov-gpu,5000,3.0463109724223613e-05 +ort-vect-ov-eager-gpu,5000,4.1913760360330346e-05 +numpy,10000,0.0011754754395224155 +ort-eager,10000,0.0014254870894365012 +ort,10000,0.0007287365710362793 +ort-ov-eager,10000,0.00047812893986701963 +ort-ov,10000,0.0002968422707635909 +ort-vect-ov-eager,10000,0.00027544289943762126 +ort-vect-ov,10000,0.0004464628698769957 +ort-ov-eager-gpu,10000,0.002028988350648433 +ort-ov-gpu,10000,0.0006974474899470806 +ort-vect-ov-gpu,10000,3.840886987745762e-05 +ort-vect-ov-eager-gpu,10000,5.110349971801042e-05 +numpy,20000,0.0025801266194321213 +ort-eager,20000,0.003030839919811115 +ort,20000,0.0016020383301656694 +ort-ov-eager,20000,0.00041621136013418435 +ort-ov,20000,0.0004534762294497341 +ort-vect-ov-eager,20000,0.0005605259700678289 +ort-vect-ov,20000,0.0006344887206796557 +ort-ov-eager-gpu,20000,0.0038356904697138816 +ort-ov-gpu,20000,0.0013019994494970887 +ort-vect-ov-gpu,20000,7.951957988552749e-05 +ort-vect-ov-eager-gpu,20000,0.00010235088993795216 diff --git a/_doc/examples/plot_benchmark_eager_mode.py b/_doc/examples/plot_benchmark_eager_mode.py index b9af949..f167976 100644 --- a/_doc/examples/plot_benchmark_eager_mode.py +++ b/_doc/examples/plot_benchmark_eager_mode.py @@ -216,9 +216,9 @@ def f_ort_vect_ov_eager_gpu(X): vect_in.push_back(X) vect_out = OrtValueVector() temp_vect_out = OrtValueVector() - sess_add._sess.run_with_ortvaluevector( + sess_add_gpu._sess.run_with_ortvaluevector( run_options, ["X"], vect_in, ["Z"], temp_vect_out, devices) - sess_add._sess.run_with_ortvaluevector( + sess_add_gpu._sess.run_with_ortvaluevector( run_options, ["X"], temp_vect_out, ["Z"], vect_out, devices) assert len(vect_out) == 1 return vect_out[0] @@ -229,7 +229,7 @@ def f_ort_vect_ov_gpu(X): vect_in.push_back(X) vect_out = OrtValueVector() # crashes on the next line - sess_add2._sess.run_with_ortvaluevector( + sess_add2_gpu._sess.run_with_ortvaluevector( run_options, ["X"], vect_in, ["Z"], vect_out, devices) assert len(vect_out) == 1 return vect_out[0] @@ -326,7 +326,6 @@ def benchmark(repeat=100): for f in fcts: if f is None: continue - print(N, f) obs = {'name': f.__doc__, "N": N} if "-gpu" in f.__doc__: begin = time.perf_counter() @@ -408,7 +407,15 @@ def subgraph(row, cols, title): # is using the direct python API. This could be improved by using :epkg:`cython`. # Eager mode must use :epkg:`OrtValue`. It is faster and it reduces the differences # between using two additions in a single graph or two graphs of a single addition -# on CPU. On GPU, it is still faster but eager mode is significantly slower. +# on CPU. On GPU, it is still faster but eager mode is slighly slower with +# method `run_with_ortvaluevector`. +# +# However, method `run_with_ort_values` is not recommended +# because the output device cannot be specified. Therefore, +# :epkg:`onnxruntime` requests the output on CPU. On eager mode, +# this output is used again an input for the second call to +# `run_with_ort_values` and the data needs to be copied from CPU +# to GPU. if not has_cuda: print("With GPU") From fa58e4207356e78301c4158e4ee0622c9ba649a6 Mon Sep 17 00:00:00 2001 From: Xavier Dupre Date: Tue, 29 Nov 2022 09:44:01 +0000 Subject: [PATCH 08/10] add bind api --- .../data/plot_benchmark_eager_mode.csv | 364 ++++++++++-------- _doc/examples/plot_benchmark_eager_mode.py | 120 ++++-- 2 files changed, 304 insertions(+), 180 deletions(-) diff --git a/_doc/examples/data/plot_benchmark_eager_mode.csv b/_doc/examples/data/plot_benchmark_eager_mode.csv index 93985cd..53be524 100644 --- a/_doc/examples/data/plot_benchmark_eager_mode.csv +++ b/_doc/examples/data/plot_benchmark_eager_mode.csv @@ -1,155 +1,211 @@ name,N,time -numpy,1,1.5889492351561784e-06 -ort-eager,1,1.8746450077742338e-05 -ort,1,9.613720467314124e-06 -ort-ov-eager,1,1.5422540018334983e-05 -ort-ov,1,8.95573990419507e-06 -ort-vect-ov-eager,1,1.614851993508637e-05 -ort-vect-ov,1,1.1713659623637796e-05 -ort-ov-eager-gpu,1,8.771141991019249e-05 -ort-ov-gpu,1,4.501666990108788e-05 -ort-vect-ov-gpu,1,3.254704992286861e-05 -ort-vect-ov-eager-gpu,1,4.816057975403964e-05 -numpy,2,3.187909023836255e-06 -ort-eager,2,1.9348430214449763e-05 -ort,2,1.0068699484691024e-05 -ort-ov-eager,2,1.546354033052921e-05 -ort-ov,2,9.566720109432936e-06 -ort-vect-ov-eager,2,1.6886509256437422e-05 -ort-vect-ov,2,1.400659093633294e-05 -ort-ov-eager-gpu,2,8.901738096028566e-05 -ort-ov-gpu,2,4.58646600600332e-05 -ort-vect-ov-gpu,2,3.3805009443312884e-05 -ort-vect-ov-eager-gpu,2,4.979153978638351e-05 -numpy,5,3.660890506580472e-06 -ort-eager,5,2.0106409210711717e-05 -ort,5,1.145766000263393e-05 -ort-ov-eager,5,1.6180520178750156e-05 -ort-ov,5,9.363730205222964e-06 -ort-vect-ov-eager,5,1.7814469756558538e-05 -ort-vect-ov,5,1.1610660003498196e-05 -ort-ov-eager-gpu,5,8.837740053422749e-05 -ort-ov-gpu,5,4.6353639336302874e-05 -ort-vect-ov-gpu,5,3.280502976849675e-05 -ort-vect-ov-eager-gpu,5,4.807559074833989e-05 -numpy,10,4.793859552592039e-06 -ort-eager,10,1.981140929274261e-05 -ort,10,1.1489660246297716e-05 -ort-ov-eager,10,1.75704900175333e-05 -ort-ov,10,1.0141700040549039e-05 -ort-vect-ov-eager,10,1.800447003915906e-05 -ort-vect-ov,10,1.2603630311787129e-05 -ort-ov-eager-gpu,10,9.04553395230323e-05 -ort-ov-gpu,10,4.520967020653188e-05 -ort-vect-ov-gpu,10,3.2696030102670194e-05 -ort-vect-ov-eager-gpu,10,6.418311037123203e-05 -numpy,20,5.4798403289169074e-06 -ort-eager,20,2.366730011999607e-05 -ort,20,1.4248579973354935e-05 -ort-ov-eager,20,1.8401460256427526e-05 -ort-ov,20,1.2100640451535583e-05 -ort-vect-ov-eager,20,1.9405430648475886e-05 -ort-vect-ov,20,1.3894590083509683e-05 -ort-ov-eager-gpu,20,9.494220954366028e-05 -ort-ov-gpu,20,4.660362959839404e-05 -ort-vect-ov-gpu,20,3.3001030096784234e-05 -ort-vect-ov-eager-gpu,20,4.779958981089294e-05 -numpy,50,9.599719196558e-06 -ort-eager,50,2.8269169852137566e-05 -ort,50,1.7217489657923578e-05 -ort-ov-eager,50,2.1917350823059676e-05 -ort-ov,50,1.4976559905335307e-05 -ort-vect-ov-eager,50,2.3295320570468902e-05 -ort-vect-ov,50,1.7603479791432618e-05 -ort-ov-eager-gpu,50,0.0001088908000383526 -ort-ov-gpu,50,4.983353079296649e-05 -ort-vect-ov-gpu,50,3.4302989952266217e-05 -ort-vect-ov-eager-gpu,50,4.907056107185781e-05 -numpy,100,1.5986530343070626e-05 -ort-eager,100,3.6064939340576526e-05 -ort,100,2.423427999019623e-05 -ort-ov-eager,100,2.781018032692373e-05 -ort-ov,100,2.0978389075025915e-05 -ort-vect-ov-eager,100,2.973912050947547e-05 -ort-vect-ov,100,2.3074320051819085e-05 -ort-ov-eager-gpu,100,0.0001250813202932477 -ort-ov-gpu,100,5.6498339399695394e-05 -ort-vect-ov-gpu,100,3.4142000367864965e-05 -ort-vect-ov-eager-gpu,100,4.825657932087779e-05 -numpy,200,2.8386160265654327e-05 -ort-eager,200,5.3042439976707104e-05 -ort,200,3.7272900808602575e-05 -ort-ov-eager,200,4.0436809649690984e-05 -ort-ov,200,3.121608984656632e-05 -ort-vect-ov-eager,200,4.1092789033427834e-05 -ort-vect-ov,200,3.298502997495234e-05 -ort-ov-eager-gpu,200,0.0001585603307466954 -ort-ov-gpu,200,6.756402086466551e-05 -ort-vect-ov-gpu,200,3.387400065548718e-05 -ort-vect-ov-eager-gpu,200,4.802458919584751e-05 -numpy,500,6.516507943160832e-05 -ort-eager,500,0.00010077803977765143 -ort,500,7.354783010669053e-05 -ort-ov-eager,500,6.849798955954611e-05 -ort-ov,500,5.758829996921122e-05 -ort-vect-ov-eager,500,6.897097919136285e-05 -ort-vect-ov,500,5.866626976057887e-05 -ort-ov-eager-gpu,500,0.00024823170038871465 -ort-ov-gpu,500,9.84691095072776e-05 -ort-vect-ov-gpu,500,3.38930101133883e-05 -ort-vect-ov-eager-gpu,500,5.276144947856665e-05 -numpy,1000,0.00012337637948803604 -ort-eager,1000,0.00014440876082517207 -ort,1000,9.04593407176435e-05 -ort-ov-eager,1000,7.919566938653588e-05 -ort-ov,1000,0.00010468191932886839 -ort-vect-ov-eager,1000,0.00018712750053964555 -ort-vect-ov,1000,5.026051891036332e-05 -ort-ov-eager-gpu,1000,0.000405196089996025 -ort-ov-gpu,1000,0.00014712668024003506 -ort-vect-ov-gpu,1000,3.8283870089799166e-05 -ort-vect-ov-eager-gpu,1000,5.308343912474811e-05 -numpy,2000,0.0002513446100056171 -ort-eager,2000,0.00024444581009447575 -ort,2000,0.00018896444933488965 -ort-ov-eager,2000,8.169659995473921e-05 -ort-ov,2000,5.474239005707204e-05 -ort-vect-ov-eager,2000,0.00011809753021225334 -ort-vect-ov,2000,0.00013922089943662285 -ort-ov-eager-gpu,2000,0.0006366402900312096 -ort-ov-gpu,2000,0.00018454457982443274 -ort-vect-ov-gpu,2000,2.582623972557485e-05 -ort-vect-ov-eager-gpu,2000,3.8639859994873404e-05 -numpy,5000,0.0006005673401523381 -ort-eager,5000,0.0008884638792369515 -ort,5000,0.00045122973038814963 -ort-ov-eager,5000,0.00013310309033840896 -ort-ov,5000,0.00014269580016843976 -ort-vect-ov-eager,5000,0.00034576083067804574 -ort-vect-ov,5000,0.0002888415101915598 -ort-ov-eager-gpu,5000,0.0011237199604511262 -ort-ov-gpu,5000,0.00042649045935831963 -ort-vect-ov-gpu,5000,3.0463109724223613e-05 -ort-vect-ov-eager-gpu,5000,4.1913760360330346e-05 -numpy,10000,0.0011754754395224155 -ort-eager,10000,0.0014254870894365012 -ort,10000,0.0007287365710362793 -ort-ov-eager,10000,0.00047812893986701963 -ort-ov,10000,0.0002968422707635909 -ort-vect-ov-eager,10000,0.00027544289943762126 -ort-vect-ov,10000,0.0004464628698769957 -ort-ov-eager-gpu,10000,0.002028988350648433 -ort-ov-gpu,10000,0.0006974474899470806 -ort-vect-ov-gpu,10000,3.840886987745762e-05 -ort-vect-ov-eager-gpu,10000,5.110349971801042e-05 -numpy,20000,0.0025801266194321213 -ort-eager,20000,0.003030839919811115 -ort,20000,0.0016020383301656694 -ort-ov-eager,20000,0.00041621136013418435 -ort-ov,20000,0.0004534762294497341 -ort-vect-ov-eager,20000,0.0005605259700678289 -ort-vect-ov,20000,0.0006344887206796557 -ort-ov-eager-gpu,20000,0.0038356904697138816 -ort-ov-gpu,20000,0.0013019994494970887 -ort-vect-ov-gpu,20000,7.951957988552749e-05 -ort-vect-ov-eager-gpu,20000,0.00010235088993795216 +numpy,1,1.2711102034680112e-06 +ort-eager,1,1.13358634176204e-05 +ort,1,6.808883180725622e-06 +ort-ov-eager,1,1.1388225820771748e-05 +ort-ov,1,6.520444517861111e-06 +ort-vect-ov-eager,1,1.2170951596653534e-05 +ort-vect-ov,1,8.38620085684813e-06 +ort-ov-bind-eager,1,1.3180973615906224e-05 +ort-ov-bind,1,8.696321750822233e-06 +ort-ov-eager-gpu,1,7.056212242768735e-05 +ort-ov-gpu,1,3.481692618334281e-05 +ort-vect-ov-eager-gpu,1,3.555464748193922e-05 +ort-vect-ov-gpu,1,2.4811498815044608e-05 +ort-ov-bind-eager-gpu,1,4.045496078652671e-05 +ort-ov-bind-gpu,1,2.7586736226665607e-05 +numpy,2,2.734601275551176e-06 +ort-eager,2,1.2075403671577034e-05 +ort,2,7.642062485847675e-06 +ort-ov-eager,2,1.221340531746166e-05 +ort-ov,2,7.38951311719538e-06 +ort-vect-ov-eager,2,1.2355767047175994e-05 +ort-vect-ov,2,9.376237369151918e-06 +ort-ov-bind-eager,2,1.3248451201744923e-05 +ort-ov-bind,2,9.651492869523173e-06 +ort-ov-eager-gpu,2,7.31769550059523e-05 +ort-ov-gpu,2,3.5939188217001456e-05 +ort-vect-ov-eager-gpu,2,3.82672866985782e-05 +ort-vect-ov-gpu,2,3.82050041403828e-05 +ort-ov-bind-eager-gpu,2,3.984901017352264e-05 +ort-ov-bind-gpu,2,2.7437667967271557e-05 +numpy,5,3.371258473438108e-06 +ort-eager,5,1.315288568274623e-05 +ort,5,7.733609667891108e-06 +ort-ov-eager,5,1.1391451651786244e-05 +ort-ov,5,7.110731346965438e-06 +ort-vect-ov-eager,5,1.2713854214334899e-05 +ort-vect-ov,5,8.936898459814504e-06 +ort-ov-bind-eager,5,1.3004758204068473e-05 +ort-ov-bind,5,9.634384796281332e-06 +ort-ov-eager-gpu,5,7.204147345595518e-05 +ort-ov-gpu,5,3.4798317110824925e-05 +ort-vect-ov-eager-gpu,5,3.641794677098572e-05 +ort-vect-ov-gpu,5,2.5500802780277797e-05 +ort-ov-bind-eager-gpu,5,3.897886624590421e-05 +ort-ov-bind-gpu,5,2.6711501113916796e-05 +numpy,10,3.99789578824459e-06 +ort-eager,10,1.2989837375196468e-05 +ort,10,8.63157538790818e-06 +ort-ov-eager,10,1.188740721009972e-05 +ort-ov,10,7.536177843282319e-06 +ort-vect-ov-eager,10,1.2986518222054369e-05 +ort-vect-ov,10,9.170969528794047e-06 +ort-ov-bind-eager,10,1.3924449184394586e-05 +ort-ov-bind,10,1.0086918655638783e-05 +ort-ov-eager-gpu,10,7.289471841246194e-05 +ort-ov-gpu,10,3.498931489656308e-05 +ort-vect-ov-eager-gpu,10,3.650360053679982e-05 +ort-vect-ov-gpu,10,2.4900201075085014e-05 +ort-ov-bind-eager-gpu,10,3.90811277641589e-05 +ort-ov-bind-gpu,10,2.6648614517192252e-05 +numpy,20,5.098671245506269e-06 +ort-eager,20,1.54399778226383e-05 +ort,20,9.723499106239505e-06 +ort-ov-eager,20,1.3414586822862827e-05 +ort-ov,20,8.937757405819314e-06 +ort-vect-ov-eager,20,1.453850413616068e-05 +ort-vect-ov,20,1.0608864048945493e-05 +ort-ov-bind-eager,20,1.5095967328085041e-05 +ort-ov-bind,20,1.1017649669708521e-05 +ort-ov-eager-gpu,20,7.634075496307745e-05 +ort-ov-gpu,20,3.63207728828633e-05 +ort-vect-ov-eager-gpu,20,3.689129719487773e-05 +ort-vect-ov-gpu,20,2.45928172943865e-05 +ort-ov-bind-eager-gpu,20,3.898421608087886e-05 +ort-ov-bind-gpu,20,2.8260121573261993e-05 +numpy,50,9.188749538155977e-06 +ort-eager,50,1.9810953781562276e-05 +ort,50,1.3596643890503427e-05 +ort-ov-eager,50,1.6675594745339046e-05 +ort-ov,50,1.19556604668197e-05 +ort-vect-ov-eager,50,1.8200560648235596e-05 +ort-vect-ov,50,1.3913243298719186e-05 +ort-ov-bind-eager,50,1.8310926928434794e-05 +ort-ov-bind,50,1.4871118216568608e-05 +ort-ov-eager-gpu,50,8.813127975112625e-05 +ort-ov-gpu,50,3.9916390796898574e-05 +ort-vect-ov-eager-gpu,50,3.693576212247193e-05 +ort-vect-ov-gpu,50,2.4740043233934736e-05 +ort-ov-bind-eager-gpu,50,3.9929252907388496e-05 +ort-ov-bind-gpu,50,2.6972319681159e-05 +numpy,100,1.5112248534907798e-05 +ort-eager,100,2.7329642718366113e-05 +ort,100,1.9632307746985428e-05 +ort-ov-eager,100,2.2391379095478763e-05 +ort-ov,100,1.6873115156259802e-05 +ort-vect-ov-eager,100,2.6304204571192804e-05 +ort-vect-ov,100,1.8689596296323967e-05 +ort-ov-bind-eager,100,2.377623776390361e-05 +ort-ov-bind,100,1.9281802418245318e-05 +ort-ov-eager-gpu,100,0.00010847913871082117 +ort-ov-gpu,100,4.537494692204859e-05 +ort-vect-ov-eager-gpu,100,3.7859098558298876e-05 +ort-vect-ov-gpu,100,2.502607185493015e-05 +ort-ov-bind-eager-gpu,100,4.014694452891318e-05 +ort-ov-bind-gpu,100,2.7043863977353597e-05 +numpy,200,2.917542704614346e-05 +ort-eager,200,4.151081535441569e-05 +ort,200,3.140525549455308e-05 +ort-ov-eager,200,3.316792525208345e-05 +ort-ov,200,2.6052282897360398e-05 +ort-vect-ov-eager,200,3.334714772805632e-05 +ort-vect-ov,200,2.832363693054924e-05 +ort-ov-bind-eager,200,3.367213000414696e-05 +ort-ov-bind,200,2.900986954031129e-05 +ort-ov-eager-gpu,200,0.00014148382873901602 +ort-ov-gpu,200,6.213097129367551e-05 +ort-vect-ov-eager-gpu,200,3.718186268147814e-05 +ort-vect-ov-gpu,200,2.4685012253127547e-05 +ort-ov-bind-eager-gpu,200,3.9644076090321376e-05 +ort-ov-bind-gpu,200,2.7330868070964348e-05 +numpy,500,6.52488797458863e-05 +ort-eager,500,8.870200997398021e-05 +ort,500,6.801239449073611e-05 +ort-ov-eager,500,6.0614623033606835e-05 +ort-ov,500,5.285868684601302e-05 +ort-vect-ov-eager,500,6.496559108347305e-05 +ort-vect-ov,500,5.4099669294624495e-05 +ort-ov-bind-eager,500,6.085498981571537e-05 +ort-ov-bind,500,5.60753888691455e-05 +ort-ov-eager-gpu,500,0.0002297830632507649 +ort-ov-gpu,500,0.00010358515326009876 +ort-vect-ov-eager-gpu,500,3.7701112708672274e-05 +ort-vect-ov-gpu,500,2.5178429152380514e-05 +ort-ov-bind-eager-gpu,500,4.01915926792862e-05 +ort-ov-bind-gpu,500,2.758014015853405e-05 +numpy,1000,0.00012219335462987602 +ort-eager,1000,0.0001355954237847054 +ort,1000,0.00011039443873431181 +ort-ov-eager,1000,8.403399988310412e-05 +ort-ov,1000,6.696581017376497e-05 +ort-vect-ov-eager,1000,6.78829272345523e-05 +ort-vect-ov,1000,6.176580714617249e-05 +ort-ov-bind-eager,1000,7.399409556026926e-05 +ort-ov-bind,1000,8.521743366293715e-05 +ort-ov-eager-gpu,1000,0.00037906445510571405 +ort-ov-gpu,1000,0.00011412395285309153 +ort-vect-ov-eager-gpu,1000,4.747279511276083e-05 +ort-vect-ov-gpu,1000,2.876006557321218e-05 +ort-ov-bind-eager-gpu,1000,4.042265625333283e-05 +ort-ov-bind-gpu,1000,2.8134765074655568e-05 +numpy,2000,0.0002388510924116914 +ort-eager,2000,0.00044227690687553297 +ort,2000,0.00020837497959660012 +ort-ov-eager,2000,0.00013327595543086043 +ort-ov,2000,0.00010776344534693932 +ort-vect-ov-eager,2000,0.0001355220409125456 +ort-vect-ov,2000,7.620716032499271e-05 +ort-ov-bind-eager,2000,0.00017507457412964056 +ort-ov-bind,2000,7.764993359129954e-05 +ort-ov-eager-gpu,2000,0.0006141934830035704 +ort-ov-gpu,2000,0.00018641628445784667 +ort-vect-ov-eager-gpu,2000,3.88771848275694e-05 +ort-vect-ov-gpu,2000,2.6443645550637557e-05 +ort-ov-bind-eager-gpu,2000,4.167438551815132e-05 +ort-ov-bind-gpu,2000,2.8127812820070603e-05 +numpy,5000,0.0006044526570335482 +ort-eager,5000,0.0008715660814956135 +ort,5000,0.00042021945334932547 +ort-ov-eager,5000,0.0001657667077476314 +ort-ov,5000,0.000347322947345674 +ort-vect-ov-eager,5000,0.00019283137195050083 +ort-vect-ov,5000,0.00017095919352986158 +ort-ov-bind-eager,5000,0.00017653256604124022 +ort-ov-bind,5000,0.00037077100113402684 +ort-ov-eager-gpu,5000,0.0010676379005114236 +ort-ov-gpu,5000,0.00037449517998886244 +ort-vect-ov-eager-gpu,5000,4.251485171897168e-05 +ort-vect-ov-gpu,5000,2.990845970852279e-05 +ort-ov-bind-eager-gpu,5000,4.457795402295021e-05 +ort-ov-bind-gpu,5000,3.0521600616767125e-05 +numpy,10000,0.0011958029187683547 +ort-eager,10000,0.003115397562699703 +ort,10000,0.0012505987010143222 +ort-ov-eager,10000,0.0003016990199482635 +ort-ov,10000,0.0002381296440338095 +ort-vect-ov-eager,10000,0.00032288288507102567 +ort-vect-ov,10000,0.0003793603486143226 +ort-ov-bind-eager,10000,0.000622660714379024 +ort-ov-bind,10000,0.00017427370838094048 +ort-ov-eager-gpu,10000,0.0021718234987929464 +ort-ov-gpu,10000,0.0008711783358683953 +ort-vect-ov-eager-gpu,10000,6.672728953785018e-05 +ort-vect-ov-gpu,10000,4.796367579114598e-05 +ort-ov-bind-eager-gpu,10000,9.261158346715901e-05 +ort-ov-bind-gpu,10000,4.8890200975750176e-05 +numpy,20000,0.0027364153356757015 +ort-eager,20000,0.0067275110429719735 +ort,20000,0.003118208863518455 +ort-ov-eager,20000,0.0005569194742877569 +ort-ov,20000,0.001740831701317802 +ort-vect-ov-eager,20000,0.0009374135284145412 +ort-vect-ov,20000,0.0009072552202269435 +ort-ov-bind-eager,20000,0.0008944365921813776 +ort-ov-bind,20000,0.002242563001345843 +ort-ov-eager-gpu,20000,0.00445372259709984 +ort-ov-gpu,20000,0.001619209784881345 +ort-vect-ov-eager-gpu,20000,9.884369165564959e-05 +ort-vect-ov-gpu,20000,8.044799324125051e-05 +ort-ov-bind-eager-gpu,20000,0.00011459709441458637 +ort-ov-bind-gpu,20000,7.989798905327916e-05 diff --git a/_doc/examples/plot_benchmark_eager_mode.py b/_doc/examples/plot_benchmark_eager_mode.py index f167976..74ea2bd 100644 --- a/_doc/examples/plot_benchmark_eager_mode.py +++ b/_doc/examples/plot_benchmark_eager_mode.py @@ -51,7 +51,8 @@ RunOptions) from onnxruntime.capi._pybind_state import ( # pylint: disable=E0611 OrtDevice as C_OrtDevice, - OrtMemType, OrtValue as C_OrtValue) + OrtMemType, OrtValue as C_OrtValue, + SessionIOBinding as C_SessionIOBinding) try: from onnxruntime.capi._pybind_state import OrtValueVector except ImportError: @@ -150,6 +151,30 @@ def f_ort_ov(X): Z = sess_add2._sess.run_with_ort_values({'X': X}, ['Z'], None)[0] return Z + +cpu_device = C_OrtDevice(C_OrtDevice.cpu(), OrtMemType.DEFAULT, 0) + + +def f_ort_ov_bind_eager(X): + "ort-ov-bind-eager" + bind = C_SessionIOBinding(sess_add._sess) + bind.bind_ortvalue_input("X", X) + bind.bind_output("Z", cpu_device) + sess_add._sess.run_with_iobinding(bind, None) + T = bind.get_outputs()[0] + bind.bind_ortvalue_input("X", T) + sess_add._sess.run_with_iobinding(bind, None) + return bind.get_outputs()[0] + + +def f_ort_ov_bind(X): + "ort-ov-bind" + bind = C_SessionIOBinding(sess_add2._sess) + bind.bind_ortvalue_input("X", X) + bind.bind_output("Z", cpu_device) + sess_add2._sess.run_with_iobinding(bind, None) + return bind.get_outputs()[0] + ####################################### # onnxruntime >= 1.14 introduces a vector of OrtValues # to bypass the building of a dictionary. @@ -205,6 +230,27 @@ def f_ort_ov_gpu(X): Z = sess_add2_gpu._sess.run_with_ort_values({'X': X}, ['Z'], None)[0] return Z + gpu_device = C_OrtDevice(C_OrtDevice.cuda(), OrtMemType.DEFAULT, 0) + + def f_ort_ov_bind_eager_gpu(X): + "ort-ov-bind-eager-gpu" + bind = C_SessionIOBinding(sess_add_gpu._sess) + bind.bind_ortvalue_input("X", X) + bind.bind_output("Z", gpu_device) + sess_add_gpu._sess.run_with_iobinding(bind, None) + T = bind.get_outputs()[0] + bind.bind_ortvalue_input("X", T) + sess_add_gpu._sess.run_with_iobinding(bind, None) + return bind.get_outputs()[0] + + def f_ort_ov_bind_gpu(X): + "ort-ov-bind-gpu" + bind = C_SessionIOBinding(sess_add2_gpu._sess) + bind.bind_ortvalue_input("X", X) + bind.bind_output("Z", gpu_device) + sess_add2_gpu._sess.run_with_iobinding(bind, None) + return bind.get_outputs()[0] + if OrtValueVector is not None: run_options = RunOptions() @@ -259,6 +305,8 @@ def f_ort_vect_ov_gpu(X): (f_ort, X), (f_ort_ov_eager, Xov), (f_ort_ov, Xov), + (f_ort_ov_bind_eager, Xov), + (f_ort_ov_bind, Xov), ] if OrtValueVector is not None: @@ -274,6 +322,8 @@ def f_ort_vect_ov_gpu(X): Ys.extend([ (f_ort_ov_eager_gpu, Xov_gpu), (f_ort_ov_gpu, Xov_gpu), + (f_ort_ov_bind_eager_gpu, Xov_gpu), + (f_ort_ov_bind_gpu, Xov_gpu), ]) if OrtValueVector is not None: Ys.extend([ @@ -308,11 +358,15 @@ def f_ort_vect_ov_gpu(X): # +++++++++++++++++++++++ -def benchmark(repeat=100): - fcts = [f_numpy, f_ort_eager, f_ort, f_ort_ov_eager, f_ort_ov, - f_ort_vect_ov_eager, f_ort_vect_ov, - f_ort_ov_eager_gpu, f_ort_ov_gpu, - f_ort_vect_ov_gpu, f_ort_vect_ov_eager_gpu] +def benchmark(repeat=500000): + fcts = [ + f_numpy, f_ort_eager, f_ort, f_ort_ov_eager, f_ort_ov, + f_ort_vect_ov_eager, f_ort_vect_ov, + f_ort_ov_bind_eager, f_ort_ov_bind, + f_ort_ov_eager_gpu, f_ort_ov_gpu, + f_ort_vect_ov_eager_gpu, f_ort_vect_ov_gpu, + f_ort_ov_bind_eager_gpu, f_ort_ov_bind_gpu, + ] data = [] for N in tqdm([1, 2, 5, 10, 20, 50, 100, 200, 500, 1000, 2000, 5000, 10000, 20000]): @@ -323,26 +377,27 @@ def benchmark(repeat=100): device_gpu = C_OrtDevice(C_OrtDevice.cuda(), OrtMemType.DEFAULT, 0) Xov_gpu = C_OrtValue.ortvalue_from_numpy(X, device_gpu) + r = min(500, int(repeat / N)) for f in fcts: if f is None: continue obs = {'name': f.__doc__, "N": N} if "-gpu" in f.__doc__: begin = time.perf_counter() - for r in range(repeat): + for r in range(r): _ = f(Xov_gpu) end = time.perf_counter() - begin elif "-ov" in f.__doc__: begin = time.perf_counter() - for r in range(repeat): + for r in range(r): _ = f(Xov) end = time.perf_counter() - begin else: begin = time.perf_counter() - for r in range(repeat): + for r in range(r): _ = f(X) end = time.perf_counter() - begin - obs['time'] = end / repeat + obs['time'] = end / r data.append(obs) return pandas.DataFrame(data) @@ -359,38 +414,50 @@ def benchmark(repeat=100): def make_graph(df): - def subgraph(row, cols, title): + def subgraph(row, cols): if "numpy" not in cols: cols.append("numpy") piv = piv_all[cols].copy() - piv.plot(ax=ax[row, 0], title=title, logy=True, logx=True) + piv.plot(ax=ax[row, 0], + title="Time execution(s)" if row == 0 else "", + logy=True, logx=True) piv2 = piv / piv.index.values.reshape((-1, 1)) - piv2.plot(ax=ax[row, 1], title=f"Time(s) per execution / N", logx=True) + piv2.plot(ax=ax[row, 1], + title=f"Time(s) per execution / N" if row == 0 else "", + logx=True) piv3 = piv / piv["numpy"].values.reshape((-1, 1)) - piv3.plot(ax=ax[row, 2], title="Ratio against numpy", + piv3.plot(ax=ax[row, 2], + title="Ratio against numpy" if row == 0 else "", logy=True, logx=True) for j in range(0, 3): ax[row, j].legend(fontsize="x-small") - fig, ax = plt.subplots(3, 3, figsize=(12, 8)) + fig, ax = plt.subplots(5, 3, figsize=(15, 9)) fig.suptitle("Time execution Eager Add + Add - lower is better") piv_all = df.pivot(index="N", columns="name", values="time") - # no gpu, no vect + # no gpu, no vect, no bind subgraph(0, [c for c in piv_all.columns - if "-gpu" not in c and "-vect" not in c], - title="CPU") + if "-gpu" not in c and "-vect" not in c and "-bind" not in c]) - # no gpu + # no gpu, ov, no bind subgraph(1, [c for c in piv_all.columns - if "-gpu" not in c and "-ov" in c], - title="CPU, OrtValue and OrtValueVector") + if "-gpu" not in c and "-ov" in c and "-bind" not in c]) + + # no gpu, vect or bind + subgraph(2, [c for c in piv_all.columns + if "-gpu" not in c and ("-bind" in c or '-vect' in c)]) + + # gpu, no bind + cols = [c for c in piv_all.columns + if "-gpu" in c and "-ov" in c and "-bind" not in c] + subgraph(3, cols) - # gpu - cols = [c for c in piv_all.columns if "-gpu" in c and "-ov" in c] - subgraph(2, cols, - title="GPU, OrtValue and OrtValueVector") + # gpu, vect or bind + cols = [c for c in piv_all.columns + if "-gpu" in c and ("-bind" in c or '-vect' in c)] + subgraph(4, cols) fig.savefig("eager_mode_cpu.png" if len(cols) == 0 else "eager_mode_gpu.png", dpi=250) return fig, ax @@ -408,7 +475,8 @@ def subgraph(row, cols, title): # Eager mode must use :epkg:`OrtValue`. It is faster and it reduces the differences # between using two additions in a single graph or two graphs of a single addition # on CPU. On GPU, it is still faster but eager mode is slighly slower with -# method `run_with_ortvaluevector`. +# method `run_with_ortvaluevector` or `run_with_iobinding`. Both +# methods show similar performances. # # However, method `run_with_ort_values` is not recommended # because the output device cannot be specified. Therefore, From 5e7e3c52b7d7bbc24db08fe9c5776d47b390669e Mon Sep 17 00:00:00 2001 From: xadupre Date: Tue, 29 Nov 2022 10:52:52 +0100 Subject: [PATCH 09/10] update --- .gitignore | 1 + _doc/examples/plot_benchmark_eager_mode.py | 9 +++++++-- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/.gitignore b/.gitignore index 2441bb9..3efae15 100644 --- a/.gitignore +++ b/.gitignore @@ -70,3 +70,4 @@ _doc/examples/ort_cpu_ortvalue.csv _unittests/ut_documentation/data _unittests/ut_documentation/ort_*.csv _doc/examples/*splits*.png +_doc/examples/eager*.png diff --git a/_doc/examples/plot_benchmark_eager_mode.py b/_doc/examples/plot_benchmark_eager_mode.py index 74ea2bd..b0e246c 100644 --- a/_doc/examples/plot_benchmark_eager_mode.py +++ b/_doc/examples/plot_benchmark_eager_mode.py @@ -217,7 +217,6 @@ def f_ort_vect_ov(X): # If GPU is available. if sess_add_gpu is not None: - # def f_ort_ov_eager_gpu(X): "ort-ov-eager-gpu" @@ -289,6 +288,8 @@ def f_ort_vect_ov_gpu(X): f_ort_ov_gpu = None f_ort_vect_ov_eager_gpu = None f_ort_vect_ov_gpu = None + f_ort_ov_bind_eager_gpu = None + f_ort_ov_bind_gpu = None ####################################### @@ -333,9 +334,13 @@ def f_ort_vect_ov_gpu(X): except RuntimeError: # cuda is not available sess_add_gpu = None - sess_add2_gpu + sess_add2_gpu = None f_ort_ov_eager_gpu = None f_ort_ov_gpu = None + f_ort_ov_bind_eager_gpu = None + f_ort_ov_bind_gpu = None + f_ort_vect_ov_eager_gpu = None + f_ort_vect_ov_gpu = None results = [] for fct, x in Ys: From ad2be1106a689781fc826cc0d541c3815504c79b Mon Sep 17 00:00:00 2001 From: xadupre Date: Tue, 29 Nov 2022 11:21:25 +0100 Subject: [PATCH 10/10] Update plot_benchmark_eager_mode.py --- _doc/examples/plot_benchmark_eager_mode.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/_doc/examples/plot_benchmark_eager_mode.py b/_doc/examples/plot_benchmark_eager_mode.py index b0e246c..d82fd1b 100644 --- a/_doc/examples/plot_benchmark_eager_mode.py +++ b/_doc/examples/plot_benchmark_eager_mode.py @@ -428,7 +428,7 @@ def subgraph(row, cols): logy=True, logx=True) piv2 = piv / piv.index.values.reshape((-1, 1)) piv2.plot(ax=ax[row, 1], - title=f"Time(s) per execution / N" if row == 0 else "", + title="Time(s) per execution / N" if row == 0 else "", logx=True) piv3 = piv / piv["numpy"].values.reshape((-1, 1)) piv3.plot(ax=ax[row, 2],