From 42f48e90ef22f2ee4ced6bc981f38f88600201d8 Mon Sep 17 00:00:00 2001
From: xadupre
Date: Fri, 25 Nov 2022 17:03:37 +0100
Subject: [PATCH 1/4] Adds an example to benchmark a kind of eager mode

---
 _doc/examples/plot_benchmark_eager_mode.py    | 220 ++++++++++++++++++
 _doc/examples/plot_benchmark_onnx_function.py |   4 +-
 _doc/examples/plot_benchmark_ort_api.py       |  15 +-
 _doc/sphinxdoc/source/conf.py                 |   6 +-
 .../tutorial_bench/tutorial_benchmark.rst     |   1 +
 5 files changed, 231 insertions(+), 15 deletions(-)
 create mode 100644 _doc/examples/plot_benchmark_eager_mode.py

diff --git a/_doc/examples/plot_benchmark_eager_mode.py b/_doc/examples/plot_benchmark_eager_mode.py
new file mode 100644
index 00000000..94811aa5
--- /dev/null
+++ b/_doc/examples/plot_benchmark_eager_mode.py
@@ -0,0 +1,220 @@
"""
.. _benchmark-ort-eager-mode:

Benchmark onnxruntime API: eager mode
=====================================

:epkg:`pytorch` or :epkg:`tensorflow` usually run faster when the
deep learning model is executed entirely outside python. The python code
only builds the model and then triggers the execution of the whole
graph. In that configuration, there is no way to look at
intermediate results.

That does not make it easy to debug or to investigate what is going on:
what the user writes is not what is executed.
Eager mode refers to the opposite situation: the code which defines
the model is the same code which executes it. Everything happens
in python. It is slower, but the gap is small if the model
manipulates big matrices.

It is possible to do the same with :epkg:`onnxruntime`.
This example compares the performance of a couple of
scenarios. This work is close to what is done in example
:ref:`benchmark-ort-api`.

.. contents::
    :local:

The scenario
++++++++++++

We would like to compare two codes. The first one
executes two additions in a single onnx graph. The second
one executes the same two additions, each of them calling
:epkg:`onnxruntime` for a single addition.

"""
import time
import numpy
from numpy.testing import assert_allclose
import pandas
import matplotlib.pyplot as plt
from tqdm import tqdm
from onnx import TensorProto
from onnx.numpy_helper import from_array
from onnx.helper import (
    make_model, make_node, set_model_props, make_tensor,
    make_graph, make_tensor_value_info)
from onnxruntime import InferenceSession, __version__ as ort_version
from onnxruntime.capi._pybind_state import (  # pylint: disable=E0611
    SessionIOBinding, OrtDevice as C_OrtDevice,
    OrtMemType, OrtValue as C_OrtValue, RunOptions)
from cpyquickhelper.numbers.speed_measure import measure_time
from mlprodict.testing.experimental_c_impl.experimental_c import code_optimisation

############################################
# Available optimisation on this machine.

print(code_optimisation())
repeat = 250
number = 250

############################################
# A single addition of a two-dimensional matrix.

CST = numpy.array(list(range(100))).reshape(1, -1).astype(numpy.float32)
X = make_tensor_value_info('X', TensorProto.FLOAT, [None, CST.shape[1]])
Z = make_tensor_value_info('Z', TensorProto.FLOAT, [None, CST.shape[1]])

graph = make_graph([
    make_node("Add", ['X', 'Y'], ['Z']),
], '', [X], [Z], [
    from_array(CST, name='Y'),
])
onnx_add = make_model(graph)
sess_add = InferenceSession(onnx_add.SerializeToString(),
                            providers=["CPUExecutionProvider"])
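#############################################
# A quick sanity check, a small sketch which is not part of the measured
# scenarios: the single addition graph must match the numpy computation.

x_check = numpy.ones((2, CST.shape[1]), dtype=numpy.float32)
assert_allclose(x_check + CST, sess_add.run(['Z'], {'X': x_check})[0])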
#############################################
# Two additions of the same matrix.

graph = make_graph([
    make_node("Add", ['X', 'Y'], ['T']),
    make_node("Add", ['T', 'Y'], ['Z']),
], '', [X], [Z], [
    from_array(CST, 'Y'),
])
onnx_add2 = make_model(graph)
sess_add2 = InferenceSession(onnx_add2.SerializeToString(),
                             providers=["CPUExecutionProvider"])

############################################
# The functions to test
# +++++++++++++++++++++
#
# * `numpy`: :epkg:`numpy`
# * `ort`: :epkg:`onnxruntime` + numpy array as input, both additions
#   in a single graph
# * `ort-eager`: :epkg:`onnxruntime` + numpy array as input, one call
#   per addition
# * `ort-ov`: :epkg:`onnxruntime` + :epkg:`C_OrtValue` as input, both
#   additions in a single graph
# * `ort-ov-eager`: :epkg:`onnxruntime` + :epkg:`C_OrtValue` as input,
#   one call per addition


def f_numpy(X):
    "numpy"
    T = X + CST
    Z = T + CST
    return Z


def f_ort_eager(X):
    "ort-eager"
    T = sess_add._sess.run(['Z'], {'X': X}, None)[0]
    Z = sess_add._sess.run(['Z'], {'X': T}, None)[0]
    return Z


def f_ort(X):
    "ort"
    Z = sess_add2._sess.run(['Z'], {'X': X}, None)[0]
    return Z


def f_ort_ov_eager(X):
    "ort-ov-eager"
    T = sess_add._sess.run_with_ort_values({'X': X}, ['Z'], None)[0]
    Z = sess_add._sess.run_with_ort_values({'X': T}, ['Z'], None)[0]
    return Z


def f_ort_ov(X):
    "ort-ov"
    Z = sess_add2._sess.run_with_ort_values({'X': X}, ['Z'], None)[0]
    return Z


X = numpy.random.rand(10, CST.shape[1]).astype(CST.dtype)

device = C_OrtDevice(C_OrtDevice.cpu(), OrtMemType.DEFAULT, 0)
Xov = C_OrtValue.ortvalue_from_numpy(X, device)

Ys = [
    f_numpy(X),
    f_ort_eager(X),
    f_ort(X),
    f_ort_ov_eager(Xov),
    f_ort_ov(Xov),
]

for i in range(1, len(Ys)):
    try:
        assert_allclose(Ys[0], Ys[i])
    except TypeError:
        # OrtValue
        assert_allclose(Ys[0], Ys[i].numpy())

##########################################
# All outputs are the same.

##############################
# Benchmark the functions
# +++++++++++++++++++++++


def benchmark(repeat=100):
    fcts = [f_numpy, f_ort_eager, f_ort, f_ort_ov_eager, f_ort_ov]
    data = []
    for N in tqdm([1, 2, 5, 10, 20, 50, 100, 200, 500,
                   1000, 2000, 5000, 10000, 20000]):
        X = numpy.random.rand(N, CST.shape[1]).astype(CST.dtype)
        device = C_OrtDevice(C_OrtDevice.cpu(), OrtMemType.DEFAULT, 0)
        Xov = C_OrtValue.ortvalue_from_numpy(X, device)

        for f in fcts:
            obs = {'name': f.__doc__, "N": N}
            if "-ov" in f.__doc__:
                begin = time.perf_counter()
                for r in range(repeat):
                    _ = f(Xov)
                end = time.perf_counter() - begin
            else:
                begin = time.perf_counter()
                for r in range(repeat):
                    _ = f(X)
                end = time.perf_counter() - begin
            obs['time'] = end / repeat
            data.append(obs)

    return pandas.DataFrame(data)


df = benchmark()
df


########################################
# Graphs
# ++++++

fig, ax = plt.subplots(1, 3, figsize=(12, 4))

piv = df.pivot(index="N", columns="name", values="time")
piv.plot(ax=ax[0], title="Time(s) per execution", logy=True, logx=True)
piv2 = piv / piv.index.values.reshape((-1, 1))
piv2.plot(ax=ax[1], title="Time(s) per execution / N", logx=True)
piv3 = piv / piv["numpy"].values.reshape((-1, 1))
piv3.plot(ax=ax[2], title="Ratio against numpy (lower is better)",
          logy=True, logx=True)


###################################
# Conclusion
# ++++++++++
#
# Eager mode is slower than numpy for small arrays and faster for big ones.
# This is probably due to the cost of the :epkg:`pybind11` bindings while
# numpy relies on the direct python C API. This could be improved with
# :epkg:`cython`. Eager mode must use :epkg:`OrtValue`. It is faster and it
# reduces the difference between running two additions in a single graph
# and two graphs of a single addition.
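###################################
# A small illustration, a sketch which is not one of the measured
# scenarios: a :epkg:`C_OrtValue` wraps a tensor placed on a given
# device and converts back to numpy whenever an intermediate result
# must be inspected, which is what eager mode is about.

ov = C_OrtValue.ortvalue_from_numpy(X, device)
print(ov.shape(), ov.numpy()[0, :5])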
+ +print(f"onnxruntime.__version__ = {ort_version!r}") + + +plt.show() diff --git a/_doc/examples/plot_benchmark_onnx_function.py b/_doc/examples/plot_benchmark_onnx_function.py index 64912233..a38c8096 100644 --- a/_doc/examples/plot_benchmark_onnx_function.py +++ b/_doc/examples/plot_benchmark_onnx_function.py @@ -19,7 +19,7 @@ method `onnxruntime.InferenceSession.run` * `bind`: inference through an ONNX graph executed with method `onnxruntime.InferenceSession.run_with_iobinding` -* `run`: inference through an ONNX graph executed with +* `inplace`: inference through an ONNX graph executed with method `onnxruntime.InferenceSession.run_with_iobinding` but without counting the binding assuming input buffers are reused and do not need binding again @@ -114,7 +114,7 @@ def benchmark(name, onx, fct_numpy, *args, ms = measure_time( lambda: nobind_just_run(sess._sess, bind)) - ms.update(dict(name=name, impl='run', dim=dim)) + ms.update(dict(name=name, impl='inplace', dim=dim)) rows.append(ms) return rows diff --git a/_doc/examples/plot_benchmark_ort_api.py b/_doc/examples/plot_benchmark_ort_api.py index 3e35d8e8..3bc234d0 100644 --- a/_doc/examples/plot_benchmark_ort_api.py +++ b/_doc/examples/plot_benchmark_ort_api.py @@ -1,8 +1,8 @@ """ .. _benchmark-ort-api: -Benchmark onnxruntime API: run or ... -===================================== +Benchmark onnxruntime API: run or run_with_ort_values +===================================================== This short code compares different methods to call onnxruntime API. @@ -17,7 +17,6 @@ py-spy record -o plot_benchmark_ort_api.svg -r 10 --native -- python plot_benchmark_ort_api.py - .. contents:: :local: @@ -101,12 +100,12 @@ obs = measure_time(lambda: sess.run(None, {'X': X}), context=dict(sess=sess, X=X), repeat=repeat, number=number) -obs['name'] = 'ort-run' +obs['name'] = 'ort' data.append(obs) ################################### -# onnxruntime: run +# onnxruntime: run from C API print('ort-c') sess = InferenceSession(onx.SerializeToString(), providers=['CPUExecutionProvider']) @@ -121,7 +120,7 @@ ################################### -# onnxruntime: run_with_ort_values +# onnxruntime: run_with_ort_values from C API print('ort-ov-c') device = C_OrtDevice(C_OrtDevice.cpu(), OrtMemType.DEFAULT, 0) @@ -136,12 +135,12 @@ {'X': Xov}, output_names, ro), context=dict(sess=sess), repeat=repeat, number=number) -obs['name'] = 'ort-ov' +obs['name'] = 'ort-ov-c' data.append(obs) ################################### -# onnxruntime: run_with_iobinding +# onnxruntime: run_with_iobinding from C API print('ort-bind') sess = InferenceSession(onx.SerializeToString(), providers=['CPUExecutionProvider']) diff --git a/_doc/sphinxdoc/source/conf.py b/_doc/sphinxdoc/source/conf.py index 1f1e7268..76043281 100644 --- a/_doc/sphinxdoc/source/conf.py +++ b/_doc/sphinxdoc/source/conf.py @@ -26,7 +26,7 @@ def callback_begin(): os.makedirs(dest) for img in os.listdir(source): ext = os.path.splitext(img)[-1] - if ext not in {'.png', '.jpg'}: + if ext not in {'.png', '.jpg', '.svg'}: continue shutil.copy(os.path.join(source, img), dest) @@ -54,11 +54,7 @@ def callback_begin(): } blog_root = "http://www.xavierdupre.fr/app/onnxcustom/helpsphinx/" - -html_css_files = ['my-styles.css'] - html_logo = "phdoc_static/project_ico.png" -html_sidebars = {} language = "en" onnx_doc_folder = os.path.join(os.path.dirname(__file__), 'api', 'onnxops') mathdef_link_only = True diff --git a/_doc/sphinxdoc/source/tutorials/tutorial_bench/tutorial_benchmark.rst 
b/_doc/sphinxdoc/source/tutorials/tutorial_bench/tutorial_benchmark.rst index 196442e8..cf7976fa 100644 --- a/_doc/sphinxdoc/source/tutorials/tutorial_bench/tutorial_benchmark.rst +++ b/_doc/sphinxdoc/source/tutorials/tutorial_bench/tutorial_benchmark.rst @@ -8,4 +8,5 @@ Inference ../../gyexamples/plot_benchmark_ort_api ../../gyexamples/plot_benchmark_inference_standard ../../gyexamples/plot_benchmark_inference + ../../gyexamples/plot_benchmark_eager_mode ../../gyexamples/plot_benchmark_graph_opt From 6b87d82190b14a737b36f388d07529e92d751284 Mon Sep 17 00:00:00 2001 From: xadupre Date: Sun, 27 Nov 2022 22:34:06 +0100 Subject: [PATCH 2/4] lint --- _doc/examples/plot_benchmark_eager_mode.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/_doc/examples/plot_benchmark_eager_mode.py b/_doc/examples/plot_benchmark_eager_mode.py index 94811aa5..aecb1e2c 100644 --- a/_doc/examples/plot_benchmark_eager_mode.py +++ b/_doc/examples/plot_benchmark_eager_mode.py @@ -43,13 +43,12 @@ from onnx import TensorProto from onnx.numpy_helper import from_array from onnx.helper import ( - make_model, make_node, set_model_props, make_tensor, + make_model, make_node, make_graph, make_tensor_value_info) from onnxruntime import InferenceSession, __version__ as ort_version from onnxruntime.capi._pybind_state import ( # pylint: disable=E0611 - SessionIOBinding, OrtDevice as C_OrtDevice, - OrtMemType, OrtValue as C_OrtValue, RunOptions) -from cpyquickhelper.numbers.speed_measure import measure_time + OrtDevice as C_OrtDevice, + OrtMemType, OrtValue as C_OrtValue) from mlprodict.testing.experimental_c_impl.experimental_c import code_optimisation ############################################ From 693c598489969608a053b8a0d4b860f5dd3e05cd Mon Sep 17 00:00:00 2001 From: Xavier Dupre Date: Mon, 28 Nov 2022 13:25:32 +0000 Subject: [PATCH 3/4] add gpu --- _doc/examples/data/eager_mode.csv | 99 +++++++++++++++++++ _doc/examples/plot_benchmark_eager_mode.py | 107 ++++++++++++++++++--- 2 files changed, 191 insertions(+), 15 deletions(-) create mode 100644 _doc/examples/data/eager_mode.csv diff --git a/_doc/examples/data/eager_mode.csv b/_doc/examples/data/eager_mode.csv new file mode 100644 index 00000000..e953883c --- /dev/null +++ b/_doc/examples/data/eager_mode.csv @@ -0,0 +1,99 @@ +name,N,time +numpy,1,1.6239401884377004e-06 +ort-eager,1,1.2923539616167545e-05 +ort,1,7.524730172008276e-06 +ort-ov-eager,1,1.0392629774287342e-05 +ort-ov,1,6.243779789656401e-06 +ort-ov-eager-gpu,1,6.998750963248313e-05 +ort-ov-gpu,1,3.477875958196819e-05 +numpy,2,3.044890472665429e-06 +ort-eager,2,1.3051539426669478e-05 +ort,2,8.713690331205726e-06 +ort-ov-eager,2,1.1555589735507964e-05 +ort-ov,2,7.138750515878201e-06 +ort-ov-eager-gpu,2,7.208543014712632e-05 +ort-ov-gpu,2,3.6394710186868905e-05 +numpy,5,3.831860376521945e-06 +ort-eager,5,1.4171500224620103e-05 +ort,5,8.038709638640285e-06 +ort-ov-eager,5,1.1525589507073164e-05 +ort-ov,5,7.319740252569317e-06 +ort-ov-eager-gpu,5,7.189344032667577e-05 +ort-ov-gpu,5,3.57317307498306e-05 +numpy,10,4.102849634364247e-06 +ort-eager,10,1.4772480353713036e-05 +ort,10,8.412699680775403e-06 +ort-ov-eager,10,1.1969569604843856e-05 +ort-ov,10,7.66773009672761e-06 +ort-ov-eager-gpu,10,7.502933032810688e-05 +ort-ov-gpu,10,3.6292700096964834e-05 +numpy,20,5.108820041641593e-06 +ort-eager,20,1.6318419948220253e-05 +ort,20,9.663649834692478e-06 +ort-ov-eager,20,1.3151530874893069e-05 +ort-ov,20,8.747689425945282e-06 +ort-ov-eager-gpu,20,7.586929947137832e-05 
+ort-ov-gpu,20,3.6894690711051223e-05 +numpy,50,8.849690202623605e-06 +ort-eager,50,2.1720220101997255e-05 +ort,50,1.3897509779781103e-05 +ort-ov-eager,50,1.6557410126551987e-05 +ort-ov,50,1.1749580735340715e-05 +ort-ov-eager-gpu,50,8.735888986848295e-05 +ort-ov-gpu,50,3.979059052653611e-05 +numpy,100,1.4907469740137458e-05 +ort-eager,100,2.8627979336306453e-05 +ort,100,2.0100290421396493e-05 +ort-ov-eager,100,2.2381199523806573e-05 +ort-ov,100,1.7124389996752145e-05 +ort-ov-eager-gpu,100,9.972644969820977e-05 +ort-ov-gpu,100,4.341945983469486e-05 +numpy,200,2.7075030375272035e-05 +ort-eager,200,4.3370459461584684e-05 +ort,200,3.119189059361815e-05 +ort-ov-eager,200,3.3285809913650154e-05 +ort-ov,200,2.6379060000181197e-05 +ort-ov-eager-gpu,200,0.0001272184809204191 +ort-ov-gpu,200,5.1265170332044365e-05 +numpy,500,6.265176925808191e-05 +ort-eager,500,8.758387994021178e-05 +ort,500,6.585365976206958e-05 +ort-ov-eager,500,5.8810909977182745e-05 +ort-ov,500,5.206315079703927e-05 +ort-ov-eager-gpu,500,0.0001980439608450979 +ort-ov-gpu,500,7.229942944832146e-05 +numpy,1000,0.00012052271980792284 +ort-eager,1000,0.00020487471017986537 +ort,1000,0.00010618122993037104 +ort-ov-eager,1000,7.917118025943637e-05 +ort-ov,1000,9.659656090661884e-05 +ort-ov-eager-gpu,1000,0.0003143318207003176 +ort-ov-gpu,1000,0.00010537925059907138 +numpy,2000,0.0002533179905731231 +ort-eager,2000,0.00042499787989072504 +ort,2000,0.0003522354701999575 +ort-ov-eager,2000,0.0001527195703238249 +ort-ov,2000,0.0001286304194945842 +ort-ov-eager-gpu,2000,0.0005476115201599896 +ort-ov-gpu,2000,0.00017218387103639542 +numpy,5000,0.0006107362709008157 +ort-eager,5000,0.0007773243507836014 +ort,5000,0.0005611650308128447 +ort-ov-eager,5000,9.656556067056954e-05 +ort-ov,5000,0.000121245690388605 +ort-ov-eager-gpu,5000,0.0012037401704583317 +ort-ov-gpu,5000,0.00034049289068207146 +numpy,10000,0.0011586127802729607 +ort-eager,10000,0.0016428445605561138 +ort,10000,0.0008303114597219974 +ort-ov-eager,10000,0.0003013462794478983 +ort-ov,10000,0.0004336395696736872 +ort-ov-eager-gpu,10000,0.0021700217993929983 +ort-ov-gpu,10000,0.0007736664800904691 +numpy,20000,0.0026089051889721304 +ort-eager,20000,0.003720655629877001 +ort,20000,0.002414294109912589 +ort-ov-eager,20000,0.0006073223997373134 +ort-ov,20000,0.0005382718495093287 +ort-ov-eager-gpu,20000,0.004377091269707307 +ort-ov-gpu,20000,0.0014389527996536344 diff --git a/_doc/examples/plot_benchmark_eager_mode.py b/_doc/examples/plot_benchmark_eager_mode.py index aecb1e2c..92196e76 100644 --- a/_doc/examples/plot_benchmark_eager_mode.py +++ b/_doc/examples/plot_benchmark_eager_mode.py @@ -45,7 +45,8 @@ from onnx.helper import ( make_model, make_node, make_graph, make_tensor_value_info) -from onnxruntime import InferenceSession, __version__ as ort_version +from onnxruntime import ( + get_all_providers, InferenceSession, __version__ as ort_version) from onnxruntime.capi._pybind_state import ( # pylint: disable=E0611 OrtDevice as C_OrtDevice, OrtMemType, OrtValue as C_OrtValue) @@ -87,6 +88,20 @@ sess_add2 = InferenceSession(onnx_add2.SerializeToString(), providers=["CPUExecutionProvider"]) +############################################ +# Let's consider GPU as well. 
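+# A quick look at the available providers clarifies the check below:
+# ``get_all_providers`` lists the providers compiled into this build of
+# :epkg:`onnxruntime`, not necessarily the ones usable on this machine.
+print(get_all_providers())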
+
+has_cuda = "CUDAExecutionProvider" in get_all_providers()
+if has_cuda:
+    sess_add_gpu = InferenceSession(onnx_add.SerializeToString(),
+                                    providers=["CUDAExecutionProvider"])
+    sess_add2_gpu = InferenceSession(onnx_add2.SerializeToString(),
+                                     providers=["CUDAExecutionProvider"])
+else:
+    print("CUDAExecutionProvider is not available, skipping GPU scenarios.")
+    sess_add_gpu = None
+    sess_add2_gpu = None
+
 ############################################
 # The functions to test
 # +++++++++++++++++++++
 #
@@ -129,6 +144,24 @@ def f_ort_ov(X):
     return Z
 
 
+if sess_add_gpu is not None:
+
+    def f_ort_ov_eager_gpu(X):
+        "ort-ov-eager-gpu"
+        T = sess_add_gpu._sess.run_with_ort_values({'X': X}, ['Z'], None)[0]
+        Z = sess_add_gpu._sess.run_with_ort_values({'X': T}, ['Z'], None)[0]
+        return Z
+
+
+    def f_ort_ov_gpu(X):
+        "ort-ov-gpu"
+        Z = sess_add2_gpu._sess.run_with_ort_values({'X': X}, ['Z'], None)[0]
+        return Z
+
+else:
+    f_ort_ov_eager_gpu = None
+    f_ort_ov_gpu = None
+
 X = numpy.random.rand(10, CST.shape[1]).astype(CST.dtype)
 
 device = C_OrtDevice(C_OrtDevice.cpu(), OrtMemType.DEFAULT, 0)
@@ -141,6 +174,13 @@ def f_ort_ov(X):
     f_ort_ov_eager(Xov),
     f_ort_ov(Xov),
 ]
+if sess_add_gpu is not None:
+    device_gpu = C_OrtDevice(C_OrtDevice.cuda(), OrtMemType.DEFAULT, 0)
+    Xov_gpu = C_OrtValue.ortvalue_from_numpy(X, device_gpu)
+    Ys.extend([
+        f_ort_ov_eager_gpu(Xov_gpu),
+        f_ort_ov_gpu(Xov_gpu),
+    ])
 
 for i in range(1, len(Ys)):
     try:
@@ -158,17 +198,28 @@ def f_ort_ov(X):
 
 
 def benchmark(repeat=100):
-    fcts = [f_numpy, f_ort_eager, f_ort, f_ort_ov_eager, f_ort_ov]
+    fcts = [f_numpy, f_ort_eager, f_ort, f_ort_ov_eager, f_ort_ov,
+            f_ort_ov_eager_gpu, f_ort_ov_gpu]
     data = []
     for N in tqdm([1, 2, 5, 10, 20, 50, 100, 200, 500,
                    1000, 2000, 5000, 10000, 20000]):
         X = numpy.random.rand(N, CST.shape[1]).astype(CST.dtype)
         device = C_OrtDevice(C_OrtDevice.cpu(), OrtMemType.DEFAULT, 0)
         Xov = C_OrtValue.ortvalue_from_numpy(X, device)
+        if f_ort_ov_gpu is not None:
+            device_gpu = C_OrtDevice(C_OrtDevice.cuda(), OrtMemType.DEFAULT, 0)
+            Xov_gpu = C_OrtValue.ortvalue_from_numpy(X, device_gpu)
 
         for f in fcts:
+            if f is None:
+                continue
             obs = {'name': f.__doc__, "N": N}
-            if "-ov" in f.__doc__:
+            if "-gpu" in f.__doc__:
+                begin = time.perf_counter()
+                for r in range(repeat):
+                    _ = f(Xov_gpu)
+                end = time.perf_counter() - begin
+            elif "-ov" in f.__doc__:
                 begin = time.perf_counter()
                 for r in range(repeat):
                     _ = f(Xov)
@@ -185,6 +236,7 @@ def benchmark(repeat=100):
 
 
 df = benchmark()
+df.to_csv("plot_benchmark_eager_mode.csv", index=False)
 df
 
 
@@ -192,28 +244,53 @@ def benchmark(repeat=100):
 # Graphs
 # ++++++
 
-fig, ax = plt.subplots(1, 3, figsize=(12, 4))
+def make_graph(df):
+    fig, ax = plt.subplots(2, 3, figsize=(12, 8))
+
+    piv_all = df.pivot(index="N", columns="name", values="time")
+
+    # no gpu
+    piv = piv_all[[c for c in piv_all.columns if "gpu" not in c]].copy()
+    piv.plot(ax=ax[0, 0], title="Time(s) per execution", logy=True, logx=True)
+    piv2 = piv / piv.index.values.reshape((-1, 1))
+    piv2.plot(ax=ax[0, 1], title="Time(s) per execution / N", logx=True)
+    piv3 = piv / piv["numpy"].values.reshape((-1, 1))
+    piv3.plot(ax=ax[0, 2], title="Ratio against numpy (lower is better)",
+            logy=True, logx=True)
 
-piv = df.pivot(index="N", columns="name", values="time")
-piv.plot(ax=ax[0], title="Time(s) per execution", logy=True, logx=True)
-piv2 = piv / piv.index.values.reshape((-1, 1))
-piv2.plot(ax=ax[1], title="Time(s) per execution / N", logx=True)
-piv3 = piv / piv["numpy"].values.reshape((-1, 1))
-piv3.plot(ax=ax[2], title="Ratio against numpy (lower is better)",
-          logy=True, logx=True)
+    # ort value
+    piv = piv_all[[c for c in piv_all.columns if "ov" in c or "numpy" in c]].copy()
+    piv.plot(ax=ax[1, 0], title="Time(s) per execution", logy=True, logx=True)
+    piv2 = piv / piv.index.values.reshape((-1, 1))
+    piv2.plot(ax=ax[1, 1], title="Time(s) per execution / N", logx=True)
+    piv3 = piv / piv["numpy"].values.reshape((-1, 1))
+    piv3.plot(ax=ax[1, 2], title="Ratio against numpy (lower is better)",
+            logy=True, logx=True)
+    return fig, ax
 
 
+fig, ax = make_graph(df)
+
 ###################################
 # Conclusion
 # ++++++++++
 #
 # Eager mode is slower than numpy for small arrays and faster for big ones.
 # This is probably due to the cost of the :epkg:`pybind11` bindings while
 # numpy relies on the direct python C API. This could be improved with
 # :epkg:`cython`. Eager mode must use :epkg:`OrtValue`. It is faster and it
 # reduces the difference between running two additions in a single graph
-# and two graphs of a single addition.
+# and two graphs of a single addition on CPU. On GPU, a single graph is
+# still faster and eager mode is significantly slower.
 
-print(f"onnxruntime.__version__ = {ort_version!r}")
+if not has_cuda:
+    print("No local GPU, plotting stored results measured with a GPU.")
+    df = pandas.read_csv("data/eager_mode.csv")
+    _, ax = make_graph(df)
+else:
+    ax = None
+ax
 
+print(f"onnxruntime.__version__ = {ort_version!r}")
 
-plt.show()
+fig.savefig("eager.png")
+# plt.show()

From 2780900be4a18bb36fbee4d74d909871e5def8c7 Mon Sep 17 00:00:00 2001
From: xadupre
Date: Mon, 28 Nov 2022 14:35:42 +0100
Subject: [PATCH 4/4] Update plot_benchmark_eager_mode.py

---
 _doc/examples/plot_benchmark_eager_mode.py | 26 ++++++++++++++--------
 1 file changed, 17 insertions(+), 9 deletions(-)

diff --git a/_doc/examples/plot_benchmark_eager_mode.py b/_doc/examples/plot_benchmark_eager_mode.py
index 92196e76..ab011367 100644
--- a/_doc/examples/plot_benchmark_eager_mode.py
+++ b/_doc/examples/plot_benchmark_eager_mode.py
@@ -152,7 +152,6 @@ def f_ort_ov_eager_gpu(X):
         Z = sess_add_gpu._sess.run_with_ort_values({'X': T}, ['Z'], None)[0]
         return Z
 
-
     def f_ort_ov_gpu(X):
         "ort-ov-gpu"
         Z = sess_add2_gpu._sess.run_with_ort_values({'X': X}, ['Z'], None)[0]
         return Z
 
@@ -176,11 +175,18 @@ def f_ort_ov_gpu(X):
 ]
 if sess_add_gpu is not None:
     device_gpu = C_OrtDevice(C_OrtDevice.cuda(), OrtMemType.DEFAULT, 0)
-    Xov_gpu = C_OrtValue.ortvalue_from_numpy(X, device_gpu)
-    Ys.extend([
-        f_ort_ov_eager_gpu(Xov_gpu),
-        f_ort_ov_gpu(Xov_gpu),
-    ])
+    try:
+        Xov_gpu = C_OrtValue.ortvalue_from_numpy(X, device_gpu)
+        Ys.extend([
+            f_ort_ov_eager_gpu(Xov_gpu),
+            f_ort_ov_gpu(Xov_gpu),
+        ])
+    except RuntimeError:
+        # cuda is not available at runtime
+        sess_add_gpu = None
+        sess_add2_gpu = None
+        f_ort_ov_eager_gpu = None
+        f_ort_ov_gpu = None
 
 for i in range(1, len(Ys)):
     try:
@@ -256,7 +262,7 @@ def make_graph(df):
     piv2.plot(ax=ax[0, 1], title="Time(s) per execution / N", logx=True)
     piv3 = piv / piv["numpy"].values.reshape((-1, 1))
     piv3.plot(ax=ax[0, 2], title="Ratio against numpy (lower is better)",
-            logy=True, logx=True)
+              logy=True, logx=True)
 
     # ort value
     piv = piv_all[[c for c in piv_all.columns if "ov" in c or "numpy" in c]].copy()
@@ -265,7 +271,7 @@ def make_graph(df):
     piv2.plot(ax=ax[1, 1], title="Time(s) per execution / N", logx=True)
     piv3 = piv / piv["numpy"].values.reshape((-1, 1))
     piv3.plot(ax=ax[1, 2], title="Ratio against numpy (lower is better)",
-            logy=True, logx=True)
+              logy=True, logx=True)
     return fig, ax
 
 
@@ -290,7 +296,9 @@ def make_graph(df):
 if not has_cuda:
     print("No local GPU, plotting stored results measured with a GPU.")
     df = pandas.read_csv("data/eager_mode.csv")
     _, ax = make_graph(df)
 else:
     ax = None
 ax
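+######################################
+# A note on the helper above: the local ``make_graph(df)`` shadows
+# ``onnx.helper.make_graph`` imported at the top of the file. Both ONNX
+# graphs are already built when the helper is defined, so the shadowing
+# is harmless here, but renaming the helper would be safer.
+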
+
+######################################
+# The results above were obtained with the following version of
+# :epkg:`onnxruntime`.
+
 print(f"onnxruntime.__version__ = {ort_version!r}")
 
-fig.savefig("eager.png")
 # plt.show()