diff --git a/.gitignore b/.gitignore index 2441bb9..3efae15 100644 --- a/.gitignore +++ b/.gitignore @@ -70,3 +70,4 @@ _doc/examples/ort_cpu_ortvalue.csv _unittests/ut_documentation/data _unittests/ut_documentation/ort_*.csv _doc/examples/*splits*.png +_doc/examples/eager*.png diff --git a/_doc/examples/data/plot_benchmark_eager_mode.csv b/_doc/examples/data/plot_benchmark_eager_mode.csv new file mode 100644 index 0000000..53be524 --- /dev/null +++ b/_doc/examples/data/plot_benchmark_eager_mode.csv @@ -0,0 +1,211 @@ +name,N,time +numpy,1,1.2711102034680112e-06 +ort-eager,1,1.13358634176204e-05 +ort,1,6.808883180725622e-06 +ort-ov-eager,1,1.1388225820771748e-05 +ort-ov,1,6.520444517861111e-06 +ort-vect-ov-eager,1,1.2170951596653534e-05 +ort-vect-ov,1,8.38620085684813e-06 +ort-ov-bind-eager,1,1.3180973615906224e-05 +ort-ov-bind,1,8.696321750822233e-06 +ort-ov-eager-gpu,1,7.056212242768735e-05 +ort-ov-gpu,1,3.481692618334281e-05 +ort-vect-ov-eager-gpu,1,3.555464748193922e-05 +ort-vect-ov-gpu,1,2.4811498815044608e-05 +ort-ov-bind-eager-gpu,1,4.045496078652671e-05 +ort-ov-bind-gpu,1,2.7586736226665607e-05 +numpy,2,2.734601275551176e-06 +ort-eager,2,1.2075403671577034e-05 +ort,2,7.642062485847675e-06 +ort-ov-eager,2,1.221340531746166e-05 +ort-ov,2,7.38951311719538e-06 +ort-vect-ov-eager,2,1.2355767047175994e-05 +ort-vect-ov,2,9.376237369151918e-06 +ort-ov-bind-eager,2,1.3248451201744923e-05 +ort-ov-bind,2,9.651492869523173e-06 +ort-ov-eager-gpu,2,7.31769550059523e-05 +ort-ov-gpu,2,3.5939188217001456e-05 +ort-vect-ov-eager-gpu,2,3.82672866985782e-05 +ort-vect-ov-gpu,2,3.82050041403828e-05 +ort-ov-bind-eager-gpu,2,3.984901017352264e-05 +ort-ov-bind-gpu,2,2.7437667967271557e-05 +numpy,5,3.371258473438108e-06 +ort-eager,5,1.315288568274623e-05 +ort,5,7.733609667891108e-06 +ort-ov-eager,5,1.1391451651786244e-05 +ort-ov,5,7.110731346965438e-06 +ort-vect-ov-eager,5,1.2713854214334899e-05 +ort-vect-ov,5,8.936898459814504e-06 +ort-ov-bind-eager,5,1.3004758204068473e-05 
+ort-ov-bind,5,9.634384796281332e-06 +ort-ov-eager-gpu,5,7.204147345595518e-05 +ort-ov-gpu,5,3.4798317110824925e-05 +ort-vect-ov-eager-gpu,5,3.641794677098572e-05 +ort-vect-ov-gpu,5,2.5500802780277797e-05 +ort-ov-bind-eager-gpu,5,3.897886624590421e-05 +ort-ov-bind-gpu,5,2.6711501113916796e-05 +numpy,10,3.99789578824459e-06 +ort-eager,10,1.2989837375196468e-05 +ort,10,8.63157538790818e-06 +ort-ov-eager,10,1.188740721009972e-05 +ort-ov,10,7.536177843282319e-06 +ort-vect-ov-eager,10,1.2986518222054369e-05 +ort-vect-ov,10,9.170969528794047e-06 +ort-ov-bind-eager,10,1.3924449184394586e-05 +ort-ov-bind,10,1.0086918655638783e-05 +ort-ov-eager-gpu,10,7.289471841246194e-05 +ort-ov-gpu,10,3.498931489656308e-05 +ort-vect-ov-eager-gpu,10,3.650360053679982e-05 +ort-vect-ov-gpu,10,2.4900201075085014e-05 +ort-ov-bind-eager-gpu,10,3.90811277641589e-05 +ort-ov-bind-gpu,10,2.6648614517192252e-05 +numpy,20,5.098671245506269e-06 +ort-eager,20,1.54399778226383e-05 +ort,20,9.723499106239505e-06 +ort-ov-eager,20,1.3414586822862827e-05 +ort-ov,20,8.937757405819314e-06 +ort-vect-ov-eager,20,1.453850413616068e-05 +ort-vect-ov,20,1.0608864048945493e-05 +ort-ov-bind-eager,20,1.5095967328085041e-05 +ort-ov-bind,20,1.1017649669708521e-05 +ort-ov-eager-gpu,20,7.634075496307745e-05 +ort-ov-gpu,20,3.63207728828633e-05 +ort-vect-ov-eager-gpu,20,3.689129719487773e-05 +ort-vect-ov-gpu,20,2.45928172943865e-05 +ort-ov-bind-eager-gpu,20,3.898421608087886e-05 +ort-ov-bind-gpu,20,2.8260121573261993e-05 +numpy,50,9.188749538155977e-06 +ort-eager,50,1.9810953781562276e-05 +ort,50,1.3596643890503427e-05 +ort-ov-eager,50,1.6675594745339046e-05 +ort-ov,50,1.19556604668197e-05 +ort-vect-ov-eager,50,1.8200560648235596e-05 +ort-vect-ov,50,1.3913243298719186e-05 +ort-ov-bind-eager,50,1.8310926928434794e-05 +ort-ov-bind,50,1.4871118216568608e-05 +ort-ov-eager-gpu,50,8.813127975112625e-05 +ort-ov-gpu,50,3.9916390796898574e-05 +ort-vect-ov-eager-gpu,50,3.693576212247193e-05 +ort-vect-ov-gpu,50,2.4740043233934736e-05 
+ort-ov-bind-eager-gpu,50,3.9929252907388496e-05 +ort-ov-bind-gpu,50,2.6972319681159e-05 +numpy,100,1.5112248534907798e-05 +ort-eager,100,2.7329642718366113e-05 +ort,100,1.9632307746985428e-05 +ort-ov-eager,100,2.2391379095478763e-05 +ort-ov,100,1.6873115156259802e-05 +ort-vect-ov-eager,100,2.6304204571192804e-05 +ort-vect-ov,100,1.8689596296323967e-05 +ort-ov-bind-eager,100,2.377623776390361e-05 +ort-ov-bind,100,1.9281802418245318e-05 +ort-ov-eager-gpu,100,0.00010847913871082117 +ort-ov-gpu,100,4.537494692204859e-05 +ort-vect-ov-eager-gpu,100,3.7859098558298876e-05 +ort-vect-ov-gpu,100,2.502607185493015e-05 +ort-ov-bind-eager-gpu,100,4.014694452891318e-05 +ort-ov-bind-gpu,100,2.7043863977353597e-05 +numpy,200,2.917542704614346e-05 +ort-eager,200,4.151081535441569e-05 +ort,200,3.140525549455308e-05 +ort-ov-eager,200,3.316792525208345e-05 +ort-ov,200,2.6052282897360398e-05 +ort-vect-ov-eager,200,3.334714772805632e-05 +ort-vect-ov,200,2.832363693054924e-05 +ort-ov-bind-eager,200,3.367213000414696e-05 +ort-ov-bind,200,2.900986954031129e-05 +ort-ov-eager-gpu,200,0.00014148382873901602 +ort-ov-gpu,200,6.213097129367551e-05 +ort-vect-ov-eager-gpu,200,3.718186268147814e-05 +ort-vect-ov-gpu,200,2.4685012253127547e-05 +ort-ov-bind-eager-gpu,200,3.9644076090321376e-05 +ort-ov-bind-gpu,200,2.7330868070964348e-05 +numpy,500,6.52488797458863e-05 +ort-eager,500,8.870200997398021e-05 +ort,500,6.801239449073611e-05 +ort-ov-eager,500,6.0614623033606835e-05 +ort-ov,500,5.285868684601302e-05 +ort-vect-ov-eager,500,6.496559108347305e-05 +ort-vect-ov,500,5.4099669294624495e-05 +ort-ov-bind-eager,500,6.085498981571537e-05 +ort-ov-bind,500,5.60753888691455e-05 +ort-ov-eager-gpu,500,0.0002297830632507649 +ort-ov-gpu,500,0.00010358515326009876 +ort-vect-ov-eager-gpu,500,3.7701112708672274e-05 +ort-vect-ov-gpu,500,2.5178429152380514e-05 +ort-ov-bind-eager-gpu,500,4.01915926792862e-05 +ort-ov-bind-gpu,500,2.758014015853405e-05 +numpy,1000,0.00012219335462987602 
+ort-eager,1000,0.0001355954237847054 +ort,1000,0.00011039443873431181 +ort-ov-eager,1000,8.403399988310412e-05 +ort-ov,1000,6.696581017376497e-05 +ort-vect-ov-eager,1000,6.78829272345523e-05 +ort-vect-ov,1000,6.176580714617249e-05 +ort-ov-bind-eager,1000,7.399409556026926e-05 +ort-ov-bind,1000,8.521743366293715e-05 +ort-ov-eager-gpu,1000,0.00037906445510571405 +ort-ov-gpu,1000,0.00011412395285309153 +ort-vect-ov-eager-gpu,1000,4.747279511276083e-05 +ort-vect-ov-gpu,1000,2.876006557321218e-05 +ort-ov-bind-eager-gpu,1000,4.042265625333283e-05 +ort-ov-bind-gpu,1000,2.8134765074655568e-05 +numpy,2000,0.0002388510924116914 +ort-eager,2000,0.00044227690687553297 +ort,2000,0.00020837497959660012 +ort-ov-eager,2000,0.00013327595543086043 +ort-ov,2000,0.00010776344534693932 +ort-vect-ov-eager,2000,0.0001355220409125456 +ort-vect-ov,2000,7.620716032499271e-05 +ort-ov-bind-eager,2000,0.00017507457412964056 +ort-ov-bind,2000,7.764993359129954e-05 +ort-ov-eager-gpu,2000,0.0006141934830035704 +ort-ov-gpu,2000,0.00018641628445784667 +ort-vect-ov-eager-gpu,2000,3.88771848275694e-05 +ort-vect-ov-gpu,2000,2.6443645550637557e-05 +ort-ov-bind-eager-gpu,2000,4.167438551815132e-05 +ort-ov-bind-gpu,2000,2.8127812820070603e-05 +numpy,5000,0.0006044526570335482 +ort-eager,5000,0.0008715660814956135 +ort,5000,0.00042021945334932547 +ort-ov-eager,5000,0.0001657667077476314 +ort-ov,5000,0.000347322947345674 +ort-vect-ov-eager,5000,0.00019283137195050083 +ort-vect-ov,5000,0.00017095919352986158 +ort-ov-bind-eager,5000,0.00017653256604124022 +ort-ov-bind,5000,0.00037077100113402684 +ort-ov-eager-gpu,5000,0.0010676379005114236 +ort-ov-gpu,5000,0.00037449517998886244 +ort-vect-ov-eager-gpu,5000,4.251485171897168e-05 +ort-vect-ov-gpu,5000,2.990845970852279e-05 +ort-ov-bind-eager-gpu,5000,4.457795402295021e-05 +ort-ov-bind-gpu,5000,3.0521600616767125e-05 +numpy,10000,0.0011958029187683547 +ort-eager,10000,0.003115397562699703 +ort,10000,0.0012505987010143222 
+ort-ov-eager,10000,0.0003016990199482635 +ort-ov,10000,0.0002381296440338095 +ort-vect-ov-eager,10000,0.00032288288507102567 +ort-vect-ov,10000,0.0003793603486143226 +ort-ov-bind-eager,10000,0.000622660714379024 +ort-ov-bind,10000,0.00017427370838094048 +ort-ov-eager-gpu,10000,0.0021718234987929464 +ort-ov-gpu,10000,0.0008711783358683953 +ort-vect-ov-eager-gpu,10000,6.672728953785018e-05 +ort-vect-ov-gpu,10000,4.796367579114598e-05 +ort-ov-bind-eager-gpu,10000,9.261158346715901e-05 +ort-ov-bind-gpu,10000,4.8890200975750176e-05 +numpy,20000,0.0027364153356757015 +ort-eager,20000,0.0067275110429719735 +ort,20000,0.003118208863518455 +ort-ov-eager,20000,0.0005569194742877569 +ort-ov,20000,0.001740831701317802 +ort-vect-ov-eager,20000,0.0009374135284145412 +ort-vect-ov,20000,0.0009072552202269435 +ort-ov-bind-eager,20000,0.0008944365921813776 +ort-ov-bind,20000,0.002242563001345843 +ort-ov-eager-gpu,20000,0.00445372259709984 +ort-ov-gpu,20000,0.001619209784881345 +ort-vect-ov-eager-gpu,20000,9.884369165564959e-05 +ort-vect-ov-gpu,20000,8.044799324125051e-05 +ort-ov-bind-eager-gpu,20000,0.00011459709441458637 +ort-ov-bind-gpu,20000,7.989798905327916e-05 diff --git a/_doc/examples/plot_benchmark_eager_mode.py b/_doc/examples/plot_benchmark_eager_mode.py index ab01136..d82fd1b 100644 --- a/_doc/examples/plot_benchmark_eager_mode.py +++ b/_doc/examples/plot_benchmark_eager_mode.py @@ -20,7 +20,8 @@ It is possible to do the same with :epkg:`onnxruntime`. This example compares the performance of a couple of scenarios. This work is close to what is done in example -:ref:`benchmark-ort-api`. +:ref:`benchmark-ort-api`. The example compares the performance +of a couple of methods for CPU and GPU. .. 
contents:: :local: @@ -46,10 +47,17 @@ make_model, make_node, make_graph, make_tensor_value_info) from onnxruntime import ( - get_all_providers, InferenceSession, __version__ as ort_version) + get_all_providers, InferenceSession, __version__ as ort_version, + RunOptions) from onnxruntime.capi._pybind_state import ( # pylint: disable=E0611 OrtDevice as C_OrtDevice, - OrtMemType, OrtValue as C_OrtValue) + OrtMemType, OrtValue as C_OrtValue, + SessionIOBinding as C_SessionIOBinding) +try: + from onnxruntime.capi._pybind_state import OrtValueVector +except ImportError: + # You need onnxruntime>=1.14 + OrtValueVector = None from mlprodict.testing.experimental_c_impl.experimental_c import code_optimisation ############################################ @@ -144,6 +152,70 @@ def f_ort_ov(X): return Z +cpu_device = C_OrtDevice(C_OrtDevice.cpu(), OrtMemType.DEFAULT, 0) + + +def f_ort_ov_bind_eager(X): + "ort-ov-bind-eager" + bind = C_SessionIOBinding(sess_add._sess) + bind.bind_ortvalue_input("X", X) + bind.bind_output("Z", cpu_device) + sess_add._sess.run_with_iobinding(bind, None) + T = bind.get_outputs()[0] + bind.bind_ortvalue_input("X", T) + sess_add._sess.run_with_iobinding(bind, None) + return bind.get_outputs()[0] + + +def f_ort_ov_bind(X): + "ort-ov-bind" + bind = C_SessionIOBinding(sess_add2._sess) + bind.bind_ortvalue_input("X", X) + bind.bind_output("Z", cpu_device) + sess_add2._sess.run_with_iobinding(bind, None) + return bind.get_outputs()[0] + +####################################### +# onnxruntime >= 1.14 introduces a vector of OrtValues +# to bypass the building of a dictionary. 
+ + +if OrtValueVector is not None: + + run_options = RunOptions() + devices = [C_OrtDevice(C_OrtDevice.cpu(), OrtMemType.DEFAULT, 0)] + + def f_ort_vect_ov_eager(X): + "ort-vect-ov-eager" + vect_in = OrtValueVector() + vect_in.push_back(X) + vect_out = OrtValueVector() + temp_vect_out = OrtValueVector() + sess_add._sess.run_with_ortvaluevector( + run_options, ["X"], vect_in, ["Z"], temp_vect_out, devices) + assert len(temp_vect_out) == 1 + sess_add._sess.run_with_ortvaluevector( + run_options, ["X"], temp_vect_out, ["Z"], vect_out, devices) + assert len(vect_out) == 1 + return vect_out[0] + + def f_ort_vect_ov(X): + "ort-vect-ov" + vect_in = OrtValueVector() + vect_in.push_back(X) + vect_out = OrtValueVector() + sess_add2._sess.run_with_ortvaluevector( + run_options, ["X"], vect_in, ["Z"], vect_out, devices) + assert len(vect_out) == 1 + return vect_out[0] + +else: + f_ort_vect_ov_eager = None + f_ort_vect_ov = None + +######################################### +# If GPU is available. + if sess_add_gpu is not None: def f_ort_ov_eager_gpu(X): @@ -157,9 +229,71 @@ def f_ort_ov_gpu(X): Z = sess_add2_gpu._sess.run_with_ort_values({'X': X}, ['Z'], None)[0] return Z + gpu_device = C_OrtDevice(C_OrtDevice.cuda(), OrtMemType.DEFAULT, 0) + + def f_ort_ov_bind_eager_gpu(X): + "ort-ov-bind-eager-gpu" + bind = C_SessionIOBinding(sess_add_gpu._sess) + bind.bind_ortvalue_input("X", X) + bind.bind_output("Z", gpu_device) + sess_add_gpu._sess.run_with_iobinding(bind, None) + T = bind.get_outputs()[0] + bind.bind_ortvalue_input("X", T) + sess_add_gpu._sess.run_with_iobinding(bind, None) + return bind.get_outputs()[0] + + def f_ort_ov_bind_gpu(X): + "ort-ov-bind-gpu" + bind = C_SessionIOBinding(sess_add2_gpu._sess) + bind.bind_ortvalue_input("X", X) + bind.bind_output("Z", gpu_device) + sess_add2_gpu._sess.run_with_iobinding(bind, None) + return bind.get_outputs()[0] + + if OrtValueVector is not None: + + run_options = RunOptions() + gpu_devices = [C_OrtDevice(C_OrtDevice.cuda(),
OrtMemType.DEFAULT, 0)] + + def f_ort_vect_ov_eager_gpu(X): + "ort-vect-ov-eager-gpu" + vect_in = OrtValueVector() + vect_in.push_back(X) + vect_out = OrtValueVector() + temp_vect_out = OrtValueVector() + sess_add_gpu._sess.run_with_ortvaluevector( + run_options, ["X"], vect_in, ["Z"], temp_vect_out, gpu_devices) + sess_add_gpu._sess.run_with_ortvaluevector( + run_options, ["X"], temp_vect_out, ["Z"], vect_out, gpu_devices) + assert len(vect_out) == 1 + return vect_out[0] + + def f_ort_vect_ov_gpu(X): + "ort-vect-ov-gpu" + vect_in = OrtValueVector() + vect_in.push_back(X) + vect_out = OrtValueVector() + # crashes on the next line + sess_add2_gpu._sess.run_with_ortvaluevector( + run_options, ["X"], vect_in, ["Z"], vect_out, gpu_devices) + assert len(vect_out) == 1 + return vect_out[0] + + else: + f_ort_vect_ov_eager_gpu = None + f_ort_vect_ov_gpu = None + else: f_ort_ov_eager_gpu = None f_ort_ov_gpu = None + f_ort_vect_ov_eager_gpu = None + f_ort_vect_ov_gpu = None + f_ort_ov_bind_eager_gpu = None + f_ort_ov_bind_gpu = None + + +####################################### +# Let's now check all these functions produce the same results.
X = numpy.random.rand(10, CST.shape[1]).astype(CST.dtype) @@ -167,33 +301,59 @@ def f_ort_ov_gpu(X): Xov = C_OrtValue.ortvalue_from_numpy(X, device) Ys = [ - f_numpy(X), - f_ort_eager(X), - f_ort(X), - f_ort_ov_eager(Xov), - f_ort_ov(Xov), + (f_numpy, X), + (f_ort_eager, X), + (f_ort, X), + (f_ort_ov_eager, Xov), + (f_ort_ov, Xov), + (f_ort_ov_bind_eager, Xov), + (f_ort_ov_bind, Xov), ] + +if OrtValueVector is not None: + Ys.extend([ + (f_ort_vect_ov_eager, Xov), + (f_ort_vect_ov, Xov), + ]) + if sess_add_gpu is not None: device_gpu = C_OrtDevice(C_OrtDevice.cuda(), OrtMemType.DEFAULT, 0) try: Xov_gpu = C_OrtValue.ortvalue_from_numpy(X, device_gpu) Ys.extend([ - f_ort_ov_eager_gpu(Xov_gpu), - f_ort_ov_gpu(Xov_gpu), + (f_ort_ov_eager_gpu, Xov_gpu), + (f_ort_ov_gpu, Xov_gpu), + (f_ort_ov_bind_eager_gpu, Xov_gpu), + (f_ort_ov_bind_gpu, Xov_gpu), ]) + if OrtValueVector is not None: + Ys.extend([ + (f_ort_vect_ov_gpu, Xov_gpu), + (f_ort_vect_ov_eager_gpu, Xov_gpu), + ]) except RuntimeError: # cuda is not available sess_add_gpu = None - sess_add2_gpu + sess_add2_gpu = None f_ort_ov_eager_gpu = None f_ort_ov_gpu = None - -for i in range(1, len(Ys)): + f_ort_ov_bind_eager_gpu = None + f_ort_ov_bind_gpu = None + f_ort_vect_ov_eager_gpu = None + f_ort_vect_ov_gpu = None + +results = [] +for fct, x in Ys: + print( + f"check function {fct.__name__!r} and input type {x.__class__.__name__!r}") + results.append(fct(x)) + +for i in range(1, len(results)): try: - assert_allclose(Ys[0], Ys[i]) + assert_allclose(results[0], results[i]) except TypeError: # OrtValue - assert_allclose(Ys[0], Ys[i].numpy()) + assert_allclose(results[0], results[i].numpy()) ########################################## # All outputs are the same. 
@@ -203,9 +363,15 @@ def f_ort_ov_gpu(X): # +++++++++++++++++++++++ -def benchmark(repeat=100): - fcts = [f_numpy, f_ort_eager, f_ort, f_ort_ov_eager, f_ort_ov, - f_ort_ov_eager_gpu, f_ort_ov_gpu] +def benchmark(repeat=500000): + fcts = [ + f_numpy, f_ort_eager, f_ort, f_ort_ov_eager, f_ort_ov, + f_ort_vect_ov_eager, f_ort_vect_ov, + f_ort_ov_bind_eager, f_ort_ov_bind, + f_ort_ov_eager_gpu, f_ort_ov_gpu, + f_ort_vect_ov_eager_gpu, f_ort_vect_ov_gpu, + f_ort_ov_bind_eager_gpu, f_ort_ov_bind_gpu, + ] data = [] for N in tqdm([1, 2, 5, 10, 20, 50, 100, 200, 500, 1000, 2000, 5000, 10000, 20000]): @@ -216,26 +382,27 @@ def benchmark(repeat=100): device_gpu = C_OrtDevice(C_OrtDevice.cuda(), OrtMemType.DEFAULT, 0) Xov_gpu = C_OrtValue.ortvalue_from_numpy(X, device_gpu) + n_rep = max(1, min(500, int(repeat / N))) for f in fcts: if f is None: continue obs = {'name': f.__doc__, "N": N} if "-gpu" in f.__doc__: begin = time.perf_counter() - for r in range(repeat): + for r in range(n_rep): _ = f(Xov_gpu) end = time.perf_counter() - begin elif "-ov" in f.__doc__: begin = time.perf_counter() - for r in range(repeat): + for r in range(n_rep): _ = f(Xov) end = time.perf_counter() - begin else: begin = time.perf_counter() - for r in range(repeat): + for r in range(n_rep): _ = f(X) end = time.perf_counter() - begin - obs['time'] = end / repeat + obs['time'] = end / n_rep data.append(obs) return pandas.DataFrame(data) @@ -251,27 +418,53 @@ def benchmark(repeat=100): # ++++++ def make_graph(df): - fig, ax = plt.subplots(2, 3, figsize=(12, 8)) + + def subgraph(row, cols): + if "numpy" not in cols: + cols = cols + ["numpy"] + piv = piv_all[cols].copy() + piv.plot(ax=ax[row, 0], + title="Time execution(s)" if row == 0 else "", + logy=True, logx=True) + piv2 = piv / piv.index.values.reshape((-1, 1)) + piv2.plot(ax=ax[row, 1], + title="Time(s) per execution / N" if row == 0 else "", + logx=True) + piv3 = piv / piv["numpy"].values.reshape((-1, 1)) + piv3.plot(ax=ax[row, 2], + title="Ratio against numpy" if row == 0 else
"", + logy=True, logx=True) + for j in range(0, 3): + ax[row, j].legend(fontsize="x-small") + + fig, ax = plt.subplots(5, 3, figsize=(15, 9)) + fig.suptitle("Time execution Eager Add + Add - lower is better") piv_all = df.pivot(index="N", columns="name", values="time") - # no gpu - piv = piv_all[[c for c in piv_all.columns if "gpu" not in c]].copy() - piv.plot(ax=ax[0, 0], title="Time(s) per execution", logy=True, logx=True) - piv2 = piv / piv.index.values.reshape((-1, 1)) - piv2.plot(ax=ax[0, 1], title="Time(s) per execution / N", logx=True) - piv3 = piv / piv["numpy"].values.reshape((-1, 1)) - piv3.plot(ax=ax[0, 2], title="Ratio against numpy (lower is better)", - logy=True, logx=True) - - # ort value - piv = piv_all[[c for c in piv_all.columns if "ov" in c or "numpy" in c]].copy() - piv.plot(ax=ax[1, 0], title="Time(s) per execution", logy=True, logx=True) - piv2 = piv / piv.index.values.reshape((-1, 1)) - piv2.plot(ax=ax[1, 1], title="Time(s) per execution / N", logx=True) - piv3 = piv / piv["numpy"].values.reshape((-1, 1)) - piv3.plot(ax=ax[1, 2], title="Ratio against numpy (lower is better)", - logy=True, logx=True) + # no gpu, no vect, no bind + subgraph(0, [c for c in piv_all.columns + if "-gpu" not in c and "-vect" not in c and "-bind" not in c]) + + # no gpu, ov, no bind + subgraph(1, [c for c in piv_all.columns + if "-gpu" not in c and "-ov" in c and "-bind" not in c]) + + # no gpu, vect or bind + subgraph(2, [c for c in piv_all.columns + if "-gpu" not in c and ("-bind" in c or '-vect' in c)]) + + # gpu, no bind + cols = [c for c in piv_all.columns + if "-gpu" in c and "-ov" in c and "-bind" not in c] + subgraph(3, cols) + + # gpu, vect or bind + cols = [c for c in piv_all.columns + if "-gpu" in c and ("-bind" in c or '-vect' in c)] + subgraph(4, cols) + fig.savefig("eager_mode_cpu.png" if len(cols) == 0 + else "eager_mode_gpu.png", dpi=250) return fig, ax @@ -286,7 +479,16 @@ def make_graph(df): # is using the direct python API.
This could be improved by using :epkg:`cython`. # Eager mode must use :epkg:`OrtValue`. It is faster and it reduces the differences # between using two additions in a single graph or two graphs of a single addition -# on CPU. On GPU, it is still faster but eager mode is significantly slower. +# on CPU. On GPU, it is still faster but eager mode is slightly slower with +# method `run_with_ortvaluevector` or `run_with_iobinding`. Both +# methods show similar performance. +# +# However, method `run_with_ort_values` is not recommended +# because the output device cannot be specified. Therefore, +# :epkg:`onnxruntime` requests the output on CPU. In eager mode, +# this output is used again as an input for the second call to +# `run_with_ort_values` and the data needs to be copied from CPU +# to GPU. if not has_cuda: print("With GPU") diff --git a/_doc/sphinxdoc/source/tutorials/tutorial_parallel/index.rst b/_doc/sphinxdoc/source/tutorials/tutorial_parallel/index.rst index 57a3f61..9965c36 100644 --- a/_doc/sphinxdoc/source/tutorials/tutorial_parallel/index.rst +++ b/_doc/sphinxdoc/source/tutorials/tutorial_parallel/index.rst @@ -26,19 +26,18 @@ The tutorial was tested with following version: .. runpython:: :showcode: + import sys import numpy import scipy import onnx import onnxruntime import onnxcustom + import sklearn import torch print("python {}".format(sys.version_info)) - mods = [numpy, scipy, sklearn, lightgbm, xgboost, - onnx, onnxmltools, onnxruntime, onnxcustom, - onnxconverter_common, - skl2onnx, mlprodict, pyquickhelper, - torch] + mods = [numpy, scipy, sklearn, onnx, + onnxruntime, onnxcustom, torch] mods = [(m.__name__, m.__version__) for m in mods] mx = max(len(_[0]) for _ in mods) + 1 for name, vers in sorted(mods):