From 42f48e90ef22f2ee4ced6bc981f38f88600201d8 Mon Sep 17 00:00:00 2001
From: xadupre
Date: Fri, 25 Nov 2022 17:03:37 +0100
Subject: [PATCH 1/4] Adds an example to benchmark a kind of eager mode

---
 _doc/examples/plot_benchmark_eager_mode.py    | 220 ++++++++++++++++++
 _doc/examples/plot_benchmark_onnx_function.py |   4 +-
 _doc/examples/plot_benchmark_ort_api.py       |  15 +-
 _doc/sphinxdoc/source/conf.py                 |   6 +-
 .../tutorial_bench/tutorial_benchmark.rst     |   1 +
 5 files changed, 231 insertions(+), 15 deletions(-)
 create mode 100644 _doc/examples/plot_benchmark_eager_mode.py

diff --git a/_doc/examples/plot_benchmark_eager_mode.py b/_doc/examples/plot_benchmark_eager_mode.py
new file mode 100644
index 00000000..94811aa5
--- /dev/null
+++ b/_doc/examples/plot_benchmark_eager_mode.py
@@ -0,0 +1,220 @@
"""
.. _benchmark-ort-eager-mode:

Benchmark onnxruntime API: eager mode
=====================================

:epkg:`pytorch` or :epkg:`tensorflow` usually run faster when the
deep learning model is executed entirely outside python. The python code
only builds the model and then triggers the execution of the whole
graph. In that configuration, there is no way to look at
intermediate results.

That does not make it easy to debug or to investigate what is going on:
what the user writes is not what is executed.
Eager mode refers to the opposite situation: the code which defines
the model is the same code which executes it. Everything happens
in python. It is slower, but the gap is small if the model
manipulates big matrices.

It is possible to do the same with :epkg:`onnxruntime`.
This example compares the performance of a couple of
scenarios. This work is close to what is done in example
:ref:`benchmark-ort-api`.

.. contents::
    :local:

The scenario
++++++++++++

We would like to compare two codes. The first one
executes two additions in a single onnx graph. The second
one executes the same two additions, each of them calling
:epkg:`onnxruntime` for a single addition.

"""
import time
import numpy
from numpy.testing import assert_allclose
import pandas
import matplotlib.pyplot as plt
from tqdm import tqdm
from onnx import TensorProto
from onnx.numpy_helper import from_array
from onnx.helper import (
    make_model, make_node, set_model_props, make_tensor,
    make_graph, make_tensor_value_info)
from onnxruntime import InferenceSession, __version__ as ort_version
from onnxruntime.capi._pybind_state import (  # pylint: disable=E0611
    SessionIOBinding, OrtDevice as C_OrtDevice,
    OrtMemType, OrtValue as C_OrtValue, RunOptions)
from cpyquickhelper.numbers.speed_measure import measure_time
from mlprodict.testing.experimental_c_impl.experimental_c import code_optimisation

############################################
# Available optimisation on this machine.

print(code_optimisation())
repeat = 250
number = 250

############################################
# A single addition of a two-dimensional matrix.

CST = numpy.array(list(range(100))).reshape(1, -1).astype(numpy.float32)
X = make_tensor_value_info('X', TensorProto.FLOAT, [None, CST.shape[1]])
Z = make_tensor_value_info('Z', TensorProto.FLOAT, [None, CST.shape[1]])

graph = make_graph([
    make_node("Add", ['X', 'Y'], ['Z']),
], '', [X], [Z], [
    from_array(CST, name='Y'),
])
onnx_add = make_model(graph)
sess_add = InferenceSession(onnx_add.SerializeToString(),
                            providers=["CPUExecutionProvider"])
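#############################################
# A quick sanity check, a small sketch which is not part of the measured
# scenarios: the single addition graph must match the numpy computation.

x_check = numpy.ones((2, CST.shape[1]), dtype=numpy.float32)
assert_allclose(x_check + CST, sess_add.run(['Z'], {'X': x_check})[0])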
#############################################
# Two additions of the same matrix.

graph = make_graph([
    make_node("Add", ['X', 'Y'], ['T']),
    make_node("Add", ['T', 'Y'], ['Z']),
], '', [X], [Z], [
    from_array(CST, 'Y'),
])
onnx_add2 = make_model(graph)
sess_add2 = InferenceSession(onnx_add2.SerializeToString(),
                             providers=["CPUExecutionProvider"])

############################################
# The functions to test
# +++++++++++++++++++++
#
# * `numpy`: :epkg:`numpy`
# * `ort`: :epkg:`onnxruntime` + numpy array as input, both additions
#   in a single graph
# * `ort-eager`: :epkg:`onnxruntime` + numpy array as input, one call
#   per addition
# * `ort-ov`: :epkg:`onnxruntime` + :epkg:`C_OrtValue` as input, both
#   additions in a single graph
# * `ort-ov-eager`: :epkg:`onnxruntime` + :epkg:`C_OrtValue` as input,
#   one call per addition


def f_numpy(X):
    "numpy"
    T = X + CST
    Z = T + CST
    return Z


def f_ort_eager(X):
    "ort-eager"
    T = sess_add._sess.run(['Z'], {'X': X}, None)[0]
    Z = sess_add._sess.run(['Z'], {'X': T}, None)[0]
    return Z


def f_ort(X):
    "ort"
    Z = sess_add2._sess.run(['Z'], {'X': X}, None)[0]
    return Z


def f_ort_ov_eager(X):
    "ort-ov-eager"
    T = sess_add._sess.run_with_ort_values({'X': X}, ['Z'], None)[0]
    Z = sess_add._sess.run_with_ort_values({'X': T}, ['Z'], None)[0]
    return Z


def f_ort_ov(X):
    "ort-ov"
    Z = sess_add2._sess.run_with_ort_values({'X': X}, ['Z'], None)[0]
    return Z


X = numpy.random.rand(10, CST.shape[1]).astype(CST.dtype)

device = C_OrtDevice(C_OrtDevice.cpu(), OrtMemType.DEFAULT, 0)
Xov = C_OrtValue.ortvalue_from_numpy(X, device)

Ys = [
    f_numpy(X),
    f_ort_eager(X),
    f_ort(X),
    f_ort_ov_eager(Xov),
    f_ort_ov(Xov),
]

for i in range(1, len(Ys)):
    try:
        assert_allclose(Ys[0], Ys[i])
    except TypeError:
        # OrtValue
        assert_allclose(Ys[0], Ys[i].numpy())

##########################################
# All outputs are the same.

##############################
# Benchmark the functions
# +++++++++++++++++++++++


def benchmark(repeat=100):
    fcts = [f_numpy, f_ort_eager, f_ort, f_ort_ov_eager, f_ort_ov]
    data = []
    for N in tqdm([1, 2, 5, 10, 20, 50, 100, 200, 500,
                   1000, 2000, 5000, 10000, 20000]):
        X = numpy.random.rand(N, CST.shape[1]).astype(CST.dtype)
        device = C_OrtDevice(C_OrtDevice.cpu(), OrtMemType.DEFAULT, 0)
        Xov = C_OrtValue.ortvalue_from_numpy(X, device)

        for f in fcts:
            obs = {'name': f.__doc__, "N": N}
            if "-ov" in f.__doc__:
                begin = time.perf_counter()
                for r in range(repeat):
                    _ = f(Xov)
                end = time.perf_counter() - begin
            else:
                begin = time.perf_counter()
                for r in range(repeat):
                    _ = f(X)
                end = time.perf_counter() - begin
            obs['time'] = end / repeat
            data.append(obs)

    return pandas.DataFrame(data)


df = benchmark()
df


########################################
# Graphs
# ++++++

fig, ax = plt.subplots(1, 3, figsize=(12, 4))

piv = df.pivot(index="N", columns="name", values="time")
piv.plot(ax=ax[0], title="Time(s) per execution", logy=True, logx=True)
piv2 = piv / piv.index.values.reshape((-1, 1))
piv2.plot(ax=ax[1], title="Time(s) per execution / N", logx=True)
piv3 = piv / piv["numpy"].values.reshape((-1, 1))
piv3.plot(ax=ax[2], title="Ratio against numpy (lower is better)",
          logy=True, logx=True)


###################################
# Conclusion
# ++++++++++
#
# Eager mode is slower than numpy for small arrays and faster for big ones.
# This is probably due to the cost of the :epkg:`pybind11` bindings while
# numpy relies on the direct python C API. This could be improved with
# :epkg:`cython`. Eager mode must use :epkg:`OrtValue`. It is faster and it
# reduces the difference between running two additions in a single graph
# and two graphs of a single addition.
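###################################
# A small illustration, a sketch which is not one of the measured
# scenarios: a :epkg:`C_OrtValue` wraps a tensor placed on a given
# device and converts back to numpy whenever an intermediate result
# must be inspected, which is what eager mode is about.

ov = C_OrtValue.ortvalue_from_numpy(X, device)
print(ov.shape(), ov.numpy()[0, :5])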
+ +print(f"onnxruntime.__version__ = {ort_version!r}") + + +plt.show() diff --git a/_doc/examples/plot_benchmark_onnx_function.py b/_doc/examples/plot_benchmark_onnx_function.py index 64912233..a38c8096 100644 --- a/_doc/examples/plot_benchmark_onnx_function.py +++ b/_doc/examples/plot_benchmark_onnx_function.py @@ -19,7 +19,7 @@ method `onnxruntime.InferenceSession.run` * `bind`: inference through an ONNX graph executed with method `onnxruntime.InferenceSession.run_with_iobinding` -* `run`: inference through an ONNX graph executed with +* `inplace`: inference through an ONNX graph executed with method `onnxruntime.InferenceSession.run_with_iobinding` but without counting the binding assuming input buffers are reused and do not need binding again @@ -114,7 +114,7 @@ def benchmark(name, onx, fct_numpy, *args, ms = measure_time( lambda: nobind_just_run(sess._sess, bind)) - ms.update(dict(name=name, impl='run', dim=dim)) + ms.update(dict(name=name, impl='inplace', dim=dim)) rows.append(ms) return rows diff --git a/_doc/examples/plot_benchmark_ort_api.py b/_doc/examples/plot_benchmark_ort_api.py index 3e35d8e8..3bc234d0 100644 --- a/_doc/examples/plot_benchmark_ort_api.py +++ b/_doc/examples/plot_benchmark_ort_api.py @@ -1,8 +1,8 @@ """ .. _benchmark-ort-api: -Benchmark onnxruntime API: run or ... -===================================== +Benchmark onnxruntime API: run or run_with_ort_values +===================================================== This short code compares different methods to call onnxruntime API. @@ -17,7 +17,6 @@ py-spy record -o plot_benchmark_ort_api.svg -r 10 --native -- python plot_benchmark_ort_api.py - .. contents:: :local: @@ -101,12 +100,12 @@ obs = measure_time(lambda: sess.run(None, {'X': X}), context=dict(sess=sess, X=X), repeat=repeat, number=number) -obs['name'] = 'ort-run' +obs['name'] = 'ort' data.append(obs) ################################### -# onnxruntime: run +# onnxruntime: run from C API print('ort-c') sess = InferenceSession(onx.SerializeToString(), providers=['CPUExecutionProvider']) @@ -121,7 +120,7 @@ ################################### -# onnxruntime: run_with_ort_values +# onnxruntime: run_with_ort_values from C API print('ort-ov-c') device = C_OrtDevice(C_OrtDevice.cpu(), OrtMemType.DEFAULT, 0) @@ -136,12 +135,12 @@ {'X': Xov}, output_names, ro), context=dict(sess=sess), repeat=repeat, number=number) -obs['name'] = 'ort-ov' +obs['name'] = 'ort-ov-c' data.append(obs) ################################### -# onnxruntime: run_with_iobinding +# onnxruntime: run_with_iobinding from C API print('ort-bind') sess = InferenceSession(onx.SerializeToString(), providers=['CPUExecutionProvider']) diff --git a/_doc/sphinxdoc/source/conf.py b/_doc/sphinxdoc/source/conf.py index 1f1e7268..76043281 100644 --- a/_doc/sphinxdoc/source/conf.py +++ b/_doc/sphinxdoc/source/conf.py @@ -26,7 +26,7 @@ def callback_begin(): os.makedirs(dest) for img in os.listdir(source): ext = os.path.splitext(img)[-1] - if ext not in {'.png', '.jpg'}: + if ext not in {'.png', '.jpg', '.svg'}: continue shutil.copy(os.path.join(source, img), dest) @@ -54,11 +54,7 @@ def callback_begin(): } blog_root = "http://www.xavierdupre.fr/app/onnxcustom/helpsphinx/" - -html_css_files = ['my-styles.css'] - html_logo = "phdoc_static/project_ico.png" -html_sidebars = {} language = "en" onnx_doc_folder = os.path.join(os.path.dirname(__file__), 'api', 'onnxops') mathdef_link_only = True diff --git a/_doc/sphinxdoc/source/tutorials/tutorial_bench/tutorial_benchmark.rst 
b/_doc/sphinxdoc/source/tutorials/tutorial_bench/tutorial_benchmark.rst index 196442e8..cf7976fa 100644 --- a/_doc/sphinxdoc/source/tutorials/tutorial_bench/tutorial_benchmark.rst +++ b/_doc/sphinxdoc/source/tutorials/tutorial_bench/tutorial_benchmark.rst @@ -8,4 +8,5 @@ Inference ../../gyexamples/plot_benchmark_ort_api ../../gyexamples/plot_benchmark_inference_standard ../../gyexamples/plot_benchmark_inference + ../../gyexamples/plot_benchmark_eager_mode ../../gyexamples/plot_benchmark_graph_opt From 6b87d82190b14a737b36f388d07529e92d751284 Mon Sep 17 00:00:00 2001 From: xadupre Date: Sun, 27 Nov 2022 22:34:06 +0100 Subject: [PATCH 2/4] lint --- _doc/examples/plot_benchmark_eager_mode.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/_doc/examples/plot_benchmark_eager_mode.py b/_doc/examples/plot_benchmark_eager_mode.py index 94811aa5..aecb1e2c 100644 --- a/_doc/examples/plot_benchmark_eager_mode.py +++ b/_doc/examples/plot_benchmark_eager_mode.py @@ -43,13 +43,12 @@ from onnx import TensorProto from onnx.numpy_helper import from_array from onnx.helper import ( - make_model, make_node, set_model_props, make_tensor, + make_model, make_node, make_graph, make_tensor_value_info) from onnxruntime import InferenceSession, __version__ as ort_version from onnxruntime.capi._pybind_state import ( # pylint: disable=E0611 - SessionIOBinding, OrtDevice as C_OrtDevice, - OrtMemType, OrtValue as C_OrtValue, RunOptions) -from cpyquickhelper.numbers.speed_measure import measure_time + OrtDevice as C_OrtDevice, + OrtMemType, OrtValue as C_OrtValue) from mlprodict.testing.experimental_c_impl.experimental_c import code_optimisation ############################################ From 693c598489969608a053b8a0d4b860f5dd3e05cd Mon Sep 17 00:00:00 2001 From: Xavier Dupre Date: Mon, 28 Nov 2022 13:25:32 +0000 Subject: [PATCH 3/4] add gpu --- _doc/examples/data/eager_mode.csv | 99 +++++++++++++++++++ _doc/examples/plot_benchmark_eager_mode.py | 107 ++++++++++++++++++--- 2 files changed, 191 insertions(+), 15 deletions(-) create mode 100644 _doc/examples/data/eager_mode.csv diff --git a/_doc/examples/data/eager_mode.csv b/_doc/examples/data/eager_mode.csv new file mode 100644 index 00000000..e953883c --- /dev/null +++ b/_doc/examples/data/eager_mode.csv @@ -0,0 +1,99 @@ +name,N,time +numpy,1,1.6239401884377004e-06 +ort-eager,1,1.2923539616167545e-05 +ort,1,7.524730172008276e-06 +ort-ov-eager,1,1.0392629774287342e-05 +ort-ov,1,6.243779789656401e-06 +ort-ov-eager-gpu,1,6.998750963248313e-05 +ort-ov-gpu,1,3.477875958196819e-05 +numpy,2,3.044890472665429e-06 +ort-eager,2,1.3051539426669478e-05 +ort,2,8.713690331205726e-06 +ort-ov-eager,2,1.1555589735507964e-05 +ort-ov,2,7.138750515878201e-06 +ort-ov-eager-gpu,2,7.208543014712632e-05 +ort-ov-gpu,2,3.6394710186868905e-05 +numpy,5,3.831860376521945e-06 +ort-eager,5,1.4171500224620103e-05 +ort,5,8.038709638640285e-06 +ort-ov-eager,5,1.1525589507073164e-05 +ort-ov,5,7.319740252569317e-06 +ort-ov-eager-gpu,5,7.189344032667577e-05 +ort-ov-gpu,5,3.57317307498306e-05 +numpy,10,4.102849634364247e-06 +ort-eager,10,1.4772480353713036e-05 +ort,10,8.412699680775403e-06 +ort-ov-eager,10,1.1969569604843856e-05 +ort-ov,10,7.66773009672761e-06 +ort-ov-eager-gpu,10,7.502933032810688e-05 +ort-ov-gpu,10,3.6292700096964834e-05 +numpy,20,5.108820041641593e-06 +ort-eager,20,1.6318419948220253e-05 +ort,20,9.663649834692478e-06 +ort-ov-eager,20,1.3151530874893069e-05 +ort-ov,20,8.747689425945282e-06 +ort-ov-eager-gpu,20,7.586929947137832e-05 
+ort-ov-gpu,20,3.6894690711051223e-05 +numpy,50,8.849690202623605e-06 +ort-eager,50,2.1720220101997255e-05 +ort,50,1.3897509779781103e-05 +ort-ov-eager,50,1.6557410126551987e-05 +ort-ov,50,1.1749580735340715e-05 +ort-ov-eager-gpu,50,8.735888986848295e-05 +ort-ov-gpu,50,3.979059052653611e-05 +numpy,100,1.4907469740137458e-05 +ort-eager,100,2.8627979336306453e-05 +ort,100,2.0100290421396493e-05 +ort-ov-eager,100,2.2381199523806573e-05 +ort-ov,100,1.7124389996752145e-05 +ort-ov-eager-gpu,100,9.972644969820977e-05 +ort-ov-gpu,100,4.341945983469486e-05 +numpy,200,2.7075030375272035e-05 +ort-eager,200,4.3370459461584684e-05 +ort,200,3.119189059361815e-05 +ort-ov-eager,200,3.3285809913650154e-05 +ort-ov,200,2.6379060000181197e-05 +ort-ov-eager-gpu,200,0.0001272184809204191 +ort-ov-gpu,200,5.1265170332044365e-05 +numpy,500,6.265176925808191e-05 +ort-eager,500,8.758387994021178e-05 +ort,500,6.585365976206958e-05 +ort-ov-eager,500,5.8810909977182745e-05 +ort-ov,500,5.206315079703927e-05 +ort-ov-eager-gpu,500,0.0001980439608450979 +ort-ov-gpu,500,7.229942944832146e-05 +numpy,1000,0.00012052271980792284 +ort-eager,1000,0.00020487471017986537 +ort,1000,0.00010618122993037104 +ort-ov-eager,1000,7.917118025943637e-05 +ort-ov,1000,9.659656090661884e-05 +ort-ov-eager-gpu,1000,0.0003143318207003176 +ort-ov-gpu,1000,0.00010537925059907138 +numpy,2000,0.0002533179905731231 +ort-eager,2000,0.00042499787989072504 +ort,2000,0.0003522354701999575 +ort-ov-eager,2000,0.0001527195703238249 +ort-ov,2000,0.0001286304194945842 +ort-ov-eager-gpu,2000,0.0005476115201599896 +ort-ov-gpu,2000,0.00017218387103639542 +numpy,5000,0.0006107362709008157 +ort-eager,5000,0.0007773243507836014 +ort,5000,0.0005611650308128447 +ort-ov-eager,5000,9.656556067056954e-05 +ort-ov,5000,0.000121245690388605 +ort-ov-eager-gpu,5000,0.0012037401704583317 +ort-ov-gpu,5000,0.00034049289068207146 +numpy,10000,0.0011586127802729607 +ort-eager,10000,0.0016428445605561138 +ort,10000,0.0008303114597219974 +ort-ov-eager,10000,0.0003013462794478983 +ort-ov,10000,0.0004336395696736872 +ort-ov-eager-gpu,10000,0.0021700217993929983 +ort-ov-gpu,10000,0.0007736664800904691 +numpy,20000,0.0026089051889721304 +ort-eager,20000,0.003720655629877001 +ort,20000,0.002414294109912589 +ort-ov-eager,20000,0.0006073223997373134 +ort-ov,20000,0.0005382718495093287 +ort-ov-eager-gpu,20000,0.004377091269707307 +ort-ov-gpu,20000,0.0014389527996536344 diff --git a/_doc/examples/plot_benchmark_eager_mode.py b/_doc/examples/plot_benchmark_eager_mode.py index aecb1e2c..92196e76 100644 --- a/_doc/examples/plot_benchmark_eager_mode.py +++ b/_doc/examples/plot_benchmark_eager_mode.py @@ -45,7 +45,8 @@ from onnx.helper import ( make_model, make_node, make_graph, make_tensor_value_info) -from onnxruntime import InferenceSession, __version__ as ort_version +from onnxruntime import ( + get_all_providers, InferenceSession, __version__ as ort_version) from onnxruntime.capi._pybind_state import ( # pylint: disable=E0611 OrtDevice as C_OrtDevice, OrtMemType, OrtValue as C_OrtValue) @@ -87,6 +88,20 @@ sess_add2 = InferenceSession(onnx_add2.SerializeToString(), providers=["CPUExecutionProvider"]) +############################################ +# Let's consider GPU as well. 
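+# A quick look at the available providers clarifies the check below:
+# ``get_all_providers`` lists the providers compiled into this build of
+# :epkg:`onnxruntime`, not necessarily the ones usable on this machine.
+print(get_all_providers())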
+
+has_cuda = "CUDAExecutionProvider" in get_all_providers()
+if has_cuda:
+    sess_add_gpu = InferenceSession(onnx_add.SerializeToString(),
+                                    providers=["CUDAExecutionProvider"])
+    sess_add2_gpu = InferenceSession(onnx_add2.SerializeToString(),
+                                     providers=["CUDAExecutionProvider"])
+else:
+    print("CUDAExecutionProvider is not available, skipping GPU scenarios.")
+    sess_add_gpu = None
+    sess_add2_gpu = None
+
 ############################################
 # The functions to test
 # +++++++++++++++++++++
 #
@@ -129,6 +144,24 @@ def f_ort_ov(X):
     return Z
 
 
+if sess_add_gpu is not None:
+
+    def f_ort_ov_eager_gpu(X):
+        "ort-ov-eager-gpu"
+        T = sess_add_gpu._sess.run_with_ort_values({'X': X}, ['Z'], None)[0]
+        Z = sess_add_gpu._sess.run_with_ort_values({'X': T}, ['Z'], None)[0]
+        return Z
+
+
+    def f_ort_ov_gpu(X):
+        "ort-ov-gpu"
+        Z = sess_add2_gpu._sess.run_with_ort_values({'X': X}, ['Z'], None)[0]
+        return Z
+
+else:
+    f_ort_ov_eager_gpu = None
+    f_ort_ov_gpu = None
+
 X = numpy.random.rand(10, CST.shape[1]).astype(CST.dtype)
 
 device = C_OrtDevice(C_OrtDevice.cpu(), OrtMemType.DEFAULT, 0)
@@ -141,6 +174,13 @@ def f_ort_ov(X):
     f_ort_ov_eager(Xov),
     f_ort_ov(Xov),
 ]
+if sess_add_gpu is not None:
+    device_gpu = C_OrtDevice(C_OrtDevice.cuda(), OrtMemType.DEFAULT, 0)
+    Xov_gpu = C_OrtValue.ortvalue_from_numpy(X, device_gpu)
+    Ys.extend([
+        f_ort_ov_eager_gpu(Xov_gpu),
+        f_ort_ov_gpu(Xov_gpu),
+    ])
 
 for i in range(1, len(Ys)):
     try:
@@ -158,17 +198,28 @@ def f_ort_ov(X):
 
 
 def benchmark(repeat=100):
-    fcts = [f_numpy, f_ort_eager, f_ort, f_ort_ov_eager, f_ort_ov]
+    fcts = [f_numpy, f_ort_eager, f_ort, f_ort_ov_eager, f_ort_ov,
+            f_ort_ov_eager_gpu, f_ort_ov_gpu]
     data = []
     for N in tqdm([1, 2, 5, 10, 20, 50, 100, 200, 500,
                    1000, 2000, 5000, 10000, 20000]):
         X = numpy.random.rand(N, CST.shape[1]).astype(CST.dtype)
         device = C_OrtDevice(C_OrtDevice.cpu(), OrtMemType.DEFAULT, 0)
         Xov = C_OrtValue.ortvalue_from_numpy(X, device)
+        if f_ort_ov_gpu is not None:
+            device_gpu = C_OrtDevice(C_OrtDevice.cuda(), OrtMemType.DEFAULT, 0)
+            Xov_gpu = C_OrtValue.ortvalue_from_numpy(X, device_gpu)
 
         for f in fcts:
+            if f is None:
+                continue
             obs = {'name': f.__doc__, "N": N}
-            if "-ov" in f.__doc__:
+            if "-gpu" in f.__doc__:
+                begin = time.perf_counter()
+                for r in range(repeat):
+                    _ = f(Xov_gpu)
+                end = time.perf_counter() - begin
+            elif "-ov" in f.__doc__:
                 begin = time.perf_counter()
                 for r in range(repeat):
                     _ = f(Xov)
@@ -185,6 +236,7 @@ def benchmark(repeat=100):
 
 
 df = benchmark()
+df.to_csv("plot_benchmark_eager_mode.csv", index=False)
 df
 
 
@@ -192,28 +244,53 @@ def benchmark(repeat=100):
 # Graphs
 # ++++++
 
-fig, ax = plt.subplots(1, 3, figsize=(12, 4))
+def make_graph(df):
+    fig, ax = plt.subplots(2, 3, figsize=(12, 8))
+
+    piv_all = df.pivot(index="N", columns="name", values="time")
+
+    # no gpu
+    piv = piv_all[[c for c in piv_all.columns if "gpu" not in c]].copy()
+    piv.plot(ax=ax[0, 0], title="Time(s) per execution", logy=True, logx=True)
+    piv2 = piv / piv.index.values.reshape((-1, 1))
+    piv2.plot(ax=ax[0, 1], title="Time(s) per execution / N", logx=True)
+    piv3 = piv / piv["numpy"].values.reshape((-1, 1))
+    piv3.plot(ax=ax[0, 2], title="Ratio against numpy (lower is better)",
+            logy=True, logx=True)
 
-piv = df.pivot(index="N", columns="name", values="time")
-piv.plot(ax=ax[0], title="Time(s) per execution", logy=True, logx=True)
-piv2 = piv / piv.index.values.reshape((-1, 1))
-piv2.plot(ax=ax[1], title="Time(s) per execution / N", logx=True)
-piv3 = piv / piv["numpy"].values.reshape((-1, 1))
-piv3.plot(ax=ax[2], title="Ratio against numpy (lower is better)",
-          logy=True, logx=True)
+    # ort value
+    piv = piv_all[[c for c in piv_all.columns if "ov" in c or "numpy" in c]].copy()
+    piv.plot(ax=ax[1, 0], title="Time(s) per execution", logy=True, logx=True)
+    piv2 = piv / piv.index.values.reshape((-1, 1))
+    piv2.plot(ax=ax[1, 1], title="Time(s) per execution / N", logx=True)
+    piv3 = piv / piv["numpy"].values.reshape((-1, 1))
+    piv3.plot(ax=ax[1, 2], title="Ratio against numpy (lower is better)",
+            logy=True, logx=True)
+    return fig, ax
 
 
+fig, ax = make_graph(df)
+
 ###################################
 # Conclusion
 # ++++++++++
 #
 # Eager mode is slower than numpy for small arrays and faster for big ones.
 # This is probably due to the cost of the :epkg:`pybind11` bindings while
 # numpy relies on the direct python C API. This could be improved with
 # :epkg:`cython`. Eager mode must use :epkg:`OrtValue`. It is faster and it
 # reduces the difference between running two additions in a single graph
-# and two graphs of a single addition.
+# and two graphs of a single addition on CPU. On GPU, a single graph is
+# still faster and eager mode is significantly slower.
 
-print(f"onnxruntime.__version__ = {ort_version!r}")
+if not has_cuda:
+    print("No local GPU, plotting stored results measured with a GPU.")
+    df = pandas.read_csv("data/eager_mode.csv")
+    _, ax = make_graph(df)
+else:
+    ax = None
+ax
 
+print(f"onnxruntime.__version__ = {ort_version!r}")
 
-plt.show()
+fig.savefig("eager.png")
+# plt.show()

From 2780900be4a18bb36fbee4d74d909871e5def8c7 Mon Sep 17 00:00:00 2001
From: xadupre
Date: Mon, 28 Nov 2022 14:35:42 +0100
Subject: [PATCH 4/4] Update plot_benchmark_eager_mode.py

---
 _doc/examples/plot_benchmark_eager_mode.py | 26 ++++++++++++++--------
 1 file changed, 17 insertions(+), 9 deletions(-)

diff --git a/_doc/examples/plot_benchmark_eager_mode.py b/_doc/examples/plot_benchmark_eager_mode.py
index 92196e76..ab011367 100644
--- a/_doc/examples/plot_benchmark_eager_mode.py
+++ b/_doc/examples/plot_benchmark_eager_mode.py
@@ -152,7 +152,6 @@ def f_ort_ov_eager_gpu(X):
         Z = sess_add_gpu._sess.run_with_ort_values({'X': T}, ['Z'], None)[0]
         return Z
 
-
     def f_ort_ov_gpu(X):
         "ort-ov-gpu"
         Z = sess_add2_gpu._sess.run_with_ort_values({'X': X}, ['Z'], None)[0]
         return Z
 
@@ -176,11 +175,18 @@ def f_ort_ov_gpu(X):
 ]
 if sess_add_gpu is not None:
     device_gpu = C_OrtDevice(C_OrtDevice.cuda(), OrtMemType.DEFAULT, 0)
-    Xov_gpu = C_OrtValue.ortvalue_from_numpy(X, device_gpu)
-    Ys.extend([
-        f_ort_ov_eager_gpu(Xov_gpu),
-        f_ort_ov_gpu(Xov_gpu),
-    ])
+    try:
+        Xov_gpu = C_OrtValue.ortvalue_from_numpy(X, device_gpu)
+        Ys.extend([
+            f_ort_ov_eager_gpu(Xov_gpu),
+            f_ort_ov_gpu(Xov_gpu),
+        ])
+    except RuntimeError:
+        # cuda is not available at runtime
+        sess_add_gpu = None
+        sess_add2_gpu = None
+        f_ort_ov_eager_gpu = None
+        f_ort_ov_gpu = None
 
 for i in range(1, len(Ys)):
     try:
@@ -256,7 +262,7 @@ def make_graph(df):
     piv2.plot(ax=ax[0, 1], title="Time(s) per execution / N", logx=True)
     piv3 = piv / piv["numpy"].values.reshape((-1, 1))
     piv3.plot(ax=ax[0, 2], title="Ratio against numpy (lower is better)",
-            logy=True, logx=True)
+              logy=True, logx=True)
 
     # ort value
     piv = piv_all[[c for c in piv_all.columns if "ov" in c or "numpy" in c]].copy()
@@ -265,7 +271,7 @@ def make_graph(df):
     piv2.plot(ax=ax[1, 1], title="Time(s) per execution / N", logx=True)
     piv3 = piv / piv["numpy"].values.reshape((-1, 1))
     piv3.plot(ax=ax[1, 2], title="Ratio against numpy (lower is better)",
-            logy=True, logx=True)
+              logy=True, logx=True)
     return fig, ax
 
 
@@ -290,7 +296,9 @@ def make_graph(df):
 if not has_cuda:
     print("No local GPU, plotting stored results measured with a GPU.")
     df = pandas.read_csv("data/eager_mode.csv")
     _, ax = make_graph(df)
 else:
     ax = None
 ax
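+######################################
+# A note on the helper above: the local ``make_graph(df)`` shadows
+# ``onnx.helper.make_graph`` imported at the top of the file. Both ONNX
+# graphs are already built when the helper is defined, so the shadowing
+# is harmless here, but renaming the helper would be safer.
+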
+
+######################################
+# The results above were obtained with the following version of
+# :epkg:`onnxruntime`.
+
 print(f"onnxruntime.__version__ = {ort_version!r}")
 
-fig.savefig("eager.png")
 # plt.show()