From aa15268e440c60c6d15b30e521b8c19ffa05f88b Mon Sep 17 00:00:00 2001
From: xadupre <xadupre@microsoft.com>
Date: Fri, 11 Nov 2022 10:47:53 +0100
Subject: [PATCH 01/16] lint + example

---
 _doc/examples/plot_parallel_execution.py      | 62 +++++++++++++++++++
 _doc/sphinxdoc/source/conf.py                 |  1 +
 .../test_documentation_examples_lightgbm.py   |  6 +-
 3 files changed, 66 insertions(+), 3 deletions(-)
 create mode 100644 _doc/examples/plot_parallel_execution.py

diff --git a/_doc/examples/plot_parallel_execution.py b/_doc/examples/plot_parallel_execution.py
new file mode 100644
index 00000000..2a2372a0
--- /dev/null
+++ b/_doc/examples/plot_parallel_execution.py
@@ -0,0 +1,62 @@
+"""
+.. _l-plot-parallel-execution:
+
+Multithreading with onnxruntime
+===============================
+
+.. index:: thread, parallel
+
+Python implements multithreading but it is not in practice due to the GIL
+(see :epkg:`Le GIL`). However, if most of the parallelized code is not creating
+python object, this option becomes more interesting than creating several processes
+trying to exchange data through sockets. :epkg:`onnxruntime` falls into that category.
+For a big model such as a deeplearning model, this might be interesting.
+This example verifies this scenario.
+
+.. contents::
+    :local:
+
+A model
++++++++
+
+Let's retrieve a not so big model.
+"""
+import os
+import urllib.request
+import numpy
+from onnxruntime import InferenceSession
+
+
+def download_file(url, name, min_size):
+    if not os.path.exists(name):
+        print(f"download '{url}'")
+        with urllib.request.urlopen(url) as u:
+            content = u.read()
+        if len(content) < min_size:
+            raise RuntimeError(
+                f"Unable to download '{url}' due to\n{content}")
+        print(f"downloaded {len(content)} bytes.")
+        with open(name, "wb") as f:
+            f.write(content)
+    else:
+        print(f"'{name}' already downloaded")
+
+
+model_name = "squeezenet1.1-7.onnx"
+url_name = ("https://github.com/onnx/models/raw/main/vision/"
+            "classification/squeezenet/model")
+url_name += "/" + model_name
+download_file(url_name, model_name, 100000)
+
+#############################################
+# Measuring inference time
+# ++++++++++++++++++++++++
+#
+# Let's create a random image.
+
+sess = InferenceSession(model_name)
+for i in sess.get_inputs():
+    print(f"input {i}, name={i.name!r}, type={i.type}, shape={i.shape}")
+
+
+rnd_img = numpy.random.rand((1, 3, 224, 224)).astype(numpy.float32)
diff --git a/_doc/sphinxdoc/source/conf.py b/_doc/sphinxdoc/source/conf.py
index e0f978ef..1f1e7268 100644
--- a/_doc/sphinxdoc/source/conf.py
+++ b/_doc/sphinxdoc/source/conf.py
@@ -129,6 +129,7 @@ def callback_begin():
     'DLPack': 'https://github.com/dmlc/dlpack',
     'docker': 'https://en.wikipedia.org/wiki/Docker_(software)',
     'DOT': 'https://www.graphviz.org/doc/info/lang.html',
+    'Le GIL': 'http://www.xavierdupre.fr/app/teachpyx/helpsphinx/notebooks/gil_example.html',
     'ImageNet': 'http://www.image-net.org/',
     'LightGBM': 'https://lightgbm.readthedocs.io/en/latest/',
     'lightgbm': 'https://lightgbm.readthedocs.io/en/latest/',
diff --git a/_unittests/ut_documentation/test_documentation_examples_lightgbm.py b/_unittests/ut_documentation/test_documentation_examples_lightgbm.py
index 279b43f0..ebbe7445 100644
--- a/_unittests/ut_documentation/test_documentation_examples_lightgbm.py
+++ b/_unittests/ut_documentation/test_documentation_examples_lightgbm.py
@@ -2,7 +2,6 @@
 @brief      test log(time=60s)
 """
 import unittest
-from distutils.version import StrictVersion
 import os
 import sys
 import importlib
@@ -10,6 +9,7 @@
 from datetime import datetime
 import onnxruntime
 from pyquickhelper.pycode import ExtTestCase, skipif_appveyor
+from pyquickhelper.texthelper.version_helper import compare_module_version
 
 
 def import_source(module_file_path, module_name):
@@ -60,8 +60,8 @@ def test_documentation_examples_lightgbm(self):
                             name))
                     continue
             if (name == "plot_pipeline_lightgbm.py" and
-                    StrictVersion(onnxruntime.__version__) <
-                        StrictVersion('1.0.0')):
+                    compare_module_version(
+                        onnxruntime.__version__, '1.0.0') < 0):
                 continue
             if not name.startswith("plot_") or not name.endswith(".py"):
                 continue

From 2754bcb048ad09f07ae0bae5e577b4f9df742fc0 Mon Sep 17 00:00:00 2001
From: xadupre <xadupre@microsoft.com>
Date: Fri, 11 Nov 2022 13:12:41 +0100
Subject: [PATCH 02/16] complete example

---
 .circleci/config.yml                     |   2 +-
 _doc/examples/plot_parallel_execution.py | 146 ++++++++++++++++++++++-
 onnxcustom/utils/onnx_function.py        |   4 +-
 3 files changed, 144 insertions(+), 8 deletions(-)

diff --git a/.circleci/config.yml b/.circleci/config.yml
index e44ec247..80b1ecaf 100644
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -2,7 +2,7 @@ version: 2
 jobs:
   build:
     docker:
-      - image: cimg/python:3.10.5
+      - image: cimg/python:3.9.5
     
     working_directory: ~/repo
     
diff --git a/_doc/examples/plot_parallel_execution.py b/_doc/examples/plot_parallel_execution.py
index 2a2372a0..9356ff7a 100644
--- a/_doc/examples/plot_parallel_execution.py
+++ b/_doc/examples/plot_parallel_execution.py
@@ -21,9 +21,15 @@
 
 Let's retrieve a not so big model.
 """
+import multiprocessing
 import os
 import urllib.request
+import threading
+import time
+import tqdm
 import numpy
+import pandas
+from cpyquickhelper.numbers import measure_time
 from onnxruntime import InferenceSession
 
 
@@ -42,9 +48,15 @@ def download_file(url, name, min_size):
         print(f"'{name}' already downloaded")
 
 
-model_name = "squeezenet1.1-7.onnx"
-url_name = ("https://github.com/onnx/models/raw/main/vision/"
-            "classification/squeezenet/model")
+small = True
+if small:
+    model_name = "mobilenetv2-10.onnx"
+    url_name = ("https://github.com/onnx/models/raw/main/vision/"
+                "classification/mobilenet/model")
+else:
+    model_name = "resnet18-v1-7.onnx"
+    url_name = ("https://github.com/onnx/models/raw/main/vision/"
+                "classification/resnet/model")
 url_name += "/" + model_name
 download_file(url_name, model_name, 100000)
 
@@ -54,9 +66,133 @@ def download_file(url, name, min_size):
 #
 # Let's create a random image.
 
-sess = InferenceSession(model_name)
+sess = InferenceSession(model_name, providers=["CPUExecutionProvider"])
 for i in sess.get_inputs():
     print(f"input {i}, name={i.name!r}, type={i.type}, shape={i.shape}")
+    input_name = i.name
+    input_shape = list(i.shape)
+    if input_shape[0] in [None, "batch_size"]:
+        input_shape[0] = 1
 
 
-rnd_img = numpy.random.rand((1, 3, 224, 224)).astype(numpy.float32)
+rnd_img = numpy.random.rand(*input_shape).astype(numpy.float32)
+
+res = sess.run(None, {input_name: rnd_img})
+print(f"output: type={res[0].dtype}, shape={res[0].shape}")
+
+print(measure_time(lambda: sess.run(None, {input_name: rnd_img}),
+                   div_by_number=True, repeat=10, number=10))
+
+#############################################
+# Parallelization
+# +++++++++++++++
+#
+# We define the number of threads as the number of cores divided by 2.
+# This is a dummy value. It should let a core to handle the main program.
+
+n_threads = multiprocessing.cpu_count() - 1
+print(f"n_threads={n_threads}")
+
+
+imgs = [numpy.random.rand(*input_shape).astype(numpy.float32)
+        for i in range(n_threads)]
+
+sesss = [InferenceSession(model_name, providers=["CPUExecutionProvider"])
+         for i in range(n_threads)]
+
+################################
+# First: sequence
+
+def sequence(N=1):
+    res = []
+    for sess, img in zip(sesss, imgs):
+        for i in range(N):
+            res.append(sess.run(None, {input_name: img})[0])
+    return res
+
+print(measure_time(sequence, div_by_number=True, repeat=2, number=2))
+
+#################################
+# Second: multitheading
+
+class MyThread(threading.Thread):
+
+    def __init__(self, sess, imgs):
+        threading.Thread.__init__(self)
+        self.sess = sess
+        self.imgs = imgs
+        self.q = []
+
+    def run(self):
+        for img in self.imgs:
+            r = sess.run(None, {input_name: img})[0]
+            self.q.append(r)
+
+
+def parallel(N=1):
+    threads = [MyThread(sess, [img] * N)
+               for sess, img in zip(sesss, imgs)]
+    for t in threads:
+        t.start()
+    res = []
+    for t in threads:
+        t.join()
+        res.extend(t.q)
+    return res
+
+print(measure_time(parallel, div_by_number=True, repeat=2, number=2))
+
+###################################
+# It is worse for one image. Let's increase the number of images to parallelize.
+
+r_seq = sequence(4)
+if len(r_seq) != n_threads * 4:
+    raise ValueError(
+        f"Unexpected number of results {len(r_seq)} != {n_threads * 4}.")
+r_par = parallel(4) 
+if len(r_par) != n_threads * 4:
+    raise ValueError(
+        f"Unexpected number of results {len(r_par)} != {n_threads * 4}.")
+
+print(measure_time(lambda: sequence(4), div_by_number=True, repeat=2, number=2))
+print(measure_time(lambda: parallel(4), div_by_number=True, repeat=2, number=2))
+
+####################################
+# Let's increase again.
+
+data = []
+for N in tqdm.tqdm(range(1, 21)):
+    begin = time.perf_counter()
+    res1 = sequence(N)
+    end = time.perf_counter() - begin
+    obs = dict(N=N, n_imgs_seq=len(res1), time_seq=end)
+
+    begin = time.perf_counter()
+    res2 = parallel(N)
+    end = time.perf_counter() - begin
+    obs.update(dict(n_imgs_par=len(res2), time_par=end))
+
+    data.append(obs)
+
+df = pandas.DataFrame(data)
+df
+
+##########################################
+# Plotting
+# ++++++++
+
+df["time_seq_img"] = df["time_seq"] / df["n_imgs_seq"]
+df["time_par_img"] = df["time_par"] / df["n_imgs_par"]
+
+ax = df[["n_imgs_seq", "time_seq_img", "time_par_img"]].set_index("n_imgs_seq").plot(
+    title="Time per image / batch size")
+ax.set_xlabel("batch size")
+ax.set_ylabel("s")
+
+#######################################
+# It improves but not as much as expected. The number of interactions
+# with python is still too high.
+
+# import matplotlib.pyplot as plt
+# plt.show()
+
diff --git a/onnxcustom/utils/onnx_function.py b/onnxcustom/utils/onnx_function.py
index 825b39be..5ae13fcd 100644
--- a/onnxcustom/utils/onnx_function.py
+++ b/onnxcustom/utils/onnx_function.py
@@ -760,7 +760,7 @@ def _onnx_grad_sigmoid_neg_log_loss_error(target_opset=None,
 
         print("DOT-SECTION", oinf.to_dot())
     """
-    from onnx.mapping import NP_TYPE_TO_TENSOR_TYPE
+    from onnx.helper import np_dtype_to_tensor_dtype
     from skl2onnx.algebra.onnx_ops import (
         OnnxSub, OnnxMul, OnnxSigmoid, OnnxLog, OnnxNeg,
         OnnxReduceSum, OnnxReshape, OnnxAdd, OnnxCast, OnnxClip)
@@ -771,7 +771,7 @@ def _onnx_grad_sigmoid_neg_log_loss_error(target_opset=None,
                   op_version=target_opset)
     p0 = OnnxSub(numpy.array([1], dtype=dtype), p1,
                  op_version=target_opset)
-    y1 = OnnxCast('X1', to=NP_TYPE_TO_TENSOR_TYPE[numpy.dtype(dtype)],
+    y1 = OnnxCast('X1', to=np_dtype_to_tensor_dtype(numpy.dtype(dtype)),
                   op_version=target_opset)
     y0 = OnnxSub(numpy.array([1], dtype=dtype), y1,
                  op_version=target_opset)

From 5ad7d877461ea165d79dd984ca39899bf0a128bd Mon Sep 17 00:00:00 2001
From: xadupre <xadupre@microsoft.com>
Date: Fri, 11 Nov 2022 13:13:05 +0100
Subject: [PATCH 03/16] Update plot_parallel_execution.py

---
 _doc/examples/plot_parallel_execution.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/_doc/examples/plot_parallel_execution.py b/_doc/examples/plot_parallel_execution.py
index 9356ff7a..dafdba96 100644
--- a/_doc/examples/plot_parallel_execution.py
+++ b/_doc/examples/plot_parallel_execution.py
@@ -191,7 +191,8 @@ def parallel(N=1):
 
 #######################################
 # It improves but not as much as expected. The number of interactions
-# with python is still too high.
+# with python is still too high. The bigger the model is, the better it
+# should be.
 
 # import matplotlib.pyplot as plt
 # plt.show()

From bb51ea32717fab340dd33f42bdfb77b8fc4e395f Mon Sep 17 00:00:00 2001
From: xadupre <xadupre@microsoft.com>
Date: Fri, 11 Nov 2022 13:23:23 +0100
Subject: [PATCH 04/16] import error

---
 onnxcustom/utils/onnx_function.py | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/onnxcustom/utils/onnx_function.py b/onnxcustom/utils/onnx_function.py
index 5ae13fcd..93137f11 100644
--- a/onnxcustom/utils/onnx_function.py
+++ b/onnxcustom/utils/onnx_function.py
@@ -760,7 +760,14 @@ def _onnx_grad_sigmoid_neg_log_loss_error(target_opset=None,
 
         print("DOT-SECTION", oinf.to_dot())
     """
-    from onnx.helper import np_dtype_to_tensor_dtype
+    try:
+        from onnx.helper import np_dtype_to_tensor_dtype
+    except ImportError:
+        from onnx.mapping import NP_TYPE_TO_TENSOR_TYPE
+        
+        def np_dtype_to_tensor_dtype(dtype):
+            return NP_TYPE_TO_TENSOR_TYPE[dtype]
+        
     from skl2onnx.algebra.onnx_ops import (
         OnnxSub, OnnxMul, OnnxSigmoid, OnnxLog, OnnxNeg,
         OnnxReduceSum, OnnxReshape, OnnxAdd, OnnxCast, OnnxClip)

From 53d9c15b04bb674c8a82d079920bc35abd609c9e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?xavier=20dupr=C3=A9?= <xavier.dupre@gmail.com>
Date: Fri, 11 Nov 2022 14:11:58 +0100
Subject: [PATCH 05/16] lint

---
 onnxcustom/utils/onnx_function.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/onnxcustom/utils/onnx_function.py b/onnxcustom/utils/onnx_function.py
index 93137f11..ffa4fd52 100644
--- a/onnxcustom/utils/onnx_function.py
+++ b/onnxcustom/utils/onnx_function.py
@@ -764,10 +764,10 @@ def _onnx_grad_sigmoid_neg_log_loss_error(target_opset=None,
         from onnx.helper import np_dtype_to_tensor_dtype
     except ImportError:
         from onnx.mapping import NP_TYPE_TO_TENSOR_TYPE
-        
+
         def np_dtype_to_tensor_dtype(dtype):
             return NP_TYPE_TO_TENSOR_TYPE[dtype]
-        
+
     from skl2onnx.algebra.onnx_ops import (
         OnnxSub, OnnxMul, OnnxSigmoid, OnnxLog, OnnxNeg,
         OnnxReduceSum, OnnxReshape, OnnxAdd, OnnxCast, OnnxClip)

From a5fe43b83e479a97e90a430999c685ff5b2b941d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?xavier=20dupr=C3=A9?= <xavier.dupre@gmail.com>
Date: Fri, 11 Nov 2022 14:40:24 +0100
Subject: [PATCH 06/16] lint

---
 _doc/examples/plot_parallel_execution.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/_doc/examples/plot_parallel_execution.py b/_doc/examples/plot_parallel_execution.py
index dafdba96..95025b23 100644
--- a/_doc/examples/plot_parallel_execution.py
+++ b/_doc/examples/plot_parallel_execution.py
@@ -103,6 +103,7 @@ def download_file(url, name, min_size):
 ################################
 # First: sequence
 
+
 def sequence(N=1):
     res = []
     for sess, img in zip(sesss, imgs):
@@ -110,11 +111,13 @@ def sequence(N=1):
             res.append(sess.run(None, {input_name: img})[0])
     return res
 
+
 print(measure_time(sequence, div_by_number=True, repeat=2, number=2))
 
 #################################
 # Second: multitheading
 
+
 class MyThread(threading.Thread):
 
     def __init__(self, sess, imgs):
@@ -140,6 +143,7 @@ def parallel(N=1):
         res.extend(t.q)
     return res
 
+
 print(measure_time(parallel, div_by_number=True, repeat=2, number=2))
 
 ###################################
@@ -149,7 +153,7 @@ def parallel(N=1):
 if len(r_seq) != n_threads * 4:
     raise ValueError(
         f"Unexpected number of results {len(r_seq)} != {n_threads * 4}.")
-r_par = parallel(4) 
+r_par = parallel(4)
 if len(r_par) != n_threads * 4:
     raise ValueError(
         f"Unexpected number of results {len(r_par)} != {n_threads * 4}.")
@@ -196,4 +200,3 @@ def parallel(N=1):
 
 # import matplotlib.pyplot as plt
 # plt.show()
-

From 0f5c76328241fe655bde4e12d09dbece678ac5b4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?xavier=20dupr=C3=A9?= <xavier.dupre@gmail.com>
Date: Fri, 11 Nov 2022 19:42:26 +0100
Subject: [PATCH 07/16] Update plot_parallel_execution.py

---
 _doc/examples/plot_parallel_execution.py | 112 ++++++++++++++++++++---
 1 file changed, 98 insertions(+), 14 deletions(-)

diff --git a/_doc/examples/plot_parallel_execution.py b/_doc/examples/plot_parallel_execution.py
index 95025b23..aca387c5 100644
--- a/_doc/examples/plot_parallel_execution.py
+++ b/_doc/examples/plot_parallel_execution.py
@@ -31,6 +31,8 @@
 import pandas
 from cpyquickhelper.numbers import measure_time
 from onnxruntime import InferenceSession
+from onnxruntime.capi._pybind_state import (  # pylint: disable=E0611
+    SessionIOBinding, OrtDevice as C_OrtDevice)
 
 
 def download_file(url, name, min_size):
@@ -66,21 +68,24 @@ def download_file(url, name, min_size):
 #
 # Let's create a random image.
 
-sess = InferenceSession(model_name, providers=["CPUExecutionProvider"])
-for i in sess.get_inputs():
+sess1 = InferenceSession(model_name, providers=["CPUExecutionProvider"])
+for i in sess1.get_inputs():
     print(f"input {i}, name={i.name!r}, type={i.type}, shape={i.shape}")
     input_name = i.name
     input_shape = list(i.shape)
-    if input_shape[0] in [None, "batch_size"]:
+    if input_shape[0] in [None, "batch_size", "N"]:
         input_shape[0] = 1
+for i in sess1.get_outputs():
+    print(f"output {i}, name={i.name!r}, type={i.type}, shape={i.shape}")
+    output_name = i.name
 
 
 rnd_img = numpy.random.rand(*input_shape).astype(numpy.float32)
 
-res = sess.run(None, {input_name: rnd_img})
+res = sess1.run(None, {input_name: rnd_img})
 print(f"output: type={res[0].dtype}, shape={res[0].shape}")
 
-print(measure_time(lambda: sess.run(None, {input_name: rnd_img}),
+print(measure_time(lambda: sess1.run(None, {input_name: rnd_img}),
                    div_by_number=True, repeat=10, number=10))
 
 #############################################
@@ -128,7 +133,7 @@ def __init__(self, sess, imgs):
 
     def run(self):
         for img in self.imgs:
-            r = sess.run(None, {input_name: img})[0]
+            r = self.sess.run(None, {input_name: img})[0]
             self.q.append(r)
 
 
@@ -165,15 +170,19 @@ def parallel(N=1):
 # Let's increase again.
 
 data = []
-for N in tqdm.tqdm(range(1, 21)):
+rep = 2
+maxN = 18
+for N in tqdm.tqdm(range(1, maxN, 2)):
     begin = time.perf_counter()
-    res1 = sequence(N)
-    end = time.perf_counter() - begin
+    for i in range(rep):
+        res1 = sequence(N)
+    end = (time.perf_counter() - begin) / rep
     obs = dict(N=N, n_imgs_seq=len(res1), time_seq=end)
 
     begin = time.perf_counter()
-    res2 = parallel(N)
-    end = time.perf_counter() - begin
+    for i in range(rep):
+        res2 = parallel(N)
+    end = (time.perf_counter() - begin) / rep
     obs.update(dict(n_imgs_par=len(res2), time_par=end))
 
     data.append(obs)
@@ -194,9 +203,84 @@ def parallel(N=1):
 ax.set_ylabel("s")
 
 #######################################
-# It improves but not as much as expected. The number of interactions
+# It does not really improve. The number of interactions
 # with python is still too high. The bigger the model is, the better it
 # should be.
 
-# import matplotlib.pyplot as plt
-# plt.show()
+###################################################
+# With another API
+# ++++++++++++++++
+
+
+class MyThreadBind(threading.Thread):
+
+    def __init__(self, sess, imgs):
+        threading.Thread.__init__(self)
+        self.sess = sess
+        self.imgs = imgs
+        self.q = []
+        self.bind = SessionIOBinding(self.sess._sess)
+        self.ort_device = C_OrtDevice(
+            C_OrtDevice.cpu(), C_OrtDevice.default_memory(), 0)
+
+    def run(self):
+        bind = self.bind
+        ort_device = self.ort_device
+        bind.bind_output(output_name, ort_device)
+        sess = self.sess._sess
+        q = self.q
+        for img in self.imgs:
+            bind.bind_input(input_name, ort_device,
+                            img.dtype, img.shape,
+                            img.__array_interface__['data'][0])
+            sess.run_with_iobinding(bind, None)
+            ortvalues = bind.get_outputs()
+            q.append(ortvalues)
+
+
+def parallel_bind(N=1):
+    threads = [MyThreadBind(sess, [img] * N)
+               for sess, img in zip(sesss, imgs)]
+    for t in threads:
+        t.start()
+    res = []
+    for t in threads:
+        t.join()
+        res.extend(t.q)
+    return res
+
+
+data = []
+for N in tqdm.tqdm(range(1, maxN, 2)):
+    begin = time.perf_counter()
+    for i in range(rep):
+        res1 = sequence(N)
+    end = (time.perf_counter() - begin) / rep
+    obs = dict(N=N, n_imgs_seq=len(res1), time_seq=end)
+
+    begin = time.perf_counter()
+    for i in range(rep):
+        res2 = parallel_bind(N)
+    end = (time.perf_counter() - begin) / rep
+    obs.update(dict(n_imgs_par=len(res2), time_par=end))
+
+    data.append(obs)
+
+df = pandas.DataFrame(data)
+df
+
+############################
+# Plots
+# +++++
+
+
+df["time_seq_img"] = df["time_seq"] / df["n_imgs_seq"]
+df["time_par_img"] = df["time_par"] / df["n_imgs_par"]
+
+ax = df[["n_imgs_seq", "time_seq_img", "time_par_img"]].set_index("n_imgs_seq").plot(
+    title="Time per image / batch size\nrun_with_iobinding")
+ax.set_xlabel("batch size")
+ax.set_ylabel("s")
+
+import matplotlib.pyplot as plt
+plt.show()

From 3f1001b4da191ca42b9f9732050a55815bf10faa Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?xavier=20dupr=C3=A9?= <xavier.dupre@gmail.com>
Date: Fri, 11 Nov 2022 19:42:50 +0100
Subject: [PATCH 08/16] Update plot_parallel_execution.py

---
 _doc/examples/plot_parallel_execution.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/_doc/examples/plot_parallel_execution.py b/_doc/examples/plot_parallel_execution.py
index aca387c5..a85d1249 100644
--- a/_doc/examples/plot_parallel_execution.py
+++ b/_doc/examples/plot_parallel_execution.py
@@ -232,7 +232,7 @@ def run(self):
         for img in self.imgs:
             bind.bind_input(input_name, ort_device,
                             img.dtype, img.shape,
-                            img.__array_interface__['data'][0])
+                            img.__array_interface__['data'][0])            
             sess.run_with_iobinding(bind, None)
             ortvalues = bind.get_outputs()
             q.append(ortvalues)
@@ -282,5 +282,5 @@ def parallel_bind(N=1):
 ax.set_xlabel("batch size")
 ax.set_ylabel("s")
 
-import matplotlib.pyplot as plt
-plt.show()
+# import matplotlib.pyplot as plt
+# plt.show()

From c1ba594c61d756a38fa2cd30248be9e9e4573599 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?xavier=20dupr=C3=A9?= <xavier.dupre@gmail.com>
Date: Fri, 11 Nov 2022 19:59:06 +0100
Subject: [PATCH 09/16] Update plot_parallel_execution.py

---
 _doc/examples/plot_parallel_execution.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/_doc/examples/plot_parallel_execution.py b/_doc/examples/plot_parallel_execution.py
index a85d1249..688b92c4 100644
--- a/_doc/examples/plot_parallel_execution.py
+++ b/_doc/examples/plot_parallel_execution.py
@@ -232,7 +232,7 @@ def run(self):
         for img in self.imgs:
             bind.bind_input(input_name, ort_device,
                             img.dtype, img.shape,
-                            img.__array_interface__['data'][0])            
+                            img.__array_interface__['data'][0])
             sess.run_with_iobinding(bind, None)
             ortvalues = bind.get_outputs()
             q.append(ortvalues)

From c98f7942e27c0043eee28cca2162775111583fc6 Mon Sep 17 00:00:00 2001
From: xadupre <xadupre@microsoft.com>
Date: Sat, 12 Nov 2022 12:41:29 +0100
Subject: [PATCH 10/16] documentation and examples

---
 _doc/examples/plot_parallel_execution.py      | 223 +++++++++++++-----
 .../source/api/onnxruntime_python/index.rst   |   2 +-
 .../tutorial_onnxruntime/ortvalue_doc.rst     |   2 +
 3 files changed, 169 insertions(+), 58 deletions(-)

diff --git a/_doc/examples/plot_parallel_execution.py b/_doc/examples/plot_parallel_execution.py
index 688b92c4..fd07ec6a 100644
--- a/_doc/examples/plot_parallel_execution.py
+++ b/_doc/examples/plot_parallel_execution.py
@@ -1,6 +1,7 @@
 """
 .. _l-plot-parallel-execution:
 
+===============================
 Multithreading with onnxruntime
 ===============================
 
@@ -11,13 +12,16 @@
 python object, this option becomes more interesting than creating several processes
 trying to exchange data through sockets. :epkg:`onnxruntime` falls into that category.
 For a big model such as a deeplearning model, this might be interesting.
-This example verifies this scenario.
+However, :epkg:`onnxruntime` already parallelize the computation of
+every operator (Gemm, MatMul) using all the CPU it can get so this approach
+should show significant result when used on different processors (CPU, GPU)
+in parallel.
 
 .. contents::
     :local:
 
 A model
-+++++++
+=======
 
 Let's retrieve a not so big model.
 """
@@ -30,7 +34,8 @@
 import numpy
 import pandas
 from cpyquickhelper.numbers import measure_time
-from onnxruntime import InferenceSession
+import torch.cuda
+from onnxruntime import InferenceSession, get_all_providers
 from onnxruntime.capi._pybind_state import (  # pylint: disable=E0611
     SessionIOBinding, OrtDevice as C_OrtDevice)
 
@@ -63,8 +68,11 @@ def download_file(url, name, min_size):
 download_file(url_name, model_name, 100000)
 
 #############################################
-# Measuring inference time
-# ++++++++++++++++++++++++
+# Measuring inference time when parallelizing on CPU
+# ==================================================
+#
+# Sequence
+# ++++++++
 #
 # Let's create a random image.
 
@@ -106,10 +114,10 @@ def download_file(url, name, min_size):
          for i in range(n_threads)]
 
 ################################
-# First: sequence
+# Let's measure the time for a sequence of images.
 
 
-def sequence(N=1):
+def sequence(sesss, imgs, N=1):
     res = []
     for sess, img in zip(sesss, imgs):
         for i in range(N):
@@ -117,10 +125,11 @@ def sequence(N=1):
     return res
 
 
-print(measure_time(sequence, div_by_number=True, repeat=2, number=2))
+print(measure_time(lambda: sequence(sesss, imgs),
+                   div_by_number=True, repeat=2, number=2))
 
 #################################
-# Second: multitheading
+# And then with multithreading.
 
 
 class MyThread(threading.Thread):
@@ -137,7 +146,7 @@ def run(self):
             self.q.append(r)
 
 
-def parallel(N=1):
+def parallel(sesss, imgs, N=1):
     threads = [MyThread(sess, [img] * N)
                for sess, img in zip(sesss, imgs)]
     for t in threads:
@@ -149,25 +158,12 @@ def parallel(N=1):
     return res
 
 
-print(measure_time(parallel, div_by_number=True, repeat=2, number=2))
+print(measure_time(lambda: parallel(sesss, imgs),
+                   div_by_number=True, repeat=2, number=2))
 
 ###################################
-# It is worse for one image. Let's increase the number of images to parallelize.
-
-r_seq = sequence(4)
-if len(r_seq) != n_threads * 4:
-    raise ValueError(
-        f"Unexpected number of results {len(r_seq)} != {n_threads * 4}.")
-r_par = parallel(4)
-if len(r_par) != n_threads * 4:
-    raise ValueError(
-        f"Unexpected number of results {len(r_par)} != {n_threads * 4}.")
-
-print(measure_time(lambda: sequence(4), div_by_number=True, repeat=2, number=2))
-print(measure_time(lambda: parallel(4), div_by_number=True, repeat=2, number=2))
-
-####################################
-# Let's increase again.
+# It is worse for one image. It is expected as mentioned in the introduction.
+# Let's check for different number of images to parallelize.
 
 data = []
 rep = 2
@@ -175,13 +171,13 @@ def parallel(N=1):
 for N in tqdm.tqdm(range(1, maxN, 2)):
     begin = time.perf_counter()
     for i in range(rep):
-        res1 = sequence(N)
+        res1 = sequence(sesss, imgs, N)
     end = (time.perf_counter() - begin) / rep
     obs = dict(N=N, n_imgs_seq=len(res1), time_seq=end)
 
     begin = time.perf_counter()
     for i in range(rep):
-        res2 = parallel(N)
+        res2 = parallel(sesss, imgs, N)
     end = (time.perf_counter() - begin) / rep
     obs.update(dict(n_imgs_par=len(res2), time_par=end))
 
@@ -191,37 +187,44 @@ def parallel(N=1):
 df
 
 ##########################################
-# Plotting
-# ++++++++
+# Plots
+# +++++
 
-df["time_seq_img"] = df["time_seq"] / df["n_imgs_seq"]
-df["time_par_img"] = df["time_par"] / df["n_imgs_par"]
 
-ax = df[["n_imgs_seq", "time_seq_img", "time_par_img"]].set_index("n_imgs_seq").plot(
-    title="Time per image / batch size")
-ax.set_xlabel("batch size")
-ax.set_ylabel("s")
+def make_plot(df, title):
+    df["time_seq_img"] = df["time_seq"] / df["n_imgs_seq"]
+    df["time_par_img"] = df["time_par"] / df["n_imgs_par"]
+
+    ax = df[["n_imgs_seq", "time_seq_img", "time_par_img"]].set_index("n_imgs_seq").plot(
+        title=title)
+    ax.set_xlabel("batch size")
+    ax.set_ylabel("s")
+    return ax
+
+
+make_plot(df, "Time per image / batch size")
 
 #######################################
-# It does not really improve. The number of interactions
-# with python is still too high. The bigger the model is, the better it
-# should be.
+# As expected, it does not really improve. It is like parallezing using
+# both strategies, per kernel and per image, both trying to access all
+# the process cores.
 
 ###################################################
-# With another API
-# ++++++++++++++++
+# Same with another API based on OrtValue
+# +++++++++++++++++++++++++++++++++++++++
+#
+# See :epkg:`l-ortvalue-doc`.
 
 
 class MyThreadBind(threading.Thread):
 
-    def __init__(self, sess, imgs):
+    def __init__(self, sess, imgs, ort_device):
         threading.Thread.__init__(self)
         self.sess = sess
         self.imgs = imgs
         self.q = []
         self.bind = SessionIOBinding(self.sess._sess)
-        self.ort_device = C_OrtDevice(
-            C_OrtDevice.cpu(), C_OrtDevice.default_memory(), 0)
+        self.ort_device = ort_device
 
     def run(self):
         bind = self.bind
@@ -238,8 +241,10 @@ def run(self):
             q.append(ortvalues)
 
 
-def parallel_bind(N=1):
-    threads = [MyThreadBind(sess, [img] * N)
+def parallel_bind(sesss, imgs, N=1):
+    ort_device = C_OrtDevice(
+        C_OrtDevice.cpu(), C_OrtDevice.default_memory(), 0)
+    threads = [MyThreadBind(sess, [img] * N, ort_device)
                for sess, img in zip(sesss, imgs)]
     for t in threads:
         t.start()
@@ -254,13 +259,13 @@ def parallel_bind(N=1):
 for N in tqdm.tqdm(range(1, maxN, 2)):
     begin = time.perf_counter()
     for i in range(rep):
-        res1 = sequence(N)
+        res1 = sequence(sesss, imgs, N)
     end = (time.perf_counter() - begin) / rep
     obs = dict(N=N, n_imgs_seq=len(res1), time_seq=end)
 
     begin = time.perf_counter()
     for i in range(rep):
-        res2 = parallel_bind(N)
+        res2 = parallel_bind(sesss, imgs, N)
     end = (time.perf_counter() - begin) / rep
     obs.update(dict(n_imgs_par=len(res2), time_par=end))
 
@@ -270,17 +275,121 @@ def parallel_bind(N=1):
 df
 
 ############################
-# Plots
-# +++++
+# Plots.
+
+make_plot(df, "Time per image / batch size\nrun_with_iobinding")
+
+########################################
+# GPU
+# ===
+#
+# Let's check first it is possible.
+
+has_cuda = "CUDAExecutionProvider" in get_all_providers()
+if not has_cuda:
+    print(f"No CUDA provider was detected in {get_all_providers()}.")
+
+#########################################
+# Parallelization GPU + CPU
+# +++++++++++++++++++++++++
+
+if has_cuda:
+    n_threads = 2
+    sesss = [InferenceSession(model_name, providers=["CPUExecutionProvider"]),
+             InferenceSession(model_name, providers=["CUDAExecutionProvider",
+                                                     "CPUExecutionProvider"])]
+    imgs = [numpy.random.rand(*input_shape).astype(numpy.float32)
+            for i in range(n_threads)]
+
+    data = []
+    for N in tqdm.tqdm(range(1, maxN, 2)):
+        begin = time.perf_counter()
+        for i in range(rep):
+            res1 = sequence(sesss, imgs, N)
+        end = (time.perf_counter() - begin) / rep
+        obs = dict(N=N, n_imgs_seq=len(res1), time_seq=end)
+
+        begin = time.perf_counter()
+        for i in range(rep):
+            res2 = parallel_bind(sesss, imgs, N)
+        end = (time.perf_counter() - begin) / rep
+        obs.update(dict(n_imgs_par=len(res2), time_par=end))
+
+        data.append(obs)
+
+    df = pandas.DataFrame(data)
+    df.to_csv("ort_cpu_gpu.csv", index=False)
+else:
+    df = None
+
+df
+
+####################################
+# Plots.
+
+if has_cuda:
+    ax = make_plot(df, "Time per image / batch size\nCPU + GPU")
+else:
+    ax = None
+
+ax
+
+#########################################
+# Parallelization on multiple GPUs
+# ++++++++++++++++++++++++++++++++
 
+n_gpus = torch.cuda.device_count() if has_cuda else 0
+if n_gpus == 0:
+    print("No GPU or one GPU was detected.")
+elif n_gpus == 1:
+    print("1 GPU was detected.")
+else:
+    print(f"{n_gpus} were detected.")
+
+
+if n_gpus > 1:
+    n_threads = 2
+    sesss = [InferenceSession(model_name, providers=["CUDAExecutionProvider",
+                                                     "CPUExecutionProvider"],
+                              provider_options={"device_id": i})
+             for i in range(n_gpus)]
+    imgs = [numpy.random.rand(*input_shape).astype(numpy.float32)
+            for i in range(n_threads)]
+
+    data = []
+    for N in tqdm.tqdm(range(1, maxN, 2)):
+        begin = time.perf_counter()
+        for i in range(rep):
+            res1 = sequence(sesss, imgs, N)
+        end = (time.perf_counter() - begin) / rep
+        obs = dict(N=N, n_imgs_seq=len(res1), time_seq=end)
+
+        begin = time.perf_counter()
+        for i in range(rep):
+            res2 = parallel_bind(sesss, imgs, N)
+        end = (time.perf_counter() - begin) / rep
+        obs.update(dict(n_imgs_par=len(res2), time_par=end))
+
+        data.append(obs)
+
+    df = pandas.DataFrame(data)
+    df.to_csv("ort_gpus.csv", index=False)
+else:
+    df = None
+
+df
+
+
+####################################
+# Plots.
+
+if n_gpus > 1:
+    ax = make_plot(df, "Time per image / batch size\nCPU + GPU")
+else:
+    ax = None
 
-df["time_seq_img"] = df["time_seq"] / df["n_imgs_seq"]
-df["time_par_img"] = df["time_par"] / df["n_imgs_par"]
+ax
 
-ax = df[["n_imgs_seq", "time_seq_img", "time_par_img"]].set_index("n_imgs_seq").plot(
-    title="Time per image / batch size\nrun_with_iobinding")
-ax.set_xlabel("batch size")
-ax.set_ylabel("s")
 
 # import matplotlib.pyplot as plt
 # plt.show()
diff --git a/_doc/sphinxdoc/source/api/onnxruntime_python/index.rst b/_doc/sphinxdoc/source/api/onnxruntime_python/index.rst
index 45e6dcbf..e8ade2c9 100644
--- a/_doc/sphinxdoc/source/api/onnxruntime_python/index.rst
+++ b/_doc/sphinxdoc/source/api/onnxruntime_python/index.rst
@@ -4,7 +4,7 @@ Summary of onnxruntime and onnxruntime-training API
 
 Module :epkg:`onnxcustom` leverages :epkg:`onnxruntime-training` to train models.
 Next sections exposes frequent functions uses to run inference
-and training with :epkg:`onnxruntime` and :epkg:`onnxruntume-training`.
+and training with :epkg:`onnxruntime` and :epkg:`onnxruntime-training`.
 
 Most of the code in :epkg:`onnxruntime` is written in C++ and exposed
 in Python using :epkg:`pybind11`. For inference, the main class
diff --git a/_doc/sphinxdoc/source/tutorials/tutorial_onnxruntime/ortvalue_doc.rst b/_doc/sphinxdoc/source/tutorials/tutorial_onnxruntime/ortvalue_doc.rst
index 8cca3286..65a1a6db 100644
--- a/_doc/sphinxdoc/source/tutorials/tutorial_onnxruntime/ortvalue_doc.rst
+++ b/_doc/sphinxdoc/source/tutorials/tutorial_onnxruntime/ortvalue_doc.rst
@@ -1,4 +1,6 @@
 
+.. _l-ortvalue-doc:
+
 ========
 OrtValue
 ========

From d4346349f3ae2e50e960bd3916588404b0c20aff Mon Sep 17 00:00:00 2001
From: Xavier Dupre <xadupre@microsoft.com>
Date: Sun, 13 Nov 2022 08:50:08 +0000
Subject: [PATCH 11/16] fix example

Signed-off-by: Xavier Dupre <xadupre@microsoft.com>
---
 _doc/examples/data/ort_cpu_gpu.csv       | 11 +++++
 _doc/examples/data/ort_gpus.csv          | 11 +++++
 _doc/examples/plot_parallel_execution.py | 51 +++++++++++++++++++-----
 3 files changed, 64 insertions(+), 9 deletions(-)
 create mode 100644 _doc/examples/data/ort_cpu_gpu.csv
 create mode 100644 _doc/examples/data/ort_gpus.csv

diff --git a/_doc/examples/data/ort_cpu_gpu.csv b/_doc/examples/data/ort_cpu_gpu.csv
new file mode 100644
index 00000000..826d8139
--- /dev/null
+++ b/_doc/examples/data/ort_cpu_gpu.csv
@@ -0,0 +1,11 @@
+N,n_imgs_seq,time_seq,n_imgs_par,time_par
+1,2,0.5718123729893705,2,0.007878101489041
+3,6,0.016015594999771565,6,0.015513360500335693
+5,10,0.024301433499203995,10,0.023472609493182972
+7,14,0.03465705699636601,14,0.03860543250630144
+9,18,0.04403566100518219,18,0.04680142400320619
+11,22,0.06553528150834609,22,0.05793282100057695
+13,26,0.08153052550915163,26,0.05927092849742621
+15,30,0.0768452735064784,30,0.08342061600706074
+17,34,0.09201569500146434,34,0.08902498900715727
+19,38,0.11011747350858059,38,0.09492045298975427
diff --git a/_doc/examples/data/ort_gpus.csv b/_doc/examples/data/ort_gpus.csv
new file mode 100644
index 00000000..30f083c4
--- /dev/null
+++ b/_doc/examples/data/ort_gpus.csv
@@ -0,0 +1,11 @@
+N,n_imgs_seq,time_seq,n_imgs_par,time_par
+1,2,0.7859347729972797,2,0.0027880600100615993
+3,6,0.01325492349860724,6,0.005836712007294409
+5,10,0.02211888850433752,10,0.008998960503959097
+7,14,0.027732157497666776,14,0.012500747499871068
+9,18,0.03999921299691778,18,0.015629247500328347
+11,22,0.048589186000754125,22,0.019298979008453898
+13,26,0.05781353948987089,26,0.022477426507975906
+15,30,0.0516846864920808,30,0.02620260650292039
+17,34,0.076614134493866,34,0.02977054200891871
+19,38,0.08429743749729823,38,0.03443809199961834
diff --git a/_doc/examples/plot_parallel_execution.py b/_doc/examples/plot_parallel_execution.py
index fd07ec6a..f0bc2ef0 100644
--- a/_doc/examples/plot_parallel_execution.py
+++ b/_doc/examples/plot_parallel_execution.py
@@ -25,6 +25,7 @@
 
 Let's retrieve a not so big model.
 """
+import gc
 import multiprocessing
 import os
 import urllib.request
@@ -103,7 +104,7 @@ def download_file(url, name, min_size):
 # We define the number of threads as the number of cores divided by 2.
 # This is a dummy value. It should let a core to handle the main program.
 
-n_threads = multiprocessing.cpu_count() - 1
+n_threads = min(8, multiprocessing.cpu_count() - 1)
 print(f"n_threads={n_threads}")
 
 
@@ -161,13 +162,15 @@ def parallel(sesss, imgs, N=1):
 print(measure_time(lambda: parallel(sesss, imgs),
                    div_by_number=True, repeat=2, number=2))
 
+
 ###################################
 # It is worse for one image. It is expected as mentioned in the introduction.
 # Let's check for different number of images to parallelize.
 
+print("ORT // CPU")
 data = []
 rep = 2
-maxN = 18
+maxN = 21
 for N in tqdm.tqdm(range(1, maxN, 2)):
     begin = time.perf_counter()
     for i in range(rep):
@@ -184,8 +187,15 @@ def parallel(sesss, imgs, N=1):
     data.append(obs)
 
 df = pandas.DataFrame(data)
+df.to_csv("ort_cpu.csv", index=False)
 df
 
+#####################################
+# Let's free the memory.
+
+del sesss[:]
+gc.collect()
+
 ##########################################
 # Plots
 # +++++
@@ -255,6 +265,7 @@ def parallel_bind(sesss, imgs, N=1):
     return res
 
 
+print("ORT (bind) // CPU")
 data = []
 for N in tqdm.tqdm(range(1, maxN, 2)):
     begin = time.perf_counter()
@@ -271,9 +282,13 @@ def parallel_bind(sesss, imgs, N=1):
 
     data.append(obs)
 
+del sesss[:]
+gc.collect()
 df = pandas.DataFrame(data)
+df.to_csv("ort_cpu_bind.csv", index=False)
 df
 
+
 ############################
 # Plots.
 
@@ -301,6 +316,7 @@ def parallel_bind(sesss, imgs, N=1):
     imgs = [numpy.random.rand(*input_shape).astype(numpy.float32)
             for i in range(n_threads)]
 
+    print("ORT // CPU + GPU")
     data = []
     for N in tqdm.tqdm(range(1, maxN, 2)):
         begin = time.perf_counter()
@@ -317,10 +333,13 @@ def parallel_bind(sesss, imgs, N=1):
 
         data.append(obs)
 
+    del sesss[:]
+    gc.collect()
     df = pandas.DataFrame(data)
     df.to_csv("ort_cpu_gpu.csv", index=False)
 else:
-    df = None
+    print("No GPU is available but it should show something like the following.")
+    df = pandas.read_csv("data/ort_cpu_gpu.csv")
 
 df
 
@@ -334,6 +353,11 @@ def parallel_bind(sesss, imgs, N=1):
 
 ax
 
+####################################
+# The parallelization on mulitple CPU + GPUs is not really
+# working with this simple setting. GPU should take more
+# images as it is faster.
+
 #########################################
 # Parallelization on multiple GPUs
 # ++++++++++++++++++++++++++++++++
@@ -344,18 +368,22 @@ def parallel_bind(sesss, imgs, N=1):
 elif n_gpus == 1:
     print("1 GPU was detected.")
 else:
-    print(f"{n_gpus} were detected.")
+    print(f"{n_gpus} GPUs were detected.")
 
 
 if n_gpus > 1:
     n_threads = 2
-    sesss = [InferenceSession(model_name, providers=["CUDAExecutionProvider",
-                                                     "CPUExecutionProvider"],
-                              provider_options={"device_id": i})
-             for i in range(n_gpus)]
+    sesss = []
+    for i in range(n_gpus):
+        print(f"Initialize device {i}")
+        sesss.append(
+            InferenceSession(model_name, providers=["CUDAExecutionProvider",
+                                                    "CPUExecutionProvider"],
+                             provider_options=[{"device_id": i}, {}]))
     imgs = [numpy.random.rand(*input_shape).astype(numpy.float32)
             for i in range(n_threads)]
 
+    print("ORT // GPUs")
     data = []
     for N in tqdm.tqdm(range(1, maxN, 2)):
         begin = time.perf_counter()
@@ -372,10 +400,13 @@ def parallel_bind(sesss, imgs, N=1):
 
         data.append(obs)
 
+    del sesss[:]
+    gc.collect()
     df = pandas.DataFrame(data)
     df.to_csv("ort_gpus.csv", index=False)
 else:
-    df = None
+    print("No GPU is available but it should show something like the following.")
+    df = pandas.read_csv("data/ort_gpus.csv")
 
 df
 
@@ -390,6 +421,8 @@ def parallel_bind(sesss, imgs, N=1):
 
 ax
 
+####################################
+# The parallelization on mulitple GPUs did work.
 
 # import matplotlib.pyplot as plt
 # plt.show()

From 7b44628015f211a0d689560947dcd7483891c169 Mon Sep 17 00:00:00 2001
From: Xavier Dupre <xadupre@microsoft.com>
Date: Sun, 13 Nov 2022 08:54:47 +0000
Subject: [PATCH 12/16] update

---
 _doc/examples/data/ort_cpu_gpu.csv       | 20 ++++++++++----------
 _doc/examples/data/ort_gpus.csv          | 20 ++++++++++----------
 _doc/examples/plot_parallel_execution.py |  8 ++++----
 3 files changed, 24 insertions(+), 24 deletions(-)

diff --git a/_doc/examples/data/ort_cpu_gpu.csv b/_doc/examples/data/ort_cpu_gpu.csv
index 826d8139..97510823 100644
--- a/_doc/examples/data/ort_cpu_gpu.csv
+++ b/_doc/examples/data/ort_cpu_gpu.csv
@@ -1,11 +1,11 @@
 N,n_imgs_seq,time_seq,n_imgs_par,time_par
-1,2,0.5718123729893705,2,0.007878101489041
-3,6,0.016015594999771565,6,0.015513360500335693
-5,10,0.024301433499203995,10,0.023472609493182972
-7,14,0.03465705699636601,14,0.03860543250630144
-9,18,0.04403566100518219,18,0.04680142400320619
-11,22,0.06553528150834609,22,0.05793282100057695
-13,26,0.08153052550915163,26,0.05927092849742621
-15,30,0.0768452735064784,30,0.08342061600706074
-17,34,0.09201569500146434,34,0.08902498900715727
-19,38,0.11011747350858059,38,0.09492045298975427
+1,2,0.5849514909932623,2,0.02292487349768635
+3,6,0.01547511100943666,6,0.01833898900076747
+5,10,0.036359535486553796,10,0.0324736335023772
+7,14,0.04304601749754511,14,0.039623103497433476
+9,18,0.06215578749834094,18,0.04699261850328185
+11,22,0.0860430364991771,22,0.05861812600051053
+13,26,0.0699024919886142,26,0.06959964999987278
+15,30,0.09830122849962208,30,0.08662367198849097
+17,34,0.11123420301009901,34,0.10407247900729999
+19,38,0.10265450000588316,38,0.09627137100324035
diff --git a/_doc/examples/data/ort_gpus.csv b/_doc/examples/data/ort_gpus.csv
index 30f083c4..6d00bee4 100644
--- a/_doc/examples/data/ort_gpus.csv
+++ b/_doc/examples/data/ort_gpus.csv
@@ -1,11 +1,11 @@
 N,n_imgs_seq,time_seq,n_imgs_par,time_par
-1,2,0.7859347729972797,2,0.0027880600100615993
-3,6,0.01325492349860724,6,0.005836712007294409
-5,10,0.02211888850433752,10,0.008998960503959097
-7,14,0.027732157497666776,14,0.012500747499871068
-9,18,0.03999921299691778,18,0.015629247500328347
-11,22,0.048589186000754125,22,0.019298979008453898
-13,26,0.05781353948987089,26,0.022477426507975906
-15,30,0.0516846864920808,30,0.02620260650292039
-17,34,0.076614134493866,34,0.02977054200891871
-19,38,0.08429743749729823,38,0.03443809199961834
+1,2,0.7691331224923488,2,0.0028955735033378005
+3,6,0.010057882987894118,6,0.005550952511839569
+5,10,0.017624731990508735,10,0.009006410502479412
+7,14,0.030596987504395656,14,0.012265623998246156
+9,18,0.0391014114866266,18,0.015631284506525844
+11,22,0.047721832495881245,22,0.0192174395051552
+13,26,0.04471722950984258,26,0.02277063950896263
+15,30,0.06558763500652276,30,0.025715417010360397
+17,34,0.07659106400387827,34,0.029424325504805893
+19,38,0.07083825548761524,38,0.03291438950691372
diff --git a/_doc/examples/plot_parallel_execution.py b/_doc/examples/plot_parallel_execution.py
index f0bc2ef0..e410bd4a 100644
--- a/_doc/examples/plot_parallel_execution.py
+++ b/_doc/examples/plot_parallel_execution.py
@@ -187,7 +187,7 @@ def parallel(sesss, imgs, N=1):
     data.append(obs)
 
 df = pandas.DataFrame(data)
-df.to_csv("ort_cpu.csv", index=False)
+df.reset_index(drop=False).to_csv("ort_cpu.csv", index=False)
 df
 
 #####################################
@@ -285,7 +285,7 @@ def parallel_bind(sesss, imgs, N=1):
 del sesss[:]
 gc.collect()
 df = pandas.DataFrame(data)
-df.to_csv("ort_cpu_bind.csv", index=False)
+df.reset_index(drop=False).to_csv("ort_cpu_bind.csv", index=False)
 df
 
 
@@ -336,7 +336,7 @@ def parallel_bind(sesss, imgs, N=1):
     del sesss[:]
     gc.collect()
     df = pandas.DataFrame(data)
-    df.to_csv("ort_cpu_gpu.csv", index=False)
+    df.reset_index(drop=False).to_csv("ort_cpu_gpu.csv", index=False)
 else:
     print("No GPU is available but it should show something like the following.")
     df = pandas.read_csv("data/ort_cpu_gpu.csv")
@@ -403,7 +403,7 @@ def parallel_bind(sesss, imgs, N=1):
     del sesss[:]
     gc.collect()
     df = pandas.DataFrame(data)
-    df.to_csv("ort_gpus.csv", index=False)
+    df.reset_index(drop=False).to_csv("ort_gpus.csv", index=False)
 else:
     print("No GPU is available but it should show something like the following.")
     df = pandas.read_csv("data/ort_gpus.csv")

From 307e62b3d1826daf2d998dd4b05c5358418af204 Mon Sep 17 00:00:00 2001
From: xadupre <xadupre@microsoft.com>
Date: Sun, 13 Nov 2022 09:56:18 +0100
Subject: [PATCH 13/16] Update plot_parallel_execution.py

---
 _doc/examples/plot_parallel_execution.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/_doc/examples/plot_parallel_execution.py b/_doc/examples/plot_parallel_execution.py
index e410bd4a..80af19b9 100644
--- a/_doc/examples/plot_parallel_execution.py
+++ b/_doc/examples/plot_parallel_execution.py
@@ -339,7 +339,7 @@ def parallel_bind(sesss, imgs, N=1):
     df.reset_index(drop=False).to_csv("ort_cpu_gpu.csv", index=False)
 else:
     print("No GPU is available but it should show something like the following.")
-    df = pandas.read_csv("data/ort_cpu_gpu.csv")
+    df = pandas.read_csv("data/ort_cpu_gpu.csv").set_index("N")
 
 df
 
@@ -406,7 +406,7 @@ def parallel_bind(sesss, imgs, N=1):
     df.reset_index(drop=False).to_csv("ort_gpus.csv", index=False)
 else:
     print("No GPU is available but it should show something like the following.")
-    df = pandas.read_csv("data/ort_gpus.csv")
+    df = pandas.read_csv("data/ort_gpus.csv").set_index("N")
 
 df
 

From 431719468f0d224391873a3903248c1deb6c2a7e Mon Sep 17 00:00:00 2001
From: xadupre <xadupre@microsoft.com>
Date: Sun, 13 Nov 2022 10:02:47 +0100
Subject: [PATCH 14/16] Update plot_parallel_execution.py

---
 _doc/examples/plot_parallel_execution.py | 55 ++++++++++--------------
 1 file changed, 22 insertions(+), 33 deletions(-)

diff --git a/_doc/examples/plot_parallel_execution.py b/_doc/examples/plot_parallel_execution.py
index 80af19b9..e05c6e28 100644
--- a/_doc/examples/plot_parallel_execution.py
+++ b/_doc/examples/plot_parallel_execution.py
@@ -190,12 +190,6 @@ def parallel(sesss, imgs, N=1):
 df.reset_index(drop=False).to_csv("ort_cpu.csv", index=False)
 df
 
-#####################################
-# Let's free the memory.
-
-del sesss[:]
-gc.collect()
-
 ##########################################
 # Plots
 # +++++
@@ -206,7 +200,7 @@ def make_plot(df, title):
     df["time_par_img"] = df["time_par"] / df["n_imgs_par"]
 
     ax = df[["n_imgs_seq", "time_seq_img", "time_par_img"]].set_index("n_imgs_seq").plot(
-        title=title)
+        title=title, logy=True)
     ax.set_xlabel("batch size")
     ax.set_ylabel("s")
     return ax
@@ -282,12 +276,15 @@ def parallel_bind(sesss, imgs, N=1):
 
     data.append(obs)
 
-del sesss[:]
-gc.collect()
 df = pandas.DataFrame(data)
 df.reset_index(drop=False).to_csv("ort_cpu_bind.csv", index=False)
 df
 
+#####################################
+# Let's free the memory.
+
+del sesss[:]
+gc.collect()
 
 ############################
 # Plots.
@@ -303,12 +300,21 @@ def parallel_bind(sesss, imgs, N=1):
 has_cuda = "CUDAExecutionProvider" in get_all_providers()
 if not has_cuda:
     print(f"No CUDA provider was detected in {get_all_providers()}.")
+    
+n_gpus = torch.cuda.device_count() if has_cuda else 0
+if n_gpus == 0:
+    print("No GPU or one GPU was detected.")
+elif n_gpus == 1:
+    print("1 GPU was detected.")
+else:
+    print(f"{n_gpus} GPUs were detected.")
+
 
 #########################################
 # Parallelization GPU + CPU
 # +++++++++++++++++++++++++
 
-if has_cuda:
+if has_cuda and n_gpus > 0:
     n_threads = 2
     sesss = [InferenceSession(model_name, providers=["CPUExecutionProvider"]),
              InferenceSession(model_name, providers=["CUDAExecutionProvider",
@@ -338,7 +344,7 @@ def parallel_bind(sesss, imgs, N=1):
     df = pandas.DataFrame(data)
     df.reset_index(drop=False).to_csv("ort_cpu_gpu.csv", index=False)
 else:
-    print("No GPU is available but it should show something like the following.")
+    print("No GPU is available but data should be like the following.")
     df = pandas.read_csv("data/ort_cpu_gpu.csv").set_index("N")
 
 df
@@ -346,11 +352,7 @@ def parallel_bind(sesss, imgs, N=1):
 ####################################
 # Plots.
 
-if has_cuda:
-    ax = make_plot(df, "Time per image / batch size\nCPU + GPU")
-else:
-    ax = None
-
+ax = make_plot(df, "Time per image / batch size\nCPU + GPU")
 ax
 
 ####################################
@@ -362,15 +364,6 @@ def parallel_bind(sesss, imgs, N=1):
 # Parallelization on multiple GPUs
 # ++++++++++++++++++++++++++++++++
 
-n_gpus = torch.cuda.device_count() if has_cuda else 0
-if n_gpus == 0:
-    print("No GPU or one GPU was detected.")
-elif n_gpus == 1:
-    print("1 GPU was detected.")
-else:
-    print(f"{n_gpus} GPUs were detected.")
-
-
 if n_gpus > 1:
     n_threads = 2
     sesss = []
@@ -405,7 +398,7 @@ def parallel_bind(sesss, imgs, N=1):
     df = pandas.DataFrame(data)
     df.reset_index(drop=False).to_csv("ort_gpus.csv", index=False)
 else:
-    print("No GPU is available but it should show something like the following.")
+    print("No GPU is available but data should be like the following.")
     df = pandas.read_csv("data/ort_gpus.csv").set_index("N")
 
 df
@@ -414,15 +407,11 @@ def parallel_bind(sesss, imgs, N=1):
 ####################################
 # Plots.
 
-if n_gpus > 1:
-    ax = make_plot(df, "Time per image / batch size\nCPU + GPU")
-else:
-    ax = None
-
+ax = make_plot(df, f"Time per image / batch size\n{n_gpus} GPUs")
 ax
 
 ####################################
 # The parallelization on mulitple GPUs did work.
 
-# import matplotlib.pyplot as plt
-# plt.show()
+import matplotlib.pyplot as plt
+plt.show()

From bbb3aea45d13c89186cd346db4b60461a8f20d4b Mon Sep 17 00:00:00 2001
From: Xavier Dupre <xadupre@microsoft.com>
Date: Mon, 14 Nov 2022 08:06:42 +0000
Subject: [PATCH 15/16] update

---
 _doc/examples/data/ort_cpu_gpu.csv       | 22 +++----
 _doc/examples/plot_parallel_execution.py | 78 ++++++++++++++++--------
 2 files changed, 63 insertions(+), 37 deletions(-)

diff --git a/_doc/examples/data/ort_cpu_gpu.csv b/_doc/examples/data/ort_cpu_gpu.csv
index 97510823..562954e5 100644
--- a/_doc/examples/data/ort_cpu_gpu.csv
+++ b/_doc/examples/data/ort_cpu_gpu.csv
@@ -1,11 +1,11 @@
-N,n_imgs_seq,time_seq,n_imgs_par,time_par
-1,2,0.5849514909932623,2,0.02292487349768635
-3,6,0.01547511100943666,6,0.01833898900076747
-5,10,0.036359535486553796,10,0.0324736335023772
-7,14,0.04304601749754511,14,0.039623103497433476
-9,18,0.06215578749834094,18,0.04699261850328185
-11,22,0.0860430364991771,22,0.05861812600051053
-13,26,0.0699024919886142,26,0.06959964999987278
-15,30,0.09830122849962208,30,0.08662367198849097
-17,34,0.11123420301009901,34,0.10407247900729999
-19,38,0.10265450000588316,38,0.09627137100324035
+index,N,n_imgs_seq_cpu,time_seq_cpu,n_imgs_seq_gpu,time_seq_gpu,n_imgs_par,time_par
+0,1,2,0.00826602200686466,2,0.5539164490037365,2,0.008887254502042197
+1,3,6,0.019666356995003298,6,0.010879299501539208,6,0.02050348349439446
+2,5,10,0.03761136099637952,10,0.024563621496781707,10,0.025345650006784126
+3,7,14,0.05642429149884265,14,0.03381696599535644,14,0.03412535750248935
+4,9,18,0.061089862501830794,18,0.032227409988990985,18,0.051062139493296854
+5,11,22,0.08963397399929818,22,0.03988744800153654,22,0.049899421486770734
+6,13,26,0.08532479300629348,26,0.059844546995009296,26,0.06177879350434523
+7,15,30,0.099295007501496,30,0.060137088992632926,30,0.07637454000359867
+8,17,34,0.11972474250069354,34,0.08182918949751183,34,0.0649502059968654
+9,19,38,0.1271352384937927,38,0.06829091400140896,38,0.07087059249170125
diff --git a/_doc/examples/plot_parallel_execution.py b/_doc/examples/plot_parallel_execution.py
index e05c6e28..190a2d59 100644
--- a/_doc/examples/plot_parallel_execution.py
+++ b/_doc/examples/plot_parallel_execution.py
@@ -14,7 +14,7 @@
 For a big model such as a deeplearning model, this might be interesting.
 However, :epkg:`onnxruntime` already parallelize the computation of
 every operator (Gemm, MatMul) using all the CPU it can get so this approach
-should show significant result when used on different processors (CPU, GPU)
+should show significant results when used on different processors (CPU, GPU)
 in parallel.
 
 .. contents::
@@ -101,8 +101,7 @@ def download_file(url, name, min_size):
 # Parallelization
 # +++++++++++++++
 #
-# We define the number of threads as the number of cores divided by 2.
-# This is a dummy value. It should let a core to handle the main program.
+# We define a number of threads lower than the number of cores.
 
 n_threads = min(8, multiprocessing.cpu_count() - 1)
 print(f"n_threads={n_threads}")
@@ -118,15 +117,15 @@ def download_file(url, name, min_size):
 # Let's measure the time for a sequence of images.
 
 
-def sequence(sesss, imgs, N=1):
+def sequence(sess, imgs, N=1):
     res = []
-    for sess, img in zip(sesss, imgs):
+    for img in imgs:
         for i in range(N):
             res.append(sess.run(None, {input_name: img})[0])
     return res
 
 
-print(measure_time(lambda: sequence(sesss, imgs),
+print(measure_time(lambda: sequence(sesss[0], imgs),
                    div_by_number=True, repeat=2, number=2))
 
 #################################
@@ -174,7 +173,7 @@ def parallel(sesss, imgs, N=1):
 for N in tqdm.tqdm(range(1, maxN, 2)):
     begin = time.perf_counter()
     for i in range(rep):
-        res1 = sequence(sesss, imgs, N)
+        res1 = sequence(sesss[0], imgs, N)
     end = (time.perf_counter() - begin) / rep
     obs = dict(N=N, n_imgs_seq=len(res1), time_seq=end)
 
@@ -196,11 +195,19 @@ def parallel(sesss, imgs, N=1):
 
 
 def make_plot(df, title):
-    df["time_seq_img"] = df["time_seq"] / df["n_imgs_seq"]
-    df["time_par_img"] = df["time_par"] / df["n_imgs_par"]
 
-    ax = df[["n_imgs_seq", "time_seq_img", "time_par_img"]].set_index("n_imgs_seq").plot(
-        title=title, logy=True)
+    kwargs = dict(title=title, logy=True)
+    if "time_seq" in df.columns:
+        df["time_seq_img"] = df["time_seq"] / df["n_imgs_seq"]
+        df["time_par_img"] = df["time_par"] / df["n_imgs_par"]
+        columns = ["n_imgs_seq", "time_seq_img", "time_par_img"]
+    else:
+        df["time_seq_img_cpu"] = df["time_seq_cpu"] / df["n_imgs_seq_cpu"]
+        df["time_seq_img_gpu"] = df["time_seq_gpu"] / df["n_imgs_seq_gpu"]
+        df["time_par_img"] = df["time_par"] / df["n_imgs_par"]
+        columns = ["n_imgs_seq_cpu", "time_seq_img_cpu", "time_seq_img_gpu", "time_par_img"]
+
+    ax = df[columns].set_index(columns[0]).plot(**kwargs)
     ax.set_xlabel("batch size")
     ax.set_ylabel("s")
     return ax
@@ -209,9 +216,10 @@ def make_plot(df, title):
 make_plot(df, "Time per image / batch size")
 
 #######################################
-# As expected, it does not really improve. It is like parallezing using
-# both strategies, per kernel and per image, both trying to access all
-# the process cores.
+# As expected, it does not improve. It is like parallezing using
+# two strategies, per kernel and per image, both trying to access all
+# the process cores at the same time. The time spent to synchronize
+# is significant.
 
 ###################################################
 # Same with another API based on OrtValue
@@ -264,7 +272,7 @@ def parallel_bind(sesss, imgs, N=1):
 for N in tqdm.tqdm(range(1, maxN, 2)):
     begin = time.perf_counter()
     for i in range(rep):
-        res1 = sequence(sesss, imgs, N)
+        res1 = sequence(sesss[0], imgs, N)
     end = (time.perf_counter() - begin) / rep
     obs = dict(N=N, n_imgs_seq=len(res1), time_seq=end)
 
@@ -291,11 +299,16 @@ def parallel_bind(sesss, imgs, N=1):
 
 make_plot(df, "Time per image / batch size\nrun_with_iobinding")
 
+########################################
+# It leads to the same conclusion. It is no use to parallelize
+# on CPU as onnxruntime is already doing that per kernel.
+
+
 ########################################
 # GPU
 # ===
 #
-# Let's check first it is possible.
+# Let's check first if it is possible.
 
 has_cuda = "CUDAExecutionProvider" in get_all_providers()
 if not has_cuda:
@@ -327,9 +340,15 @@ def parallel_bind(sesss, imgs, N=1):
     for N in tqdm.tqdm(range(1, maxN, 2)):
         begin = time.perf_counter()
         for i in range(rep):
-            res1 = sequence(sesss, imgs, N)
+            res1 = sequence(sesss[0], imgs, N)
+        end = (time.perf_counter() - begin) / rep
+        obs = dict(N=N, n_imgs_seq_cpu=len(res1), time_seq_cpu=end)
+
+        begin = time.perf_counter()
+        for i in range(rep):
+            res2 = sequence(sesss[1], imgs, N)
         end = (time.perf_counter() - begin) / rep
-        obs = dict(N=N, n_imgs_seq=len(res1), time_seq=end)
+        obs.update(dict(n_imgs_seq_gpu=len(res2), time_seq_gpu=end))
 
         begin = time.perf_counter()
         for i in range(rep):
@@ -356,13 +375,14 @@ def parallel_bind(sesss, imgs, N=1):
 ax
 
 ####################################
-# The parallelization on mulitple CPU + GPUs is not really
-# working with this simple setting. GPU should take more
-# images as it is faster.
+# The parallelization on mulitple CPU + GPUs is working, it is faster than CPU
+# but it is still slower than using a single GPU in that case.
 
 #########################################
 # Parallelization on multiple GPUs
 # ++++++++++++++++++++++++++++++++
+#
+# This is the only case for which it should work as every GPU is indenpendent.
 
 if n_gpus > 1:
     n_threads = 2
@@ -381,9 +401,15 @@ def parallel_bind(sesss, imgs, N=1):
     for N in tqdm.tqdm(range(1, maxN, 2)):
         begin = time.perf_counter()
         for i in range(rep):
-            res1 = sequence(sesss, imgs, N)
+            res1 = sequence(sess1, imgs, N)
+        end = (time.perf_counter() - begin) / rep
+        obs = dict(N=N, n_imgs_seq_cpu=len(res1), time_seq_cpu=end)
+
+        begin = time.perf_counter()
+        for i in range(rep):
+            res2 = sequence(sesss[0], imgs, N)
         end = (time.perf_counter() - begin) / rep
-        obs = dict(N=N, n_imgs_seq=len(res1), time_seq=end)
+        obs.update(dict(n_imgs_seq_gpu=len(res2), time_seq_gpu=end))
 
         begin = time.perf_counter()
         for i in range(rep):
@@ -411,7 +437,7 @@ def parallel_bind(sesss, imgs, N=1):
 ax
 
 ####################################
-# The parallelization on mulitple GPUs did work.
+# The parallelization on multiple GPUs did work.
 
-import matplotlib.pyplot as plt
-plt.show()
+# import matplotlib.pyplot as plt
+# plt.show()

From 788dc57f6530ea236be74915282ad0f3a3a89ed0 Mon Sep 17 00:00:00 2001
From: xadupre <xadupre@microsoft.com>
Date: Mon, 14 Nov 2022 09:33:09 +0100
Subject: [PATCH 16/16] update

---
 .gitignore                               |  3 +++
 _doc/examples/data/ort_gpus.csv          | 22 +++++++++++-----------
 _doc/examples/plot_parallel_execution.py | 10 +++++++---
 3 files changed, 21 insertions(+), 14 deletions(-)

diff --git a/.gitignore b/.gitignore
index 1cc45db6..367ddcc6 100644
--- a/.gitignore
+++ b/.gitignore
@@ -61,3 +61,6 @@ _unittests/ut_documentation/summary.csv
 _unittests/ut_documentation/_test_example.txt
 _unittests/ut_documentation/_test_example.txt
 something
+_doc/examples/ort_cpu_bind.csv
+_doc/examples/ort_cpu_gpu.csv
+_doc/examples/ort_cpu.csv
diff --git a/_doc/examples/data/ort_gpus.csv b/_doc/examples/data/ort_gpus.csv
index 6d00bee4..1447d1e4 100644
--- a/_doc/examples/data/ort_gpus.csv
+++ b/_doc/examples/data/ort_gpus.csv
@@ -1,11 +1,11 @@
-N,n_imgs_seq,time_seq,n_imgs_par,time_par
-1,2,0.7691331224923488,2,0.0028955735033378005
-3,6,0.010057882987894118,6,0.005550952511839569
-5,10,0.017624731990508735,10,0.009006410502479412
-7,14,0.030596987504395656,14,0.012265623998246156
-9,18,0.0391014114866266,18,0.015631284506525844
-11,22,0.047721832495881245,22,0.0192174395051552
-13,26,0.04471722950984258,26,0.02277063950896263
-15,30,0.06558763500652276,30,0.025715417010360397
-17,34,0.07659106400387827,34,0.029424325504805893
-19,38,0.07083825548761524,38,0.03291438950691372
+index,N,n_imgs_seq_cpu,time_seq_cpu,n_imgs_seq_gpu,time_seq_gpu,n_imgs_par,time_par
+0,1,2,0.009441936999792233,2,0.27651189200696535,2,0.4723829315043986
+1,3,6,0.02211003350384999,6,0.010779099495266564,6,0.006075980491004884
+2,5,10,0.03130707699165214,10,0.018668279502890073,10,0.009614631999284029
+3,7,14,0.050966479000635445,14,0.02530089499487076,14,0.013326078507816419
+4,9,18,0.051026727494900115,18,0.03217840299475938,18,0.016715533987735398
+5,11,22,0.06609680749534164,22,0.03952224800013937,22,0.02055577700957656
+6,13,26,0.07877145400561858,26,0.046802844997728243,26,0.023463245990569703
+7,15,30,0.12377040000865236,30,0.05456937898998149,30,0.027040796499932185
+8,17,34,0.09810231548908632,34,0.06142483800067566,34,0.030876389500917867
+9,19,38,0.12459791499713901,38,0.07050566599355079,38,0.034905767504824325
diff --git a/_doc/examples/plot_parallel_execution.py b/_doc/examples/plot_parallel_execution.py
index 190a2d59..3a544e83 100644
--- a/_doc/examples/plot_parallel_execution.py
+++ b/_doc/examples/plot_parallel_execution.py
@@ -57,7 +57,10 @@ def download_file(url, name, min_size):
 
 
 small = True
-if small:
+if small == "custom":
+    model_name = "custom.onnx"
+    url_name = None
+elif small:
     model_name = "mobilenetv2-10.onnx"
     url_name = ("https://github.com/onnx/models/raw/main/vision/"
                 "classification/mobilenet/model")
@@ -205,7 +208,8 @@ def make_plot(df, title):
         df["time_seq_img_cpu"] = df["time_seq_cpu"] / df["n_imgs_seq_cpu"]
         df["time_seq_img_gpu"] = df["time_seq_gpu"] / df["n_imgs_seq_gpu"]
         df["time_par_img"] = df["time_par"] / df["n_imgs_par"]
-        columns = ["n_imgs_seq_cpu", "time_seq_img_cpu", "time_seq_img_gpu", "time_par_img"]
+        columns = ["n_imgs_seq_cpu", "time_seq_img_cpu",
+                   "time_seq_img_gpu", "time_par_img"]
 
     ax = df[columns].set_index(columns[0]).plot(**kwargs)
     ax.set_xlabel("batch size")
@@ -313,7 +317,7 @@ def parallel_bind(sesss, imgs, N=1):
 has_cuda = "CUDAExecutionProvider" in get_all_providers()
 if not has_cuda:
     print(f"No CUDA provider was detected in {get_all_providers()}.")
-    
+
 n_gpus = torch.cuda.device_count() if has_cuda else 0
 if n_gpus == 0:
     print("No GPU or one GPU was detected.")