From aa15268e440c60c6d15b30e521b8c19ffa05f88b Mon Sep 17 00:00:00 2001 From: xadupre Date: Fri, 11 Nov 2022 10:47:53 +0100 Subject: [PATCH 01/16] lint + example --- _doc/examples/plot_parallel_execution.py | 62 +++++++++++++++++++ _doc/sphinxdoc/source/conf.py | 1 + .../test_documentation_examples_lightgbm.py | 6 +- 3 files changed, 66 insertions(+), 3 deletions(-) create mode 100644 _doc/examples/plot_parallel_execution.py diff --git a/_doc/examples/plot_parallel_execution.py b/_doc/examples/plot_parallel_execution.py new file mode 100644 index 00000000..2a2372a0 --- /dev/null +++ b/_doc/examples/plot_parallel_execution.py @@ -0,0 +1,62 @@ +""" +.. _l-plot-parallel-execution: + +Multithreading with onnxruntime +=============================== + +.. index:: thread, parallel + +Python implements multithreading but it is not in practice due to the GIL +(see :epkg:`Le GIL`). However, if most of the parallelized code is not creating +python object, this option becomes more interesting than creating several processes +trying to exchange data through sockets. :epkg:`onnxruntime` falls into that category. +For a big model such as a deeplearning model, this might be interesting. +This example verifies this scenario. + +.. contents:: + :local: + +A model ++++++++ + +Let's retrieve a not so big model. +""" +import os +import urllib.request +import numpy +from onnxruntime import InferenceSession + + +def download_file(url, name, min_size): + if not os.path.exists(name): + print(f"download '{url}'") + with urllib.request.urlopen(url) as u: + content = u.read() + if len(content) < min_size: + raise RuntimeError( + f"Unable to download '{url}' due to\n{content}") + print(f"downloaded {len(content)} bytes.") + with open(name, "wb") as f: + f.write(content) + else: + print(f"'{name}' already downloaded") + + +model_name = "squeezenet1.1-7.onnx" +url_name = ("https://github.com/onnx/models/raw/main/vision/" + "classification/squeezenet/model") +url_name += "/" + model_name +download_file(url_name, model_name, 100000) + +############################################# +# Measuring inference time +# ++++++++++++++++++++++++ +# +# Let's create a random image. + +sess = InferenceSession(model_name) +for i in sess.get_inputs(): + print(f"input {i}, name={i.name!r}, type={i.type}, shape={i.shape}") + + +rnd_img = numpy.random.rand((1, 3, 224, 224)).astype(numpy.float32) diff --git a/_doc/sphinxdoc/source/conf.py b/_doc/sphinxdoc/source/conf.py index e0f978ef..1f1e7268 100644 --- a/_doc/sphinxdoc/source/conf.py +++ b/_doc/sphinxdoc/source/conf.py @@ -129,6 +129,7 @@ def callback_begin(): 'DLPack': 'https://github.com/dmlc/dlpack', 'docker': 'https://en.wikipedia.org/wiki/Docker_(software)', 'DOT': 'https://www.graphviz.org/doc/info/lang.html', + 'Le GIL': 'http://www.xavierdupre.fr/app/teachpyx/helpsphinx/notebooks/gil_example.html', 'ImageNet': 'http://www.image-net.org/', 'LightGBM': 'https://lightgbm.readthedocs.io/en/latest/', 'lightgbm': 'https://lightgbm.readthedocs.io/en/latest/', diff --git a/_unittests/ut_documentation/test_documentation_examples_lightgbm.py b/_unittests/ut_documentation/test_documentation_examples_lightgbm.py index 279b43f0..ebbe7445 100644 --- a/_unittests/ut_documentation/test_documentation_examples_lightgbm.py +++ b/_unittests/ut_documentation/test_documentation_examples_lightgbm.py @@ -2,7 +2,6 @@ @brief test log(time=60s) """ import unittest -from distutils.version import StrictVersion import os import sys import importlib @@ -10,6 +9,7 @@ from datetime import datetime import onnxruntime from pyquickhelper.pycode import ExtTestCase, skipif_appveyor +from pyquickhelper.texthelper.version_helper import compare_module_version def import_source(module_file_path, module_name): @@ -60,8 +60,8 @@ def test_documentation_examples_lightgbm(self): name)) continue if (name == "plot_pipeline_lightgbm.py" and - StrictVersion(onnxruntime.__version__) < - StrictVersion('1.0.0')): + compare_module_version( + onnxruntime.__version__, '1.0.0') < 0): continue if not name.startswith("plot_") or not name.endswith(".py"): continue From 2754bcb048ad09f07ae0bae5e577b4f9df742fc0 Mon Sep 17 00:00:00 2001 From: xadupre Date: Fri, 11 Nov 2022 13:12:41 +0100 Subject: [PATCH 02/16] complete example --- .circleci/config.yml | 2 +- _doc/examples/plot_parallel_execution.py | 146 ++++++++++++++++++++++- onnxcustom/utils/onnx_function.py | 4 +- 3 files changed, 144 insertions(+), 8 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index e44ec247..80b1ecaf 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -2,7 +2,7 @@ version: 2 jobs: build: docker: - - image: cimg/python:3.10.5 + - image: cimg/python:3.9.5 working_directory: ~/repo diff --git a/_doc/examples/plot_parallel_execution.py b/_doc/examples/plot_parallel_execution.py index 2a2372a0..9356ff7a 100644 --- a/_doc/examples/plot_parallel_execution.py +++ b/_doc/examples/plot_parallel_execution.py @@ -21,9 +21,15 @@ Let's retrieve a not so big model. """ +import multiprocessing import os import urllib.request +import threading +import time +import tqdm import numpy +import pandas +from cpyquickhelper.numbers import measure_time from onnxruntime import InferenceSession @@ -42,9 +48,15 @@ def download_file(url, name, min_size): print(f"'{name}' already downloaded") -model_name = "squeezenet1.1-7.onnx" -url_name = ("https://github.com/onnx/models/raw/main/vision/" - "classification/squeezenet/model") +small = True +if small: + model_name = "mobilenetv2-10.onnx" + url_name = ("https://github.com/onnx/models/raw/main/vision/" + "classification/mobilenet/model") +else: + model_name = "resnet18-v1-7.onnx" + url_name = ("https://github.com/onnx/models/raw/main/vision/" + "classification/resnet/model") url_name += "/" + model_name download_file(url_name, model_name, 100000) @@ -54,9 +66,133 @@ def download_file(url, name, min_size): # # Let's create a random image. -sess = InferenceSession(model_name) +sess = InferenceSession(model_name, providers=["CPUExecutionProvider"]) for i in sess.get_inputs(): print(f"input {i}, name={i.name!r}, type={i.type}, shape={i.shape}") + input_name = i.name + input_shape = list(i.shape) + if input_shape[0] in [None, "batch_size"]: + input_shape[0] = 1 -rnd_img = numpy.random.rand((1, 3, 224, 224)).astype(numpy.float32) +rnd_img = numpy.random.rand(*input_shape).astype(numpy.float32) + +res = sess.run(None, {input_name: rnd_img}) +print(f"output: type={res[0].dtype}, shape={res[0].shape}") + +print(measure_time(lambda: sess.run(None, {input_name: rnd_img}), + div_by_number=True, repeat=10, number=10)) + +############################################# +# Parallelization +# +++++++++++++++ +# +# We define the number of threads as the number of cores divided by 2. +# This is a dummy value. It should let a core to handle the main program. + +n_threads = multiprocessing.cpu_count() - 1 +print(f"n_threads={n_threads}") + + +imgs = [numpy.random.rand(*input_shape).astype(numpy.float32) + for i in range(n_threads)] + +sesss = [InferenceSession(model_name, providers=["CPUExecutionProvider"]) + for i in range(n_threads)] + +################################ +# First: sequence + +def sequence(N=1): + res = [] + for sess, img in zip(sesss, imgs): + for i in range(N): + res.append(sess.run(None, {input_name: img})[0]) + return res + +print(measure_time(sequence, div_by_number=True, repeat=2, number=2)) + +################################# +# Second: multitheading + +class MyThread(threading.Thread): + + def __init__(self, sess, imgs): + threading.Thread.__init__(self) + self.sess = sess + self.imgs = imgs + self.q = [] + + def run(self): + for img in self.imgs: + r = sess.run(None, {input_name: img})[0] + self.q.append(r) + + +def parallel(N=1): + threads = [MyThread(sess, [img] * N) + for sess, img in zip(sesss, imgs)] + for t in threads: + t.start() + res = [] + for t in threads: + t.join() + res.extend(t.q) + return res + +print(measure_time(parallel, div_by_number=True, repeat=2, number=2)) + +################################### +# It is worse for one image. Let's increase the number of images to parallelize. + +r_seq = sequence(4) +if len(r_seq) != n_threads * 4: + raise ValueError( + f"Unexpected number of results {len(r_seq)} != {n_threads * 4}.") +r_par = parallel(4) +if len(r_par) != n_threads * 4: + raise ValueError( + f"Unexpected number of results {len(r_par)} != {n_threads * 4}.") + +print(measure_time(lambda: sequence(4), div_by_number=True, repeat=2, number=2)) +print(measure_time(lambda: parallel(4), div_by_number=True, repeat=2, number=2)) + +#################################### +# Let's increase again. + +data = [] +for N in tqdm.tqdm(range(1, 21)): + begin = time.perf_counter() + res1 = sequence(N) + end = time.perf_counter() - begin + obs = dict(N=N, n_imgs_seq=len(res1), time_seq=end) + + begin = time.perf_counter() + res2 = parallel(N) + end = time.perf_counter() - begin + obs.update(dict(n_imgs_par=len(res2), time_par=end)) + + data.append(obs) + +df = pandas.DataFrame(data) +df + +########################################## +# Plotting +# ++++++++ + +df["time_seq_img"] = df["time_seq"] / df["n_imgs_seq"] +df["time_par_img"] = df["time_par"] / df["n_imgs_par"] + +ax = df[["n_imgs_seq", "time_seq_img", "time_par_img"]].set_index("n_imgs_seq").plot( + title="Time per image / batch size") +ax.set_xlabel("batch size") +ax.set_ylabel("s") + +####################################### +# It improves but not as much as expected. The number of interactions +# with python is still too high. + +# import matplotlib.pyplot as plt +# plt.show() + diff --git a/onnxcustom/utils/onnx_function.py b/onnxcustom/utils/onnx_function.py index 825b39be..5ae13fcd 100644 --- a/onnxcustom/utils/onnx_function.py +++ b/onnxcustom/utils/onnx_function.py @@ -760,7 +760,7 @@ def _onnx_grad_sigmoid_neg_log_loss_error(target_opset=None, print("DOT-SECTION", oinf.to_dot()) """ - from onnx.mapping import NP_TYPE_TO_TENSOR_TYPE + from onnx.helper import np_dtype_to_tensor_dtype from skl2onnx.algebra.onnx_ops import ( OnnxSub, OnnxMul, OnnxSigmoid, OnnxLog, OnnxNeg, OnnxReduceSum, OnnxReshape, OnnxAdd, OnnxCast, OnnxClip) @@ -771,7 +771,7 @@ def _onnx_grad_sigmoid_neg_log_loss_error(target_opset=None, op_version=target_opset) p0 = OnnxSub(numpy.array([1], dtype=dtype), p1, op_version=target_opset) - y1 = OnnxCast('X1', to=NP_TYPE_TO_TENSOR_TYPE[numpy.dtype(dtype)], + y1 = OnnxCast('X1', to=np_dtype_to_tensor_dtype(numpy.dtype(dtype)), op_version=target_opset) y0 = OnnxSub(numpy.array([1], dtype=dtype), y1, op_version=target_opset) From 5ad7d877461ea165d79dd984ca39899bf0a128bd Mon Sep 17 00:00:00 2001 From: xadupre Date: Fri, 11 Nov 2022 13:13:05 +0100 Subject: [PATCH 03/16] Update plot_parallel_execution.py --- _doc/examples/plot_parallel_execution.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/_doc/examples/plot_parallel_execution.py b/_doc/examples/plot_parallel_execution.py index 9356ff7a..dafdba96 100644 --- a/_doc/examples/plot_parallel_execution.py +++ b/_doc/examples/plot_parallel_execution.py @@ -191,7 +191,8 @@ def parallel(N=1): ####################################### # It improves but not as much as expected. The number of interactions -# with python is still too high. +# with python is still too high. The bigger the model is, the better it +# should be. # import matplotlib.pyplot as plt # plt.show() From bb51ea32717fab340dd33f42bdfb77b8fc4e395f Mon Sep 17 00:00:00 2001 From: xadupre Date: Fri, 11 Nov 2022 13:23:23 +0100 Subject: [PATCH 04/16] import error --- onnxcustom/utils/onnx_function.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/onnxcustom/utils/onnx_function.py b/onnxcustom/utils/onnx_function.py index 5ae13fcd..93137f11 100644 --- a/onnxcustom/utils/onnx_function.py +++ b/onnxcustom/utils/onnx_function.py @@ -760,7 +760,14 @@ def _onnx_grad_sigmoid_neg_log_loss_error(target_opset=None, print("DOT-SECTION", oinf.to_dot()) """ - from onnx.helper import np_dtype_to_tensor_dtype + try: + from onnx.helper import np_dtype_to_tensor_dtype + except ImportError: + from onnx.mapping import NP_TYPE_TO_TENSOR_TYPE + + def np_dtype_to_tensor_dtype(dtype): + return NP_TYPE_TO_TENSOR_TYPE[dtype] + from skl2onnx.algebra.onnx_ops import ( OnnxSub, OnnxMul, OnnxSigmoid, OnnxLog, OnnxNeg, OnnxReduceSum, OnnxReshape, OnnxAdd, OnnxCast, OnnxClip) From 53d9c15b04bb674c8a82d079920bc35abd609c9e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?xavier=20dupr=C3=A9?= Date: Fri, 11 Nov 2022 14:11:58 +0100 Subject: [PATCH 05/16] lint --- onnxcustom/utils/onnx_function.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/onnxcustom/utils/onnx_function.py b/onnxcustom/utils/onnx_function.py index 93137f11..ffa4fd52 100644 --- a/onnxcustom/utils/onnx_function.py +++ b/onnxcustom/utils/onnx_function.py @@ -764,10 +764,10 @@ def _onnx_grad_sigmoid_neg_log_loss_error(target_opset=None, from onnx.helper import np_dtype_to_tensor_dtype except ImportError: from onnx.mapping import NP_TYPE_TO_TENSOR_TYPE - + def np_dtype_to_tensor_dtype(dtype): return NP_TYPE_TO_TENSOR_TYPE[dtype] - + from skl2onnx.algebra.onnx_ops import ( OnnxSub, OnnxMul, OnnxSigmoid, OnnxLog, OnnxNeg, OnnxReduceSum, OnnxReshape, OnnxAdd, OnnxCast, OnnxClip) From a5fe43b83e479a97e90a430999c685ff5b2b941d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?xavier=20dupr=C3=A9?= Date: Fri, 11 Nov 2022 14:40:24 +0100 Subject: [PATCH 06/16] lint --- _doc/examples/plot_parallel_execution.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/_doc/examples/plot_parallel_execution.py b/_doc/examples/plot_parallel_execution.py index dafdba96..95025b23 100644 --- a/_doc/examples/plot_parallel_execution.py +++ b/_doc/examples/plot_parallel_execution.py @@ -103,6 +103,7 @@ def download_file(url, name, min_size): ################################ # First: sequence + def sequence(N=1): res = [] for sess, img in zip(sesss, imgs): @@ -110,11 +111,13 @@ def sequence(N=1): res.append(sess.run(None, {input_name: img})[0]) return res + print(measure_time(sequence, div_by_number=True, repeat=2, number=2)) ################################# # Second: multitheading + class MyThread(threading.Thread): def __init__(self, sess, imgs): @@ -140,6 +143,7 @@ def parallel(N=1): res.extend(t.q) return res + print(measure_time(parallel, div_by_number=True, repeat=2, number=2)) ################################### @@ -149,7 +153,7 @@ def parallel(N=1): if len(r_seq) != n_threads * 4: raise ValueError( f"Unexpected number of results {len(r_seq)} != {n_threads * 4}.") -r_par = parallel(4) +r_par = parallel(4) if len(r_par) != n_threads * 4: raise ValueError( f"Unexpected number of results {len(r_par)} != {n_threads * 4}.") @@ -196,4 +200,3 @@ def parallel(N=1): # import matplotlib.pyplot as plt # plt.show() - From 0f5c76328241fe655bde4e12d09dbece678ac5b4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?xavier=20dupr=C3=A9?= Date: Fri, 11 Nov 2022 19:42:26 +0100 Subject: [PATCH 07/16] Update plot_parallel_execution.py --- _doc/examples/plot_parallel_execution.py | 112 ++++++++++++++++++++--- 1 file changed, 98 insertions(+), 14 deletions(-) diff --git a/_doc/examples/plot_parallel_execution.py b/_doc/examples/plot_parallel_execution.py index 95025b23..aca387c5 100644 --- a/_doc/examples/plot_parallel_execution.py +++ b/_doc/examples/plot_parallel_execution.py @@ -31,6 +31,8 @@ import pandas from cpyquickhelper.numbers import measure_time from onnxruntime import InferenceSession +from onnxruntime.capi._pybind_state import ( # pylint: disable=E0611 + SessionIOBinding, OrtDevice as C_OrtDevice) def download_file(url, name, min_size): @@ -66,21 +68,24 @@ def download_file(url, name, min_size): # # Let's create a random image. -sess = InferenceSession(model_name, providers=["CPUExecutionProvider"]) -for i in sess.get_inputs(): +sess1 = InferenceSession(model_name, providers=["CPUExecutionProvider"]) +for i in sess1.get_inputs(): print(f"input {i}, name={i.name!r}, type={i.type}, shape={i.shape}") input_name = i.name input_shape = list(i.shape) - if input_shape[0] in [None, "batch_size"]: + if input_shape[0] in [None, "batch_size", "N"]: input_shape[0] = 1 +for i in sess1.get_outputs(): + print(f"output {i}, name={i.name!r}, type={i.type}, shape={i.shape}") + output_name = i.name rnd_img = numpy.random.rand(*input_shape).astype(numpy.float32) -res = sess.run(None, {input_name: rnd_img}) +res = sess1.run(None, {input_name: rnd_img}) print(f"output: type={res[0].dtype}, shape={res[0].shape}") -print(measure_time(lambda: sess.run(None, {input_name: rnd_img}), +print(measure_time(lambda: sess1.run(None, {input_name: rnd_img}), div_by_number=True, repeat=10, number=10)) ############################################# @@ -128,7 +133,7 @@ def __init__(self, sess, imgs): def run(self): for img in self.imgs: - r = sess.run(None, {input_name: img})[0] + r = self.sess.run(None, {input_name: img})[0] self.q.append(r) @@ -165,15 +170,19 @@ def parallel(N=1): # Let's increase again. data = [] -for N in tqdm.tqdm(range(1, 21)): +rep = 2 +maxN = 18 +for N in tqdm.tqdm(range(1, maxN, 2)): begin = time.perf_counter() - res1 = sequence(N) - end = time.perf_counter() - begin + for i in range(rep): + res1 = sequence(N) + end = (time.perf_counter() - begin) / rep obs = dict(N=N, n_imgs_seq=len(res1), time_seq=end) begin = time.perf_counter() - res2 = parallel(N) - end = time.perf_counter() - begin + for i in range(rep): + res2 = parallel(N) + end = (time.perf_counter() - begin) / rep obs.update(dict(n_imgs_par=len(res2), time_par=end)) data.append(obs) @@ -194,9 +203,84 @@ def parallel(N=1): ax.set_ylabel("s") ####################################### -# It improves but not as much as expected. The number of interactions +# It does not really improve. The number of interactions # with python is still too high. The bigger the model is, the better it # should be. -# import matplotlib.pyplot as plt -# plt.show() +################################################### +# With another API +# ++++++++++++++++ + + +class MyThreadBind(threading.Thread): + + def __init__(self, sess, imgs): + threading.Thread.__init__(self) + self.sess = sess + self.imgs = imgs + self.q = [] + self.bind = SessionIOBinding(self.sess._sess) + self.ort_device = C_OrtDevice( + C_OrtDevice.cpu(), C_OrtDevice.default_memory(), 0) + + def run(self): + bind = self.bind + ort_device = self.ort_device + bind.bind_output(output_name, ort_device) + sess = self.sess._sess + q = self.q + for img in self.imgs: + bind.bind_input(input_name, ort_device, + img.dtype, img.shape, + img.__array_interface__['data'][0]) + sess.run_with_iobinding(bind, None) + ortvalues = bind.get_outputs() + q.append(ortvalues) + + +def parallel_bind(N=1): + threads = [MyThreadBind(sess, [img] * N) + for sess, img in zip(sesss, imgs)] + for t in threads: + t.start() + res = [] + for t in threads: + t.join() + res.extend(t.q) + return res + + +data = [] +for N in tqdm.tqdm(range(1, maxN, 2)): + begin = time.perf_counter() + for i in range(rep): + res1 = sequence(N) + end = (time.perf_counter() - begin) / rep + obs = dict(N=N, n_imgs_seq=len(res1), time_seq=end) + + begin = time.perf_counter() + for i in range(rep): + res2 = parallel_bind(N) + end = (time.perf_counter() - begin) / rep + obs.update(dict(n_imgs_par=len(res2), time_par=end)) + + data.append(obs) + +df = pandas.DataFrame(data) +df + +############################ +# Plots +# +++++ + + +df["time_seq_img"] = df["time_seq"] / df["n_imgs_seq"] +df["time_par_img"] = df["time_par"] / df["n_imgs_par"] + +ax = df[["n_imgs_seq", "time_seq_img", "time_par_img"]].set_index("n_imgs_seq").plot( + title="Time per image / batch size\nrun_with_iobinding") +ax.set_xlabel("batch size") +ax.set_ylabel("s") + +import matplotlib.pyplot as plt +plt.show() From 3f1001b4da191ca42b9f9732050a55815bf10faa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?xavier=20dupr=C3=A9?= Date: Fri, 11 Nov 2022 19:42:50 +0100 Subject: [PATCH 08/16] Update plot_parallel_execution.py --- _doc/examples/plot_parallel_execution.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/_doc/examples/plot_parallel_execution.py b/_doc/examples/plot_parallel_execution.py index aca387c5..a85d1249 100644 --- a/_doc/examples/plot_parallel_execution.py +++ b/_doc/examples/plot_parallel_execution.py @@ -232,7 +232,7 @@ def run(self): for img in self.imgs: bind.bind_input(input_name, ort_device, img.dtype, img.shape, - img.__array_interface__['data'][0]) + img.__array_interface__['data'][0]) sess.run_with_iobinding(bind, None) ortvalues = bind.get_outputs() q.append(ortvalues) @@ -282,5 +282,5 @@ def parallel_bind(N=1): ax.set_xlabel("batch size") ax.set_ylabel("s") -import matplotlib.pyplot as plt -plt.show() +# import matplotlib.pyplot as plt +# plt.show() From c1ba594c61d756a38fa2cd30248be9e9e4573599 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?xavier=20dupr=C3=A9?= Date: Fri, 11 Nov 2022 19:59:06 +0100 Subject: [PATCH 09/16] Update plot_parallel_execution.py --- _doc/examples/plot_parallel_execution.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/_doc/examples/plot_parallel_execution.py b/_doc/examples/plot_parallel_execution.py index a85d1249..688b92c4 100644 --- a/_doc/examples/plot_parallel_execution.py +++ b/_doc/examples/plot_parallel_execution.py @@ -232,7 +232,7 @@ def run(self): for img in self.imgs: bind.bind_input(input_name, ort_device, img.dtype, img.shape, - img.__array_interface__['data'][0]) + img.__array_interface__['data'][0]) sess.run_with_iobinding(bind, None) ortvalues = bind.get_outputs() q.append(ortvalues) From c98f7942e27c0043eee28cca2162775111583fc6 Mon Sep 17 00:00:00 2001 From: xadupre Date: Sat, 12 Nov 2022 12:41:29 +0100 Subject: [PATCH 10/16] documentation and examples --- _doc/examples/plot_parallel_execution.py | 223 +++++++++++++----- .../source/api/onnxruntime_python/index.rst | 2 +- .../tutorial_onnxruntime/ortvalue_doc.rst | 2 + 3 files changed, 169 insertions(+), 58 deletions(-) diff --git a/_doc/examples/plot_parallel_execution.py b/_doc/examples/plot_parallel_execution.py index 688b92c4..fd07ec6a 100644 --- a/_doc/examples/plot_parallel_execution.py +++ b/_doc/examples/plot_parallel_execution.py @@ -1,6 +1,7 @@ """ .. _l-plot-parallel-execution: +=============================== Multithreading with onnxruntime =============================== @@ -11,13 +12,16 @@ python object, this option becomes more interesting than creating several processes trying to exchange data through sockets. :epkg:`onnxruntime` falls into that category. For a big model such as a deeplearning model, this might be interesting. -This example verifies this scenario. +However, :epkg:`onnxruntime` already parallelize the computation of +every operator (Gemm, MatMul) using all the CPU it can get so this approach +should show significant result when used on different processors (CPU, GPU) +in parallel. .. contents:: :local: A model -+++++++ +======= Let's retrieve a not so big model. """ @@ -30,7 +34,8 @@ import numpy import pandas from cpyquickhelper.numbers import measure_time -from onnxruntime import InferenceSession +import torch.cuda +from onnxruntime import InferenceSession, get_all_providers from onnxruntime.capi._pybind_state import ( # pylint: disable=E0611 SessionIOBinding, OrtDevice as C_OrtDevice) @@ -63,8 +68,11 @@ def download_file(url, name, min_size): download_file(url_name, model_name, 100000) ############################################# -# Measuring inference time -# ++++++++++++++++++++++++ +# Measuring inference time when parallelizing on CPU +# ================================================== +# +# Sequence +# ++++++++ # # Let's create a random image. @@ -106,10 +114,10 @@ def download_file(url, name, min_size): for i in range(n_threads)] ################################ -# First: sequence +# Let's measure the time for a sequence of images. -def sequence(N=1): +def sequence(sesss, imgs, N=1): res = [] for sess, img in zip(sesss, imgs): for i in range(N): @@ -117,10 +125,11 @@ def sequence(N=1): return res -print(measure_time(sequence, div_by_number=True, repeat=2, number=2)) +print(measure_time(lambda: sequence(sesss, imgs), + div_by_number=True, repeat=2, number=2)) ################################# -# Second: multitheading +# And then with multithreading. class MyThread(threading.Thread): @@ -137,7 +146,7 @@ def run(self): self.q.append(r) -def parallel(N=1): +def parallel(sesss, imgs, N=1): threads = [MyThread(sess, [img] * N) for sess, img in zip(sesss, imgs)] for t in threads: @@ -149,25 +158,12 @@ def parallel(N=1): return res -print(measure_time(parallel, div_by_number=True, repeat=2, number=2)) +print(measure_time(lambda: parallel(sesss, imgs), + div_by_number=True, repeat=2, number=2)) ################################### -# It is worse for one image. Let's increase the number of images to parallelize. - -r_seq = sequence(4) -if len(r_seq) != n_threads * 4: - raise ValueError( - f"Unexpected number of results {len(r_seq)} != {n_threads * 4}.") -r_par = parallel(4) -if len(r_par) != n_threads * 4: - raise ValueError( - f"Unexpected number of results {len(r_par)} != {n_threads * 4}.") - -print(measure_time(lambda: sequence(4), div_by_number=True, repeat=2, number=2)) -print(measure_time(lambda: parallel(4), div_by_number=True, repeat=2, number=2)) - -#################################### -# Let's increase again. +# It is worse for one image. It is expected as mentioned in the introduction. +# Let's check for different number of images to parallelize. data = [] rep = 2 @@ -175,13 +171,13 @@ def parallel(N=1): for N in tqdm.tqdm(range(1, maxN, 2)): begin = time.perf_counter() for i in range(rep): - res1 = sequence(N) + res1 = sequence(sesss, imgs, N) end = (time.perf_counter() - begin) / rep obs = dict(N=N, n_imgs_seq=len(res1), time_seq=end) begin = time.perf_counter() for i in range(rep): - res2 = parallel(N) + res2 = parallel(sesss, imgs, N) end = (time.perf_counter() - begin) / rep obs.update(dict(n_imgs_par=len(res2), time_par=end)) @@ -191,37 +187,44 @@ def parallel(N=1): df ########################################## -# Plotting -# ++++++++ +# Plots +# +++++ -df["time_seq_img"] = df["time_seq"] / df["n_imgs_seq"] -df["time_par_img"] = df["time_par"] / df["n_imgs_par"] -ax = df[["n_imgs_seq", "time_seq_img", "time_par_img"]].set_index("n_imgs_seq").plot( - title="Time per image / batch size") -ax.set_xlabel("batch size") -ax.set_ylabel("s") +def make_plot(df, title): + df["time_seq_img"] = df["time_seq"] / df["n_imgs_seq"] + df["time_par_img"] = df["time_par"] / df["n_imgs_par"] + + ax = df[["n_imgs_seq", "time_seq_img", "time_par_img"]].set_index("n_imgs_seq").plot( + title=title) + ax.set_xlabel("batch size") + ax.set_ylabel("s") + return ax + + +make_plot(df, "Time per image / batch size") ####################################### -# It does not really improve. The number of interactions -# with python is still too high. The bigger the model is, the better it -# should be. +# As expected, it does not really improve. It is like parallezing using +# both strategies, per kernel and per image, both trying to access all +# the process cores. ################################################### -# With another API -# ++++++++++++++++ +# Same with another API based on OrtValue +# +++++++++++++++++++++++++++++++++++++++ +# +# See :epkg:`l-ortvalue-doc`. class MyThreadBind(threading.Thread): - def __init__(self, sess, imgs): + def __init__(self, sess, imgs, ort_device): threading.Thread.__init__(self) self.sess = sess self.imgs = imgs self.q = [] self.bind = SessionIOBinding(self.sess._sess) - self.ort_device = C_OrtDevice( - C_OrtDevice.cpu(), C_OrtDevice.default_memory(), 0) + self.ort_device = ort_device def run(self): bind = self.bind @@ -238,8 +241,10 @@ def run(self): q.append(ortvalues) -def parallel_bind(N=1): - threads = [MyThreadBind(sess, [img] * N) +def parallel_bind(sesss, imgs, N=1): + ort_device = C_OrtDevice( + C_OrtDevice.cpu(), C_OrtDevice.default_memory(), 0) + threads = [MyThreadBind(sess, [img] * N, ort_device) for sess, img in zip(sesss, imgs)] for t in threads: t.start() @@ -254,13 +259,13 @@ def parallel_bind(N=1): for N in tqdm.tqdm(range(1, maxN, 2)): begin = time.perf_counter() for i in range(rep): - res1 = sequence(N) + res1 = sequence(sesss, imgs, N) end = (time.perf_counter() - begin) / rep obs = dict(N=N, n_imgs_seq=len(res1), time_seq=end) begin = time.perf_counter() for i in range(rep): - res2 = parallel_bind(N) + res2 = parallel_bind(sesss, imgs, N) end = (time.perf_counter() - begin) / rep obs.update(dict(n_imgs_par=len(res2), time_par=end)) @@ -270,17 +275,121 @@ def parallel_bind(N=1): df ############################ -# Plots -# +++++ +# Plots. + +make_plot(df, "Time per image / batch size\nrun_with_iobinding") + +######################################## +# GPU +# === +# +# Let's check first it is possible. + +has_cuda = "CUDAExecutionProvider" in get_all_providers() +if not has_cuda: + print(f"No CUDA provider was detected in {get_all_providers()}.") + +######################################### +# Parallelization GPU + CPU +# +++++++++++++++++++++++++ + +if has_cuda: + n_threads = 2 + sesss = [InferenceSession(model_name, providers=["CPUExecutionProvider"]), + InferenceSession(model_name, providers=["CUDAExecutionProvider", + "CPUExecutionProvider"])] + imgs = [numpy.random.rand(*input_shape).astype(numpy.float32) + for i in range(n_threads)] + + data = [] + for N in tqdm.tqdm(range(1, maxN, 2)): + begin = time.perf_counter() + for i in range(rep): + res1 = sequence(sesss, imgs, N) + end = (time.perf_counter() - begin) / rep + obs = dict(N=N, n_imgs_seq=len(res1), time_seq=end) + + begin = time.perf_counter() + for i in range(rep): + res2 = parallel_bind(sesss, imgs, N) + end = (time.perf_counter() - begin) / rep + obs.update(dict(n_imgs_par=len(res2), time_par=end)) + + data.append(obs) + + df = pandas.DataFrame(data) + df.to_csv("ort_cpu_gpu.csv", index=False) +else: + df = None + +df + +#################################### +# Plots. + +if has_cuda: + ax = make_plot(df, "Time per image / batch size\nCPU + GPU") +else: + ax = None + +ax + +######################################### +# Parallelization on multiple GPUs +# ++++++++++++++++++++++++++++++++ +n_gpus = torch.cuda.device_count() if has_cuda else 0 +if n_gpus == 0: + print("No GPU or one GPU was detected.") +elif n_gpus == 1: + print("1 GPU was detected.") +else: + print(f"{n_gpus} were detected.") + + +if n_gpus > 1: + n_threads = 2 + sesss = [InferenceSession(model_name, providers=["CUDAExecutionProvider", + "CPUExecutionProvider"], + provider_options={"device_id": i}) + for i in range(n_gpus)] + imgs = [numpy.random.rand(*input_shape).astype(numpy.float32) + for i in range(n_threads)] + + data = [] + for N in tqdm.tqdm(range(1, maxN, 2)): + begin = time.perf_counter() + for i in range(rep): + res1 = sequence(sesss, imgs, N) + end = (time.perf_counter() - begin) / rep + obs = dict(N=N, n_imgs_seq=len(res1), time_seq=end) + + begin = time.perf_counter() + for i in range(rep): + res2 = parallel_bind(sesss, imgs, N) + end = (time.perf_counter() - begin) / rep + obs.update(dict(n_imgs_par=len(res2), time_par=end)) + + data.append(obs) + + df = pandas.DataFrame(data) + df.to_csv("ort_gpus.csv", index=False) +else: + df = None + +df + + +#################################### +# Plots. + +if n_gpus > 1: + ax = make_plot(df, "Time per image / batch size\nCPU + GPU") +else: + ax = None -df["time_seq_img"] = df["time_seq"] / df["n_imgs_seq"] -df["time_par_img"] = df["time_par"] / df["n_imgs_par"] +ax -ax = df[["n_imgs_seq", "time_seq_img", "time_par_img"]].set_index("n_imgs_seq").plot( - title="Time per image / batch size\nrun_with_iobinding") -ax.set_xlabel("batch size") -ax.set_ylabel("s") # import matplotlib.pyplot as plt # plt.show() diff --git a/_doc/sphinxdoc/source/api/onnxruntime_python/index.rst b/_doc/sphinxdoc/source/api/onnxruntime_python/index.rst index 45e6dcbf..e8ade2c9 100644 --- a/_doc/sphinxdoc/source/api/onnxruntime_python/index.rst +++ b/_doc/sphinxdoc/source/api/onnxruntime_python/index.rst @@ -4,7 +4,7 @@ Summary of onnxruntime and onnxruntime-training API Module :epkg:`onnxcustom` leverages :epkg:`onnxruntime-training` to train models. Next sections exposes frequent functions uses to run inference -and training with :epkg:`onnxruntime` and :epkg:`onnxruntume-training`. +and training with :epkg:`onnxruntime` and :epkg:`onnxruntime-training`. Most of the code in :epkg:`onnxruntime` is written in C++ and exposed in Python using :epkg:`pybind11`. For inference, the main class diff --git a/_doc/sphinxdoc/source/tutorials/tutorial_onnxruntime/ortvalue_doc.rst b/_doc/sphinxdoc/source/tutorials/tutorial_onnxruntime/ortvalue_doc.rst index 8cca3286..65a1a6db 100644 --- a/_doc/sphinxdoc/source/tutorials/tutorial_onnxruntime/ortvalue_doc.rst +++ b/_doc/sphinxdoc/source/tutorials/tutorial_onnxruntime/ortvalue_doc.rst @@ -1,4 +1,6 @@ +.. _l-ortvalue-doc: + ======== OrtValue ======== From d4346349f3ae2e50e960bd3916588404b0c20aff Mon Sep 17 00:00:00 2001 From: Xavier Dupre Date: Sun, 13 Nov 2022 08:50:08 +0000 Subject: [PATCH 11/16] fix example Signed-off-by: Xavier Dupre --- _doc/examples/data/ort_cpu_gpu.csv | 11 +++++ _doc/examples/data/ort_gpus.csv | 11 +++++ _doc/examples/plot_parallel_execution.py | 51 +++++++++++++++++++----- 3 files changed, 64 insertions(+), 9 deletions(-) create mode 100644 _doc/examples/data/ort_cpu_gpu.csv create mode 100644 _doc/examples/data/ort_gpus.csv diff --git a/_doc/examples/data/ort_cpu_gpu.csv b/_doc/examples/data/ort_cpu_gpu.csv new file mode 100644 index 00000000..826d8139 --- /dev/null +++ b/_doc/examples/data/ort_cpu_gpu.csv @@ -0,0 +1,11 @@ +N,n_imgs_seq,time_seq,n_imgs_par,time_par +1,2,0.5718123729893705,2,0.007878101489041 +3,6,0.016015594999771565,6,0.015513360500335693 +5,10,0.024301433499203995,10,0.023472609493182972 +7,14,0.03465705699636601,14,0.03860543250630144 +9,18,0.04403566100518219,18,0.04680142400320619 +11,22,0.06553528150834609,22,0.05793282100057695 +13,26,0.08153052550915163,26,0.05927092849742621 +15,30,0.0768452735064784,30,0.08342061600706074 +17,34,0.09201569500146434,34,0.08902498900715727 +19,38,0.11011747350858059,38,0.09492045298975427 diff --git a/_doc/examples/data/ort_gpus.csv b/_doc/examples/data/ort_gpus.csv new file mode 100644 index 00000000..30f083c4 --- /dev/null +++ b/_doc/examples/data/ort_gpus.csv @@ -0,0 +1,11 @@ +N,n_imgs_seq,time_seq,n_imgs_par,time_par +1,2,0.7859347729972797,2,0.0027880600100615993 +3,6,0.01325492349860724,6,0.005836712007294409 +5,10,0.02211888850433752,10,0.008998960503959097 +7,14,0.027732157497666776,14,0.012500747499871068 +9,18,0.03999921299691778,18,0.015629247500328347 +11,22,0.048589186000754125,22,0.019298979008453898 +13,26,0.05781353948987089,26,0.022477426507975906 +15,30,0.0516846864920808,30,0.02620260650292039 +17,34,0.076614134493866,34,0.02977054200891871 +19,38,0.08429743749729823,38,0.03443809199961834 diff --git a/_doc/examples/plot_parallel_execution.py b/_doc/examples/plot_parallel_execution.py index fd07ec6a..f0bc2ef0 100644 --- a/_doc/examples/plot_parallel_execution.py +++ b/_doc/examples/plot_parallel_execution.py @@ -25,6 +25,7 @@ Let's retrieve a not so big model. """ +import gc import multiprocessing import os import urllib.request @@ -103,7 +104,7 @@ def download_file(url, name, min_size): # We define the number of threads as the number of cores divided by 2. # This is a dummy value. It should let a core to handle the main program. -n_threads = multiprocessing.cpu_count() - 1 +n_threads = min(8, multiprocessing.cpu_count() - 1) print(f"n_threads={n_threads}") @@ -161,13 +162,15 @@ def parallel(sesss, imgs, N=1): print(measure_time(lambda: parallel(sesss, imgs), div_by_number=True, repeat=2, number=2)) + ################################### # It is worse for one image. It is expected as mentioned in the introduction. # Let's check for different number of images to parallelize. +print("ORT // CPU") data = [] rep = 2 -maxN = 18 +maxN = 21 for N in tqdm.tqdm(range(1, maxN, 2)): begin = time.perf_counter() for i in range(rep): @@ -184,8 +187,15 @@ def parallel(sesss, imgs, N=1): data.append(obs) df = pandas.DataFrame(data) +df.to_csv("ort_cpu.csv", index=False) df +##################################### +# Let's free the memory. + +del sesss[:] +gc.collect() + ########################################## # Plots # +++++ @@ -255,6 +265,7 @@ def parallel_bind(sesss, imgs, N=1): return res +print("ORT (bind) // CPU") data = [] for N in tqdm.tqdm(range(1, maxN, 2)): begin = time.perf_counter() @@ -271,9 +282,13 @@ def parallel_bind(sesss, imgs, N=1): data.append(obs) +del sesss[:] +gc.collect() df = pandas.DataFrame(data) +df.to_csv("ort_cpu_bind.csv", index=False) df + ############################ # Plots. @@ -301,6 +316,7 @@ def parallel_bind(sesss, imgs, N=1): imgs = [numpy.random.rand(*input_shape).astype(numpy.float32) for i in range(n_threads)] + print("ORT // CPU + GPU") data = [] for N in tqdm.tqdm(range(1, maxN, 2)): begin = time.perf_counter() @@ -317,10 +333,13 @@ def parallel_bind(sesss, imgs, N=1): data.append(obs) + del sesss[:] + gc.collect() df = pandas.DataFrame(data) df.to_csv("ort_cpu_gpu.csv", index=False) else: - df = None + print("No GPU is available but it should show something like the following.") + df = pandas.read_csv("data/ort_cpu_gpu.csv") df @@ -334,6 +353,11 @@ def parallel_bind(sesss, imgs, N=1): ax +#################################### +# The parallelization on mulitple CPU + GPUs is not really +# working with this simple setting. GPU should take more +# images as it is faster. + ######################################### # Parallelization on multiple GPUs # ++++++++++++++++++++++++++++++++ @@ -344,18 +368,22 @@ def parallel_bind(sesss, imgs, N=1): elif n_gpus == 1: print("1 GPU was detected.") else: - print(f"{n_gpus} were detected.") + print(f"{n_gpus} GPUs were detected.") if n_gpus > 1: n_threads = 2 - sesss = [InferenceSession(model_name, providers=["CUDAExecutionProvider", - "CPUExecutionProvider"], - provider_options={"device_id": i}) - for i in range(n_gpus)] + sesss = [] + for i in range(n_gpus): + print(f"Initialize device {i}") + sesss.append( + InferenceSession(model_name, providers=["CUDAExecutionProvider", + "CPUExecutionProvider"], + provider_options=[{"device_id": i}, {}])) imgs = [numpy.random.rand(*input_shape).astype(numpy.float32) for i in range(n_threads)] + print("ORT // GPUs") data = [] for N in tqdm.tqdm(range(1, maxN, 2)): begin = time.perf_counter() @@ -372,10 +400,13 @@ def parallel_bind(sesss, imgs, N=1): data.append(obs) + del sesss[:] + gc.collect() df = pandas.DataFrame(data) df.to_csv("ort_gpus.csv", index=False) else: - df = None + print("No GPU is available but it should show something like the following.") + df = pandas.read_csv("data/ort_gpus.csv") df @@ -390,6 +421,8 @@ def parallel_bind(sesss, imgs, N=1): ax +#################################### +# The parallelization on mulitple GPUs did work. # import matplotlib.pyplot as plt # plt.show() From 7b44628015f211a0d689560947dcd7483891c169 Mon Sep 17 00:00:00 2001 From: Xavier Dupre Date: Sun, 13 Nov 2022 08:54:47 +0000 Subject: [PATCH 12/16] update --- _doc/examples/data/ort_cpu_gpu.csv | 20 ++++++++++---------- _doc/examples/data/ort_gpus.csv | 20 ++++++++++---------- _doc/examples/plot_parallel_execution.py | 8 ++++---- 3 files changed, 24 insertions(+), 24 deletions(-) diff --git a/_doc/examples/data/ort_cpu_gpu.csv b/_doc/examples/data/ort_cpu_gpu.csv index 826d8139..97510823 100644 --- a/_doc/examples/data/ort_cpu_gpu.csv +++ b/_doc/examples/data/ort_cpu_gpu.csv @@ -1,11 +1,11 @@ N,n_imgs_seq,time_seq,n_imgs_par,time_par -1,2,0.5718123729893705,2,0.007878101489041 -3,6,0.016015594999771565,6,0.015513360500335693 -5,10,0.024301433499203995,10,0.023472609493182972 -7,14,0.03465705699636601,14,0.03860543250630144 -9,18,0.04403566100518219,18,0.04680142400320619 -11,22,0.06553528150834609,22,0.05793282100057695 -13,26,0.08153052550915163,26,0.05927092849742621 -15,30,0.0768452735064784,30,0.08342061600706074 -17,34,0.09201569500146434,34,0.08902498900715727 -19,38,0.11011747350858059,38,0.09492045298975427 +1,2,0.5849514909932623,2,0.02292487349768635 +3,6,0.01547511100943666,6,0.01833898900076747 +5,10,0.036359535486553796,10,0.0324736335023772 +7,14,0.04304601749754511,14,0.039623103497433476 +9,18,0.06215578749834094,18,0.04699261850328185 +11,22,0.0860430364991771,22,0.05861812600051053 +13,26,0.0699024919886142,26,0.06959964999987278 +15,30,0.09830122849962208,30,0.08662367198849097 +17,34,0.11123420301009901,34,0.10407247900729999 +19,38,0.10265450000588316,38,0.09627137100324035 diff --git a/_doc/examples/data/ort_gpus.csv b/_doc/examples/data/ort_gpus.csv index 30f083c4..6d00bee4 100644 --- a/_doc/examples/data/ort_gpus.csv +++ b/_doc/examples/data/ort_gpus.csv @@ -1,11 +1,11 @@ N,n_imgs_seq,time_seq,n_imgs_par,time_par -1,2,0.7859347729972797,2,0.0027880600100615993 -3,6,0.01325492349860724,6,0.005836712007294409 -5,10,0.02211888850433752,10,0.008998960503959097 -7,14,0.027732157497666776,14,0.012500747499871068 -9,18,0.03999921299691778,18,0.015629247500328347 -11,22,0.048589186000754125,22,0.019298979008453898 -13,26,0.05781353948987089,26,0.022477426507975906 -15,30,0.0516846864920808,30,0.02620260650292039 -17,34,0.076614134493866,34,0.02977054200891871 -19,38,0.08429743749729823,38,0.03443809199961834 +1,2,0.7691331224923488,2,0.0028955735033378005 +3,6,0.010057882987894118,6,0.005550952511839569 +5,10,0.017624731990508735,10,0.009006410502479412 +7,14,0.030596987504395656,14,0.012265623998246156 +9,18,0.0391014114866266,18,0.015631284506525844 +11,22,0.047721832495881245,22,0.0192174395051552 +13,26,0.04471722950984258,26,0.02277063950896263 +15,30,0.06558763500652276,30,0.025715417010360397 +17,34,0.07659106400387827,34,0.029424325504805893 +19,38,0.07083825548761524,38,0.03291438950691372 diff --git a/_doc/examples/plot_parallel_execution.py b/_doc/examples/plot_parallel_execution.py index f0bc2ef0..e410bd4a 100644 --- a/_doc/examples/plot_parallel_execution.py +++ b/_doc/examples/plot_parallel_execution.py @@ -187,7 +187,7 @@ def parallel(sesss, imgs, N=1): data.append(obs) df = pandas.DataFrame(data) -df.to_csv("ort_cpu.csv", index=False) +df.reset_index(drop=False).to_csv("ort_cpu.csv", index=False) df ##################################### @@ -285,7 +285,7 @@ def parallel_bind(sesss, imgs, N=1): del sesss[:] gc.collect() df = pandas.DataFrame(data) -df.to_csv("ort_cpu_bind.csv", index=False) +df.reset_index(drop=False).to_csv("ort_cpu_bind.csv", index=False) df @@ -336,7 +336,7 @@ def parallel_bind(sesss, imgs, N=1): del sesss[:] gc.collect() df = pandas.DataFrame(data) - df.to_csv("ort_cpu_gpu.csv", index=False) + df.reset_index(drop=False).to_csv("ort_cpu_gpu.csv", index=False) else: print("No GPU is available but it should show something like the following.") df = pandas.read_csv("data/ort_cpu_gpu.csv") @@ -403,7 +403,7 @@ def parallel_bind(sesss, imgs, N=1): del sesss[:] gc.collect() df = pandas.DataFrame(data) - df.to_csv("ort_gpus.csv", index=False) + df.reset_index(drop=False).to_csv("ort_gpus.csv", index=False) else: print("No GPU is available but it should show something like the following.") df = pandas.read_csv("data/ort_gpus.csv") From 307e62b3d1826daf2d998dd4b05c5358418af204 Mon Sep 17 00:00:00 2001 From: xadupre Date: Sun, 13 Nov 2022 09:56:18 +0100 Subject: [PATCH 13/16] Update plot_parallel_execution.py --- _doc/examples/plot_parallel_execution.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/_doc/examples/plot_parallel_execution.py b/_doc/examples/plot_parallel_execution.py index e410bd4a..80af19b9 100644 --- a/_doc/examples/plot_parallel_execution.py +++ b/_doc/examples/plot_parallel_execution.py @@ -339,7 +339,7 @@ def parallel_bind(sesss, imgs, N=1): df.reset_index(drop=False).to_csv("ort_cpu_gpu.csv", index=False) else: print("No GPU is available but it should show something like the following.") - df = pandas.read_csv("data/ort_cpu_gpu.csv") + df = pandas.read_csv("data/ort_cpu_gpu.csv").set_index("N") df @@ -406,7 +406,7 @@ def parallel_bind(sesss, imgs, N=1): df.reset_index(drop=False).to_csv("ort_gpus.csv", index=False) else: print("No GPU is available but it should show something like the following.") - df = pandas.read_csv("data/ort_gpus.csv") + df = pandas.read_csv("data/ort_gpus.csv").set_index("N") df From 431719468f0d224391873a3903248c1deb6c2a7e Mon Sep 17 00:00:00 2001 From: xadupre Date: Sun, 13 Nov 2022 10:02:47 +0100 Subject: [PATCH 14/16] Update plot_parallel_execution.py --- _doc/examples/plot_parallel_execution.py | 55 ++++++++++-------------- 1 file changed, 22 insertions(+), 33 deletions(-) diff --git a/_doc/examples/plot_parallel_execution.py b/_doc/examples/plot_parallel_execution.py index 80af19b9..e05c6e28 100644 --- a/_doc/examples/plot_parallel_execution.py +++ b/_doc/examples/plot_parallel_execution.py @@ -190,12 +190,6 @@ def parallel(sesss, imgs, N=1): df.reset_index(drop=False).to_csv("ort_cpu.csv", index=False) df -##################################### -# Let's free the memory. - -del sesss[:] -gc.collect() - ########################################## # Plots # +++++ @@ -206,7 +200,7 @@ def make_plot(df, title): df["time_par_img"] = df["time_par"] / df["n_imgs_par"] ax = df[["n_imgs_seq", "time_seq_img", "time_par_img"]].set_index("n_imgs_seq").plot( - title=title) + title=title, logy=True) ax.set_xlabel("batch size") ax.set_ylabel("s") return ax @@ -282,12 +276,15 @@ def parallel_bind(sesss, imgs, N=1): data.append(obs) -del sesss[:] -gc.collect() df = pandas.DataFrame(data) df.reset_index(drop=False).to_csv("ort_cpu_bind.csv", index=False) df +##################################### +# Let's free the memory. + +del sesss[:] +gc.collect() ############################ # Plots. @@ -303,12 +300,21 @@ def parallel_bind(sesss, imgs, N=1): has_cuda = "CUDAExecutionProvider" in get_all_providers() if not has_cuda: print(f"No CUDA provider was detected in {get_all_providers()}.") + +n_gpus = torch.cuda.device_count() if has_cuda else 0 +if n_gpus == 0: + print("No GPU or one GPU was detected.") +elif n_gpus == 1: + print("1 GPU was detected.") +else: + print(f"{n_gpus} GPUs were detected.") + ######################################### # Parallelization GPU + CPU # +++++++++++++++++++++++++ -if has_cuda: +if has_cuda and n_gpus > 0: n_threads = 2 sesss = [InferenceSession(model_name, providers=["CPUExecutionProvider"]), InferenceSession(model_name, providers=["CUDAExecutionProvider", @@ -338,7 +344,7 @@ def parallel_bind(sesss, imgs, N=1): df = pandas.DataFrame(data) df.reset_index(drop=False).to_csv("ort_cpu_gpu.csv", index=False) else: - print("No GPU is available but it should show something like the following.") + print("No GPU is available but data should be like the following.") df = pandas.read_csv("data/ort_cpu_gpu.csv").set_index("N") df @@ -346,11 +352,7 @@ def parallel_bind(sesss, imgs, N=1): #################################### # Plots. -if has_cuda: - ax = make_plot(df, "Time per image / batch size\nCPU + GPU") -else: - ax = None - +ax = make_plot(df, "Time per image / batch size\nCPU + GPU") ax #################################### @@ -362,15 +364,6 @@ def parallel_bind(sesss, imgs, N=1): # Parallelization on multiple GPUs # ++++++++++++++++++++++++++++++++ -n_gpus = torch.cuda.device_count() if has_cuda else 0 -if n_gpus == 0: - print("No GPU or one GPU was detected.") -elif n_gpus == 1: - print("1 GPU was detected.") -else: - print(f"{n_gpus} GPUs were detected.") - - if n_gpus > 1: n_threads = 2 sesss = [] @@ -405,7 +398,7 @@ def parallel_bind(sesss, imgs, N=1): df = pandas.DataFrame(data) df.reset_index(drop=False).to_csv("ort_gpus.csv", index=False) else: - print("No GPU is available but it should show something like the following.") + print("No GPU is available but data should be like the following.") df = pandas.read_csv("data/ort_gpus.csv").set_index("N") df @@ -414,15 +407,11 @@ def parallel_bind(sesss, imgs, N=1): #################################### # Plots. -if n_gpus > 1: - ax = make_plot(df, "Time per image / batch size\nCPU + GPU") -else: - ax = None - +ax = make_plot(df, f"Time per image / batch size\n{n_gpus} GPUs") ax #################################### # The parallelization on mulitple GPUs did work. -# import matplotlib.pyplot as plt -# plt.show() +import matplotlib.pyplot as plt +plt.show() From bbb3aea45d13c89186cd346db4b60461a8f20d4b Mon Sep 17 00:00:00 2001 From: Xavier Dupre Date: Mon, 14 Nov 2022 08:06:42 +0000 Subject: [PATCH 15/16] update --- _doc/examples/data/ort_cpu_gpu.csv | 22 +++---- _doc/examples/plot_parallel_execution.py | 78 ++++++++++++++++-------- 2 files changed, 63 insertions(+), 37 deletions(-) diff --git a/_doc/examples/data/ort_cpu_gpu.csv b/_doc/examples/data/ort_cpu_gpu.csv index 97510823..562954e5 100644 --- a/_doc/examples/data/ort_cpu_gpu.csv +++ b/_doc/examples/data/ort_cpu_gpu.csv @@ -1,11 +1,11 @@ -N,n_imgs_seq,time_seq,n_imgs_par,time_par -1,2,0.5849514909932623,2,0.02292487349768635 -3,6,0.01547511100943666,6,0.01833898900076747 -5,10,0.036359535486553796,10,0.0324736335023772 -7,14,0.04304601749754511,14,0.039623103497433476 -9,18,0.06215578749834094,18,0.04699261850328185 -11,22,0.0860430364991771,22,0.05861812600051053 -13,26,0.0699024919886142,26,0.06959964999987278 -15,30,0.09830122849962208,30,0.08662367198849097 -17,34,0.11123420301009901,34,0.10407247900729999 -19,38,0.10265450000588316,38,0.09627137100324035 +index,N,n_imgs_seq_cpu,time_seq_cpu,n_imgs_seq_gpu,time_seq_gpu,n_imgs_par,time_par +0,1,2,0.00826602200686466,2,0.5539164490037365,2,0.008887254502042197 +1,3,6,0.019666356995003298,6,0.010879299501539208,6,0.02050348349439446 +2,5,10,0.03761136099637952,10,0.024563621496781707,10,0.025345650006784126 +3,7,14,0.05642429149884265,14,0.03381696599535644,14,0.03412535750248935 +4,9,18,0.061089862501830794,18,0.032227409988990985,18,0.051062139493296854 +5,11,22,0.08963397399929818,22,0.03988744800153654,22,0.049899421486770734 +6,13,26,0.08532479300629348,26,0.059844546995009296,26,0.06177879350434523 +7,15,30,0.099295007501496,30,0.060137088992632926,30,0.07637454000359867 +8,17,34,0.11972474250069354,34,0.08182918949751183,34,0.0649502059968654 +9,19,38,0.1271352384937927,38,0.06829091400140896,38,0.07087059249170125 diff --git a/_doc/examples/plot_parallel_execution.py b/_doc/examples/plot_parallel_execution.py index e05c6e28..190a2d59 100644 --- a/_doc/examples/plot_parallel_execution.py +++ b/_doc/examples/plot_parallel_execution.py @@ -14,7 +14,7 @@ For a big model such as a deeplearning model, this might be interesting. However, :epkg:`onnxruntime` already parallelize the computation of every operator (Gemm, MatMul) using all the CPU it can get so this approach -should show significant result when used on different processors (CPU, GPU) +should show significant results when used on different processors (CPU, GPU) in parallel. .. contents:: @@ -101,8 +101,7 @@ def download_file(url, name, min_size): # Parallelization # +++++++++++++++ # -# We define the number of threads as the number of cores divided by 2. -# This is a dummy value. It should let a core to handle the main program. +# We define a number of threads lower than the number of cores. n_threads = min(8, multiprocessing.cpu_count() - 1) print(f"n_threads={n_threads}") @@ -118,15 +117,15 @@ def download_file(url, name, min_size): # Let's measure the time for a sequence of images. -def sequence(sesss, imgs, N=1): +def sequence(sess, imgs, N=1): res = [] - for sess, img in zip(sesss, imgs): + for img in imgs: for i in range(N): res.append(sess.run(None, {input_name: img})[0]) return res -print(measure_time(lambda: sequence(sesss, imgs), +print(measure_time(lambda: sequence(sesss[0], imgs), div_by_number=True, repeat=2, number=2)) ################################# @@ -174,7 +173,7 @@ def parallel(sesss, imgs, N=1): for N in tqdm.tqdm(range(1, maxN, 2)): begin = time.perf_counter() for i in range(rep): - res1 = sequence(sesss, imgs, N) + res1 = sequence(sesss[0], imgs, N) end = (time.perf_counter() - begin) / rep obs = dict(N=N, n_imgs_seq=len(res1), time_seq=end) @@ -196,11 +195,19 @@ def parallel(sesss, imgs, N=1): def make_plot(df, title): - df["time_seq_img"] = df["time_seq"] / df["n_imgs_seq"] - df["time_par_img"] = df["time_par"] / df["n_imgs_par"] - ax = df[["n_imgs_seq", "time_seq_img", "time_par_img"]].set_index("n_imgs_seq").plot( - title=title, logy=True) + kwargs = dict(title=title, logy=True) + if "time_seq" in df.columns: + df["time_seq_img"] = df["time_seq"] / df["n_imgs_seq"] + df["time_par_img"] = df["time_par"] / df["n_imgs_par"] + columns = ["n_imgs_seq", "time_seq_img", "time_par_img"] + else: + df["time_seq_img_cpu"] = df["time_seq_cpu"] / df["n_imgs_seq_cpu"] + df["time_seq_img_gpu"] = df["time_seq_gpu"] / df["n_imgs_seq_gpu"] + df["time_par_img"] = df["time_par"] / df["n_imgs_par"] + columns = ["n_imgs_seq_cpu", "time_seq_img_cpu", "time_seq_img_gpu", "time_par_img"] + + ax = df[columns].set_index(columns[0]).plot(**kwargs) ax.set_xlabel("batch size") ax.set_ylabel("s") return ax @@ -209,9 +216,10 @@ def make_plot(df, title): make_plot(df, "Time per image / batch size") ####################################### -# As expected, it does not really improve. It is like parallezing using -# both strategies, per kernel and per image, both trying to access all -# the process cores. +# As expected, it does not improve. It is like parallezing using +# two strategies, per kernel and per image, both trying to access all +# the process cores at the same time. The time spent to synchronize +# is significant. ################################################### # Same with another API based on OrtValue @@ -264,7 +272,7 @@ def parallel_bind(sesss, imgs, N=1): for N in tqdm.tqdm(range(1, maxN, 2)): begin = time.perf_counter() for i in range(rep): - res1 = sequence(sesss, imgs, N) + res1 = sequence(sesss[0], imgs, N) end = (time.perf_counter() - begin) / rep obs = dict(N=N, n_imgs_seq=len(res1), time_seq=end) @@ -291,11 +299,16 @@ def parallel_bind(sesss, imgs, N=1): make_plot(df, "Time per image / batch size\nrun_with_iobinding") +######################################## +# It leads to the same conclusion. It is no use to parallelize +# on CPU as onnxruntime is already doing that per kernel. + + ######################################## # GPU # === # -# Let's check first it is possible. +# Let's check first if it is possible. has_cuda = "CUDAExecutionProvider" in get_all_providers() if not has_cuda: @@ -327,9 +340,15 @@ def parallel_bind(sesss, imgs, N=1): for N in tqdm.tqdm(range(1, maxN, 2)): begin = time.perf_counter() for i in range(rep): - res1 = sequence(sesss, imgs, N) + res1 = sequence(sesss[0], imgs, N) + end = (time.perf_counter() - begin) / rep + obs = dict(N=N, n_imgs_seq_cpu=len(res1), time_seq_cpu=end) + + begin = time.perf_counter() + for i in range(rep): + res2 = sequence(sesss[1], imgs, N) end = (time.perf_counter() - begin) / rep - obs = dict(N=N, n_imgs_seq=len(res1), time_seq=end) + obs.update(dict(n_imgs_seq_gpu=len(res2), time_seq_gpu=end)) begin = time.perf_counter() for i in range(rep): @@ -356,13 +375,14 @@ def parallel_bind(sesss, imgs, N=1): ax #################################### -# The parallelization on mulitple CPU + GPUs is not really -# working with this simple setting. GPU should take more -# images as it is faster. +# The parallelization on mulitple CPU + GPUs is working, it is faster than CPU +# but it is still slower than using a single GPU in that case. ######################################### # Parallelization on multiple GPUs # ++++++++++++++++++++++++++++++++ +# +# This is the only case for which it should work as every GPU is indenpendent. if n_gpus > 1: n_threads = 2 @@ -381,9 +401,15 @@ def parallel_bind(sesss, imgs, N=1): for N in tqdm.tqdm(range(1, maxN, 2)): begin = time.perf_counter() for i in range(rep): - res1 = sequence(sesss, imgs, N) + res1 = sequence(sess1, imgs, N) + end = (time.perf_counter() - begin) / rep + obs = dict(N=N, n_imgs_seq_cpu=len(res1), time_seq_cpu=end) + + begin = time.perf_counter() + for i in range(rep): + res2 = sequence(sesss[0], imgs, N) end = (time.perf_counter() - begin) / rep - obs = dict(N=N, n_imgs_seq=len(res1), time_seq=end) + obs.update(dict(n_imgs_seq_gpu=len(res2), time_seq_gpu=end)) begin = time.perf_counter() for i in range(rep): @@ -411,7 +437,7 @@ def parallel_bind(sesss, imgs, N=1): ax #################################### -# The parallelization on mulitple GPUs did work. +# The parallelization on multiple GPUs did work. -import matplotlib.pyplot as plt -plt.show() +# import matplotlib.pyplot as plt +# plt.show() From 788dc57f6530ea236be74915282ad0f3a3a89ed0 Mon Sep 17 00:00:00 2001 From: xadupre Date: Mon, 14 Nov 2022 09:33:09 +0100 Subject: [PATCH 16/16] update --- .gitignore | 3 +++ _doc/examples/data/ort_gpus.csv | 22 +++++++++++----------- _doc/examples/plot_parallel_execution.py | 10 +++++++--- 3 files changed, 21 insertions(+), 14 deletions(-) diff --git a/.gitignore b/.gitignore index 1cc45db6..367ddcc6 100644 --- a/.gitignore +++ b/.gitignore @@ -61,3 +61,6 @@ _unittests/ut_documentation/summary.csv _unittests/ut_documentation/_test_example.txt _unittests/ut_documentation/_test_example.txt something +_doc/examples/ort_cpu_bind.csv +_doc/examples/ort_cpu_gpu.csv +_doc/examples/ort_cpu.csv diff --git a/_doc/examples/data/ort_gpus.csv b/_doc/examples/data/ort_gpus.csv index 6d00bee4..1447d1e4 100644 --- a/_doc/examples/data/ort_gpus.csv +++ b/_doc/examples/data/ort_gpus.csv @@ -1,11 +1,11 @@ -N,n_imgs_seq,time_seq,n_imgs_par,time_par -1,2,0.7691331224923488,2,0.0028955735033378005 -3,6,0.010057882987894118,6,0.005550952511839569 -5,10,0.017624731990508735,10,0.009006410502479412 -7,14,0.030596987504395656,14,0.012265623998246156 -9,18,0.0391014114866266,18,0.015631284506525844 -11,22,0.047721832495881245,22,0.0192174395051552 -13,26,0.04471722950984258,26,0.02277063950896263 -15,30,0.06558763500652276,30,0.025715417010360397 -17,34,0.07659106400387827,34,0.029424325504805893 -19,38,0.07083825548761524,38,0.03291438950691372 +index,N,n_imgs_seq_cpu,time_seq_cpu,n_imgs_seq_gpu,time_seq_gpu,n_imgs_par,time_par +0,1,2,0.009441936999792233,2,0.27651189200696535,2,0.4723829315043986 +1,3,6,0.02211003350384999,6,0.010779099495266564,6,0.006075980491004884 +2,5,10,0.03130707699165214,10,0.018668279502890073,10,0.009614631999284029 +3,7,14,0.050966479000635445,14,0.02530089499487076,14,0.013326078507816419 +4,9,18,0.051026727494900115,18,0.03217840299475938,18,0.016715533987735398 +5,11,22,0.06609680749534164,22,0.03952224800013937,22,0.02055577700957656 +6,13,26,0.07877145400561858,26,0.046802844997728243,26,0.023463245990569703 +7,15,30,0.12377040000865236,30,0.05456937898998149,30,0.027040796499932185 +8,17,34,0.09810231548908632,34,0.06142483800067566,34,0.030876389500917867 +9,19,38,0.12459791499713901,38,0.07050566599355079,38,0.034905767504824325 diff --git a/_doc/examples/plot_parallel_execution.py b/_doc/examples/plot_parallel_execution.py index 190a2d59..3a544e83 100644 --- a/_doc/examples/plot_parallel_execution.py +++ b/_doc/examples/plot_parallel_execution.py @@ -57,7 +57,10 @@ def download_file(url, name, min_size): small = True -if small: +if small == "custom": + model_name = "custom.onnx" + url_name = None +elif small: model_name = "mobilenetv2-10.onnx" url_name = ("https://github.com/onnx/models/raw/main/vision/" "classification/mobilenet/model") @@ -205,7 +208,8 @@ def make_plot(df, title): df["time_seq_img_cpu"] = df["time_seq_cpu"] / df["n_imgs_seq_cpu"] df["time_seq_img_gpu"] = df["time_seq_gpu"] / df["n_imgs_seq_gpu"] df["time_par_img"] = df["time_par"] / df["n_imgs_par"] - columns = ["n_imgs_seq_cpu", "time_seq_img_cpu", "time_seq_img_gpu", "time_par_img"] + columns = ["n_imgs_seq_cpu", "time_seq_img_cpu", + "time_seq_img_gpu", "time_par_img"] ax = df[columns].set_index(columns[0]).plot(**kwargs) ax.set_xlabel("batch size") @@ -313,7 +317,7 @@ def parallel_bind(sesss, imgs, N=1): has_cuda = "CUDAExecutionProvider" in get_all_providers() if not has_cuda: print(f"No CUDA provider was detected in {get_all_providers()}.") - + n_gpus = torch.cuda.device_count() if has_cuda else 0 if n_gpus == 0: print("No GPU or one GPU was detected.")