sdpython · sdpython · Jan 19, 2021 · Jan 19, 2021 · Jan 19, 2021
diff --git a/.gitignore b/.gitignore
@@ -13,3 +13,4 @@ td3a_cpp/tutorial/dot_cython.cpp
 td3a_cpp/tutorial/dot_blas_lapack.cpp
 td3a_cpp/tutorial/experiment_cython.cpp
 td3a_cpp/tutorial/dot_cython_omp.cpp
+td3a_cpp/tutorial/mul_cython_omp.cpp
diff --git a/bin/build.bat b/bin/build.bat
@@ -5,7 +5,7 @@ cd %root%
 @echo ##################
 @echo Compile
 @echo ##################
-set pythonexe="c:\Python391_x64\python.exe"
+set pythonexe="c:\Python387_x64\python.exe"
 if not exist %pythonexe% set pythonexe="c:\Python370_x64\python.exe"
 
 @echo running %root%\setup.py build_ext --inplace

diff --git a/bin/doc.bat b/bin/doc.bat
@@ -2,7 +2,7 @@
 set current=%~dp0
 set root=%current%..
 cd %root%
-set pythonexe="c:\Python391_x64\python.exe"
+set pythonexe="c:\Python387_x64\python.exe"
 if not exist %pythonexe% set pythonexe="c:\Python370_x64\python.exe"
 
 @echo running 'python -m sphinx -T -b html doc dist/html'

diff --git a/bin/flake8.bat b/bin/flake8.bat
@@ -2,7 +2,7 @@
 set current=%~dp0
 set root=%current%..
 cd %root%
-set pythonexe="c:\Python391_x64\python.exe"
+set pythonexe="c:\Python387_x64\python.exe"
 if not exist %pythonexe% set pythonexe="c:\Python370_x64\python.exe"
 
 @echo running 'python -m flake8 td3a_cpp tests examples'

diff --git a/bin/unittest.bat b/bin/unittest.bat
@@ -2,7 +2,7 @@
 set current=%~dp0
 set root=%current%..
 cd %root%
-set pythonexe="c:\Python391_x64\python.exe"
+set pythonexe="c:\Python387_x64\python.exe"
 if not exist %pythonexe% set pythonexe="c:\Python370_x64\python.exe"
 
 @echo running 'python -m unittest discover tests'

diff --git a/doc/api.rst b/doc/api.rst
@@ -60,7 +60,6 @@ dot
 
 .. autofunction:: td3a_cpp.tutorial.dot_cython_omp.ddot_array_openmp_16
 
-
 filter
 ^^^^^^
 
@@ -80,3 +79,7 @@ filter
 
 .. autofunction:: td3a_cpp.tutorial.experiment_cython.cfilter_dmax16
 
+matrix multiplication
+^^^^^^^^^^^^^^^^^^^^^
+
+.. autofunction:: td3a_cpp.tutorial.mul_cython_omp.dmul_cython_omp
diff --git a/examples/plot_benchmark_dot_mul.py b/examples/plot_benchmark_dot_mul.py
@@ -0,0 +1,152 @@
+"""
+
+.. _l-example-mul:
+
+Compares mul implementations
+============================
+
+:epkg:`numpy` has a very fast implementation of
+matrix multiplication. There are many ways to be slower.
+
+.. contents::
+    :local:
+"""
+
+import pprint
+import numpy
+from numpy.testing import assert_almost_equal
+import matplotlib.pyplot as plt
+from pandas import DataFrame, concat
+from td3a_cpp.tutorial.mul_cython_omp import dmul_cython_omp
+from td3a_cpp.tools import measure_time_dim
+
+dfs = []
+sets = list(range(2, 145, 20))
+
+##############################
+# numpy mul
+# +++++++++
+#
+
+ctxs = [
+    {'va': numpy.random.randn(n, n).astype(numpy.float64),
+     'vb': numpy.random.randn(n, n).astype(numpy.float64),
+     'mul': lambda x, y: x @ y, 'x_name': n}
+    for n in sets]
+
+# Time numpy on every size, then store the labelled measures.
+res = list(measure_time_dim('mul(va, vb)', ctxs, verbose=1))
+dfs.append(DataFrame(res).assign(fct='numpy'))
+pprint.pprint(dfs[-1].tail(n=2))
+
+
+##############################
+# Simple multiplication
+# +++++++++++++++++++++
+#
+
+ctxs = [dict(va=numpy.random.randn(n, n).astype(numpy.float64),
+             vb=numpy.random.randn(n, n).astype(numpy.float64),
+             mul=dmul_cython_omp, x_name=n)
+        for n in sets]
+
+res = list(measure_time_dim('mul(va, vb)', ctxs, verbose=1))
+# Show this run's timings; dfs[-1] still holds the numpy results.
+pprint.pprint(DataFrame(res).tail(n=2))
+
+
+##############################
+# Other scenarios
+# +++++++++++++++
+#
+# 3 different algorithms, each of them parallelized.
+# See :func:`dmul_cython_omp
+# <td3a_cpp.tutorial.mul_cython_omp.dmul_cython_omp>`.
+
+for algo in range(0, 2):
+    for parallel in (0, 1):
+        print("algo=%d parallel=%d" % (algo, parallel))
+        # a=algo, p=parallel freeze the loop values inside each lambda
+        # instead of relying on late-bound closure lookups.
+        ctxs = [dict(va=numpy.random.randn(n, n).astype(numpy.float64),
+                     vb=numpy.random.randn(n, n).astype(numpy.float64),
+                     mul=lambda x, y, a=algo, p=parallel: dmul_cython_omp(
+                        x, y, algo=a, parallel=p), x_name=n)
+                for n in sets]
+        res = list(measure_time_dim('mul(va, vb)', ctxs, verbose=1))
+        dfs.append(DataFrame(res))
+        dfs[-1]['fct'] = 'a=%d-p=%d' % (algo, parallel)
+        pprint.pprint(dfs[-1].tail(n=2))
+
+########################################
+# One issue left
+# ++++++++++++++
+#
+# Will you find it in :func:`dmul_cython_omp
+# <td3a_cpp.tutorial.mul_cython_omp.dmul_cython_omp>`?
+
+
+va, vb = (numpy.random.randn(*shape).astype(numpy.float64)
+          for shape in [(3, 4), (4, 5)])
+numpy_mul = numpy.matmul(va, vb)
+try:
+    for a in range(50):
+        wrong_mul = dmul_cython_omp(va, vb, algo=2, parallel=1)
+        assert_almost_equal(numpy_mul, wrong_mul)
+        print("Iteration %d is Ok" % a)
+    print("All iterations are unexpectedly Ok. Don't push your luck.")
+except AssertionError as e:
+    # Print the mismatch report instead of stopping the script.
+    print(e)
+
+
+##############################
+# Other scenarios but transposed
+# ++++++++++++++++++++++++++++++
+#
+# Same algorithms but the second matrix
+# is transposed first: ``b_trans=1``.
+
+
+for algo in range(0, 2):
+    for parallel in (0, 1):
+        print("algo=%d parallel=%d transposed" % (algo, parallel))
+        # a=algo, p=parallel freeze the loop values inside each lambda
+        # instead of relying on late-bound closure lookups.
+        ctxs = [dict(va=numpy.random.randn(n, n).astype(numpy.float64),
+                     vb=numpy.random.randn(n, n).astype(numpy.float64),
+                     mul=lambda x, y, a=algo, p=parallel: dmul_cython_omp(
+                        x, y, algo=a, parallel=p, b_trans=1), x_name=n)
+                for n in sets]
+        res = list(measure_time_dim('mul(va, vb)', ctxs, verbose=2))
+        dfs.append(DataFrame(res))
+        dfs[-1]['fct'] = 'a=%d-p=%d-T' % (algo, parallel)
+        pprint.pprint(dfs[-1].tail(n=2))
+
+
+#############################
+# Let's display the results
+# +++++++++++++++++++++++++
+
+cc = concat(dfs)
+cc['N'] = cc['x_name']
+
+fig, ax = plt.subplots(3, 2, figsize=(10, 8))
+# DataFrame.pivot only takes keyword arguments since pandas 2.0,
+# keywords also work with the older versions.
+piv = dict(index='N', columns='fct', values='average')
+is_t = cc.fct.str.contains('-T')
+is_np = cc.fct == 'numpy'
+# One (row selection, axis) pair per graph; ax[2, 0] stays empty.
+for rows, axis in [(~is_t, ax[0, 0]), (~is_t & ~is_np, ax[0, 1]),
+                   (is_t | is_np, ax[1, 0]), (is_t & ~is_np, ax[1, 1]),
+                   (cc.fct.str.contains('a=0'), ax[2, 1])]:
+    cc[rows].pivot(**piv).plot(logy=True, logx=True, ax=axis)
+fig.suptitle("Comparison of multiplication implementations")
+
+#################################
+# The results depend on the machine, its
+# number of cores, the compilation settings
+# of :epkg:`numpy` or this module.
+
+plt.show()
diff --git a/setup.py b/setup.py
@@ -58,7 +58,8 @@ def get_extension_tutorial(name):
     pattern1 = "td3a_cpp.tutorial.%s"
     srcs = ['td3a_cpp/tutorial/%s.pyx' % name]
     args = get_defined_args()
-    if name in ['dot_cython', 'experiment_cython', 'dot_cython_omp']:
+    if name in ['dot_cython', 'experiment_cython', 'dot_cython_omp',
+                'mul_cython_omp']:
         srcs.extend(['td3a_cpp/tutorial/%s_.cpp' % name])
         args['language'] = "c++"
 
@@ -115,7 +116,8 @@ def get_extension_tutorial(name):
 
 ext_modules = []
 for ext in ['dot_blas_lapack', 'dot_cython',
-            'experiment_cython', 'dot_cython_omp']:
+            'experiment_cython', 'dot_cython_omp',
+            'mul_cython_omp']:
     ext_modules.extend(get_extension_tutorial(ext))
 
 

diff --git a/td3a_cpp/tutorial/dot_cython_omp.pyx b/td3a_cpp/tutorial/dot_cython_omp.pyx
@@ -23,6 +23,9 @@ cdef double _ddot_cython_array_omp(const double[::1] va, const double[::1] vb,
 
     :param va: first vector, dtype must be float64
     :param vb: second vector, dtype must be float64
+    :param chunksize: see :epkg:`prange`
+    :param schedule: 0 no parallelization, 1 for `'static'`,
+        2 for `'dynamic'`
     :return: dot product
     """
     cdef int n = va.shape[0]
@@ -50,7 +53,8 @@ def ddot_cython_array_omp(const double[::1] va, const double[::1] vb,
     :param va: first vector, dtype must be float64
     :param vb: second vector, dtype must be float64
     :param chunksize: see :epkg:`prange`
-    :param schedule: see :epkg:`prange`
+    :param schedule: 0 simple :epkg:`prange`,
+        1 for `'static'`, 2 for `'dynamic'`
     :return: dot product
     """
     if va.shape[0] != vb.shape[0]: