diff --git a/.gitignore b/.gitignore index 78bb13d..98c85a9 100644 --- a/.gitignore +++ b/.gitignore @@ -13,3 +13,4 @@ td3a_cpp/tutorial/dot_cython.cpp td3a_cpp/tutorial/dot_blas_lapack.cpp td3a_cpp/tutorial/experiment_cython.cpp td3a_cpp/tutorial/dot_cython_omp.cpp +td3a_cpp/tutorial/mul_cython_omp.cpp diff --git a/bin/build.bat b/bin/build.bat index e0ddc18..a872d95 100644 --- a/bin/build.bat +++ b/bin/build.bat @@ -5,7 +5,7 @@ cd %root% @echo ################## @echo Compile @echo ################## -set pythonexe="c:\Python391_x64\python.exe" +set pythonexe="c:\Python387_x64\python.exe" if not exist %pythonexe% set pythonexe="c:\Python370_x64\python.exe" @echo running %root%\setup.py build_ext --inplace diff --git a/bin/doc.bat b/bin/doc.bat index b199429..e7057f7 100644 --- a/bin/doc.bat +++ b/bin/doc.bat @@ -2,7 +2,7 @@ set current=%~dp0 set root=%current%.. cd %root% -set pythonexe="c:\Python391_x64\python.exe" +set pythonexe="c:\Python387_x64\python.exe" if not exist %pythonexe% set pythonexe="c:\Python370_x64\python.exe" @echo running 'python -m sphinx -T -b html doc dist/html' diff --git a/bin/flake8.bat b/bin/flake8.bat index 2184057..ef1d989 100644 --- a/bin/flake8.bat +++ b/bin/flake8.bat @@ -2,7 +2,7 @@ set current=%~dp0 set root=%current%.. cd %root% -set pythonexe="c:\Python391_x64\python.exe" +set pythonexe="c:\Python387_x64\python.exe" if not exist %pythonexe% set pythonexe="c:\Python370_x64\python.exe" @echo running 'python -m flake8 td3a_cpp tests examples' diff --git a/bin/unittest.bat b/bin/unittest.bat index 54576c8..45edd80 100644 --- a/bin/unittest.bat +++ b/bin/unittest.bat @@ -2,7 +2,7 @@ set current=%~dp0 set root=%current%.. cd %root% -set pythonexe="c:\Python391_x64\python.exe" +set pythonexe="c:\Python387_x64\python.exe" if not exist %pythonexe% set pythonexe="c:\Python370_x64\python.exe" @echo running 'python -m unittest discover tests' diff --git a/doc/api.rst b/doc/api.rst index 3d2b2f4..b37adda 100644 --- a/doc/api.rst +++ b/doc/api.rst @@ -60,7 +60,6 @@ dot .. autofunction:: td3a_cpp.tutorial.dot_cython_omp.ddot_array_openmp_16 - filter ^^^^^^ @@ -80,3 +79,7 @@ filter .. autofunction:: td3a_cpp.tutorial.experiment_cython.cfilter_dmax16 +matrix multiplication +^^^^^^^^^^^^^^^^^^^^^ + +.. autofunction:: td3a_cpp.tutorial.mul_cython_omp.dmul_cython_omp diff --git a/examples/plot_benchmark_dot_mul.py b/examples/plot_benchmark_dot_mul.py new file mode 100644 index 0000000..c43e382 --- /dev/null +++ b/examples/plot_benchmark_dot_mul.py @@ -0,0 +1,152 @@ +""" + +.. _l-example-mul: + +Compares mul implementations +============================ + +:epkg:`numpy` has a very fast implementation of +matrix multiplication. There are many ways to be slower. + +.. contents:: + :local: +""" + +import pprint +import numpy +from numpy.testing import assert_almost_equal +import matplotlib.pyplot as plt +from pandas import DataFrame, concat +from td3a_cpp.tutorial.mul_cython_omp import dmul_cython_omp +from td3a_cpp.tools import measure_time_dim + +dfs = [] +sets = list(range(2, 145, 20)) + +############################## +# numpy mul +# +++++++++ +# + +ctxs = [dict(va=numpy.random.randn(n, n).astype(numpy.float64), + vb=numpy.random.randn(n, n).astype(numpy.float64), + mul=lambda x, y: x @ y, + x_name=n) + for n in sets] + +res = list(measure_time_dim('mul(va, vb)', ctxs, verbose=1)) +dfs.append(DataFrame(res)) +dfs[-1]['fct'] = 'numpy' +pprint.pprint(dfs[-1].tail(n=2)) + + +############################## +# Simple multiplication +# +++++++++++++++++++++ +# + +ctxs = [dict(va=numpy.random.randn(n, n).astype(numpy.float64), + vb=numpy.random.randn(n, n).astype(numpy.float64), + mul=dmul_cython_omp, + x_name=n) + for n in sets] + +res = list(measure_time_dim('mul(va, vb)', ctxs, verbose=1)) +pprint.pprint(dfs[-1].tail(n=2)) + + +############################## +# Other scenarios +# +++++++++++++++ +# +# 3 differents algorithms, each of them parallelized. +# See :func:`dmul_cython_omp +# `. + +for algo in range(0, 2): + for parallel in (0, 1): + print("algo=%d parallel=%d" % (algo, parallel)) + ctxs = [dict(va=numpy.random.randn(n, n).astype(numpy.float64), + vb=numpy.random.randn(n, n).astype(numpy.float64), + mul=lambda x, y: dmul_cython_omp( + x, y, algo=algo, parallel=parallel), + x_name=n) + for n in sets] + + res = list(measure_time_dim('mul(va, vb)', ctxs, verbose=1)) + dfs.append(DataFrame(res)) + dfs[-1]['fct'] = 'a=%d-p=%d' % (algo, parallel) + pprint.pprint(dfs[-1].tail(n=2)) + +######################################## +# One left issue +# ++++++++++++++ +# +# Will you find it in :func:`dmul_cython_omp +# `. + + +va = numpy.random.randn(3, 4).astype(numpy.float64) +vb = numpy.random.randn(4, 5).astype(numpy.float64) +numpy_mul = va @ vb + +try: + for a in range(0, 50): + wrong_mul = dmul_cython_omp(va, vb, algo=2, parallel=1) + assert_almost_equal(numpy_mul, wrong_mul) + print("Iteration %d is Ok" % a) + print("All iterations are unexpectedly Ok. Don't push your luck.") +except AssertionError as e: + print(e) + + +############################## +# Other scenarios but transposed +# ++++++++++++++++++++++++++++++ +# +# Same differents algorithms but the second matrix +# is transposed first: ``b_trans=1``. + + +for algo in range(0, 2): + for parallel in (0, 1): + print("algo=%d parallel=%d transposed" % (algo, parallel)) + ctxs = [dict(va=numpy.random.randn(n, n).astype(numpy.float64), + vb=numpy.random.randn(n, n).astype(numpy.float64), + mul=lambda x, y: dmul_cython_omp( + x, y, algo=algo, parallel=parallel, b_trans=1), + x_name=n) + for n in sets] + + res = list(measure_time_dim('mul(va, vb)', ctxs, verbose=2)) + dfs.append(DataFrame(res)) + dfs[-1]['fct'] = 'a=%d-p=%d-T' % (algo, parallel) + pprint.pprint(dfs[-1].tail(n=2)) + + +############################# +# Let's display the results +# +++++++++++++++++++++++++ + +cc = concat(dfs) +cc['N'] = cc['x_name'] + +fig, ax = plt.subplots(3, 2, figsize=(10, 8)) +cc[~cc.fct.str.contains('-T')].pivot('N', 'fct', 'average').plot( + logy=True, logx=True, ax=ax[0, 0]) +cc[~cc.fct.str.contains('-T') & (cc.fct != 'numpy')].pivot( + 'N', 'fct', 'average').plot(logy=True, logx=True, ax=ax[0, 1]) +cc[cc.fct.str.contains('-T') | (cc.fct == 'numpy')].pivot( + 'N', 'fct', 'average').plot(logy=True, logx=True, ax=ax[1, 0]) +cc[cc.fct.str.contains('-T') & (cc.fct != 'numpy')].pivot( + 'N', 'fct', 'average').plot(logy=True, logx=True, ax=ax[1, 1]) +cc[cc.fct.str.contains('a=0')].pivot('N', 'fct', 'average').plot( + logy=True, logx=True, ax=ax[2, 1]) +fig.suptitle("Comparison of multiplication implementations") + +################################# +# The results depends on the machine, its +# number of cores, the compilation settings +# of :epkg:`numpy` or this module. + +plt.show() diff --git a/setup.py b/setup.py index b41dd94..9cd6808 100644 --- a/setup.py +++ b/setup.py @@ -58,7 +58,8 @@ def get_extension_tutorial(name): pattern1 = "td3a_cpp.tutorial.%s" srcs = ['td3a_cpp/tutorial/%s.pyx' % name] args = get_defined_args() - if name in ['dot_cython', 'experiment_cython', 'dot_cython_omp']: + if name in ['dot_cython', 'experiment_cython', 'dot_cython_omp', + 'mul_cython_omp']: srcs.extend(['td3a_cpp/tutorial/%s_.cpp' % name]) args['language'] = "c++" @@ -115,7 +116,8 @@ def get_extension_tutorial(name): ext_modules = [] for ext in ['dot_blas_lapack', 'dot_cython', - 'experiment_cython', 'dot_cython_omp']: + 'experiment_cython', 'dot_cython_omp', + 'mul_cython_omp']: ext_modules.extend(get_extension_tutorial(ext)) diff --git a/td3a_cpp/tutorial/dot_cython_omp.pyx b/td3a_cpp/tutorial/dot_cython_omp.pyx index ca8c360..0a0fe1f 100644 --- a/td3a_cpp/tutorial/dot_cython_omp.pyx +++ b/td3a_cpp/tutorial/dot_cython_omp.pyx @@ -23,6 +23,9 @@ cdef double _ddot_cython_array_omp(const double[::1] va, const double[::1] vb, :param va: first vector, dtype must be float64 :param vb: second vector, dtype must be float64 + :param chunksize: see :epkg:`prange` + :param schedule: 0 no parallelization, 1 for `'static'`, + 2 for `'dynamic'` :return: dot product """ cdef int n = va.shape[0] @@ -50,7 +53,8 @@ def ddot_cython_array_omp(const double[::1] va, const double[::1] vb, :param va: first vector, dtype must be float64 :param vb: second vector, dtype must be float64 :param chunksize: see :epkg:`prange` - :param schedule: see :epkg:`prange` + :param schedule: 0 simple :epkg:`prange`, + 1 for `'static'`, 2 for `'dynamic'` :return: dot product """ if va.shape[0] != vb.shape[0]: diff --git a/td3a_cpp/tutorial/mul_cython_omp.pyx b/td3a_cpp/tutorial/mul_cython_omp.pyx new file mode 100644 index 0000000..56aa77a --- /dev/null +++ b/td3a_cpp/tutorial/mul_cython_omp.pyx @@ -0,0 +1,204 @@ +""" +Many implementations of the dot product. +See `Cython documentation `_. +""" +from cython.parallel import prange +cimport numpy as cnumpy +import numpy as pynumpy +cimport cython +from cpython cimport array +import array +cimport openmp +cnumpy.import_array() + + +cdef int _dmul_cython_omp(const double* va, const double* vb, double* res, + Py_ssize_t ni, Py_ssize_t nj, Py_ssize_t nk, + cython.int algo, cython.int parallel) nogil: + """ + matrix multiplication product implemented with cython and C types + using :epkg:`prange` (:epkg:`openmp` in :epkg:`cython`). + + :param va: first matrix, dtype must be float64 + :param vb: second matrix, dtype must be float64 + :param res: result of the matrix multiplication + :param algo: algorithm (see below) + :param parallel: kind of parallelization (see below) + :return: matrix multiplication + """ + cdef double s + cdef Py_ssize_t p, i, j, k + if parallel == 0: + if algo == 0: + for i in range(0, ni): + for j in range(0, nj): + s = 0 + for k in range(0, nk): + s += va[i * nk + k] * vb[k * nj + j] + res[i * nj + j] = s + return 1 + + if algo == 1: + for j in range(0, nj): + for i in range(0, ni): + s = 0 + for k in range(0, nk): + s += va[i * nk + k] * vb[k * nj + j] + res[i * nj + j] = s + return 1 + + if algo == 2: + for k in range(0, nk): + for i in range(0, ni): + for j in range(0, nj): + res[i * nj + j] += va[i * nk + k] * vb[k * nj + j] + return 1 + + if parallel == 1: + if algo == 0: + for p in prange(0, ni): + for j in range(0, nj): + res[p * nj + j] = 0 + for k in range(0, nk): + res[p * nj + j] += va[p * nk + k] * vb[k * nj + j] + return 1 + + if algo == 1: + for p in prange(0, nj): + for i in range(0, ni): + res[i * nj + p] = 0 + for k in range(0, nk): + res[i * nj + p] += va[i * nk + k] * vb[k * nj + p] + return 1 + + if algo == 2: + for p in prange(0, nk): + for i in range(0, ni): + for j in range(0, nj): + res[i * nj + j] += va[i * nk + p] * vb[p * nj + j] + return 1 + + return 0 + + +cdef extern from "mul_cython_omp_.h": + cdef double vector_ddot_product_pointer16_sse(const double *p1, const double *p2, cython.int size) nogil + + +cdef int _dmul_cython_omp_t(const double* va, const double* vb, double* res, + Py_ssize_t ni, Py_ssize_t nj, Py_ssize_t nk, + cython.int algo, cython.int parallel) nogil: + """ + matrix multiplication product implemented with cython and C types + using :epkg:`prange` (:epkg:`openmp` in :epkg:`cython`). + + :param va: first matrix, dtype must be float64 + :param vb: second matrix, dtype must be float64 + :param res: result of the matrix multiplication + :param algo: algorithm (see below) + :param parallel: kind of parallelization (see below) + :return: matrix multiplication + """ + cdef double s + cdef Py_ssize_t p, i, j, k + if parallel == 0: + if algo == 0: + for i in range(0, ni): + for j in range(0, nj): + res[i * nj + j] = vector_ddot_product_pointer16_sse(&va[i * nk], &vb[j * nk], nk) + return 1 + + if algo == 1: + for j in range(0, nj): + for i in range(0, ni): + res[i * nj + j] = vector_ddot_product_pointer16_sse(&va[i * nk], &vb[j * nk], nk) + return 1 + + if algo == 2: + for k in range(0, nk): + for i in range(0, ni): + for j in range(0, nj): + res[i * nj + j] += va[i * nk + k] * vb[j * nk + k] + return 1 + + if parallel == 1: + if algo == 0: + for p in prange(0, ni): + for j in range(0, nj): + res[p * nj + j] = vector_ddot_product_pointer16_sse(&va[p * nk], &vb[j * nk], nk) + + return 1 + + if algo == 1: + for p in prange(0, nj): + for i in range(0, ni): + res[i * nj + p] = vector_ddot_product_pointer16_sse(&va[i * nk], &vb[p * nk], nk) + + return 1 + + if algo == 2: + for p in prange(0, nk): + for i in range(0, ni): + for j in range(0, nj): + res[i * nj + j] += va[i * nk + p] * vb[j * nk + p] + return 1 + + return 0 + + +@cython.boundscheck(False) +@cython.cdivision(True) +@cython.wraparound(False) +def dmul_cython_omp(const double[:, :] va, const double[:, :] vb, + cython.int algo=0, cython.int parallel=0, + cython.int b_trans=0): + """ + matrix multiplication product implemented with cython and C types + using :epkg:`prange` (:epkg:`openmp` in :epkg:`cython`). + + :param va: first matrix, dtype must be float64 + :param vb: second matrix, dtype must be float64 + :param algo: algorithm (see below) + :param parallel: kind of parallelization (see below) + :return: matrix multiplication + """ + cdef double[:, :] pres + cdef cython.int r + cdef const double *pva + cdef const double *pvb + cdef double *ppres + if b_trans: + if va.shape[1] != vb.shape[1]: + raise ValueError( + "Shape mismatch, cannot multiply %r and %r" % (va.shape, vb.shape)) + res = pynumpy.zeros((va.shape[0], vb.shape[0]), dtype=pynumpy.double) + pres = res + pva = &va[0, 0] + pvb = &vb[0, 0] + ppres = &pres[0, 0] + with nogil: + r = _dmul_cython_omp_t(pva, pvb, ppres, + va.shape[0], vb.shape[0], va.shape[1], + algo, parallel) + if r != 1: + raise RuntimeError( + "Unknown value for algo=%r or parallel=%d r=%d" % (algo, parallel, r)) + return res + else: + if va.shape[1] != vb.shape[0]: + raise ValueError( + "Shape mismatch, cannot multiply %r and %r" % (va.shape, vb.shape)) + res = pynumpy.zeros((va.shape[0], vb.shape[1]), dtype=pynumpy.double) + pres = res + pva = &va[0, 0] + pvb = &vb[0, 0] + ppres = &pres[0, 0] + with nogil: + r = _dmul_cython_omp(pva, pvb, ppres, + va.shape[0], vb.shape[1], va.shape[1], + algo, parallel) + if r != 1: + raise RuntimeError( + "Unknown value for algo=%r or parallel=%d r=%r" % (algo, parallel, r)) + return res + diff --git a/td3a_cpp/tutorial/mul_cython_omp_.cpp b/td3a_cpp/tutorial/mul_cython_omp_.cpp new file mode 100644 index 0000000..8301a81 --- /dev/null +++ b/td3a_cpp/tutorial/mul_cython_omp_.cpp @@ -0,0 +1,33 @@ +#include "mul_cython_omp_.h" +#include +#include // for double m128d + + +double vector_ddot_product_pointer16_sse(const double *p1, const double *p2, int size) { + if (size == 0) + return 0; + if (size == 1) + return *p1 * *p2; + double sum = 0; + const double* end = p1 + size; + if (size % 2 == 1) { + sum += *p1 * *p2; + ++p1; + ++p2; + --size; + } + __m128d c1 = _mm_loadu_pd(p1); + __m128d c2 = _mm_loadu_pd(p2); + __m128d r1 = _mm_mul_pd(c1, c2); + p1 += 2; + p2 += 2; + for( ; p1 != end; p1 += 2, p2 += 2) { + c1 = _mm_loadu_pd(p1); + c2 = _mm_loadu_pd(p2); + r1 = _mm_add_pd(r1, _mm_mul_pd(c1, c2)); + } + double r[2]; // r is not necessary aligned. + _mm_storeu_pd(r, r1); + sum += r[0] + r[1]; + return sum; +} diff --git a/td3a_cpp/tutorial/mul_cython_omp_.h b/td3a_cpp/tutorial/mul_cython_omp_.h new file mode 100644 index 0000000..5604a62 --- /dev/null +++ b/td3a_cpp/tutorial/mul_cython_omp_.h @@ -0,0 +1,13 @@ +#pragma once + + +#if !defined(_CRT_SECURE_NO_WARNINGS) +#define _CRT_SECURE_NO_WARNINGS +#define undef__CRT_SECURE_NO_WARNINGS 1 +#endif + +double vector_ddot_product_pointer16_sse(const double *p1, const double *p2, int size); + +#if defined(undef_CRT_SECURE_NO_WARNINGS) +#undef _CRT_SECURE_NO_WARNINGS +#endif diff --git a/tests/test_tutorial_mul.py b/tests/test_tutorial_mul.py new file mode 100644 index 0000000..60ba39d --- /dev/null +++ b/tests/test_tutorial_mul.py @@ -0,0 +1,95 @@ +""" +Unit tests for ``random_strategy``. +""" +import unittest +import numpy +from numpy.testing import assert_almost_equal +from td3a_cpp.tutorial.mul_cython_omp import dmul_cython_omp + + +class TestTutorialMul(unittest.TestCase): + + def test_matrix_mul(self): + va = numpy.random.randn(3, 4).astype(numpy.float64) + vb = numpy.random.randn(4, 5).astype(numpy.float64) + res1 = va @ vb + res2 = dmul_cython_omp(va, vb) + assert_almost_equal(res1, res2) + + def test_matrix_mul_fail(self): + va = numpy.random.randn(3, 4).astype(numpy.float64) + vb = numpy.random.randn(4, 5).astype(numpy.float64) + with self.assertRaises(RuntimeError): + dmul_cython_omp(va, vb, algo=4) + + def test_matrix_mul_algo(self): + va = numpy.random.randn(3, 4).astype(numpy.float64) + vb = numpy.random.randn(4, 5).astype(numpy.float64) + res1 = va @ vb + for algo in range(0, 3): + with self.subTest(algo=algo): + res2 = dmul_cython_omp(va, vb, algo=algo) + assert_almost_equal(res1, res2) + + def test_matrix_mul_algo_para(self): + va = numpy.random.randn(3, 4).astype(numpy.float64) + vb = numpy.random.randn(4, 5).astype(numpy.float64) + res1 = va @ vb + for algo in range(0, 2): + with self.subTest(algo=algo): + res2 = dmul_cython_omp(va, vb, algo=algo, parallel=1) + assert_almost_equal(res1, res2) + + def test_matrix_mul_algo_t(self): + va = numpy.random.randn(3, 4).astype(numpy.float64) + vb = numpy.random.randn(5, 4).astype(numpy.float64) + res1 = va @ vb.T + for algo in range(0, 3): + with self.subTest(algo=algo): + res2 = dmul_cython_omp(va, vb, algo=algo, + b_trans=1) + assert_almost_equal(res1, res2) + + def test_matrix_mul_algo_t_big(self): + va = numpy.random.randn(300, 400).astype(numpy.float64) + vb = numpy.random.randn(500, 400).astype(numpy.float64) + res1 = va @ vb.T + for algo in range(0, 3): + with self.subTest(algo=algo): + res2 = dmul_cython_omp(va, vb, algo=algo, + b_trans=1) + assert_almost_equal(res1, res2) + + def test_matrix_mul_algo_t_big_odd(self): + va = numpy.random.randn(30, 41).astype(numpy.float64) + vb = numpy.random.randn(50, 41).astype(numpy.float64) + res1 = va @ vb.T + for algo in range(0, 3): + with self.subTest(algo=algo): + res2 = dmul_cython_omp(va, vb, algo=algo, + b_trans=1) + assert_almost_equal(res1, res2) + + def test_matrix_mul_algo_para_t(self): + va = numpy.random.randn(3, 4).astype(numpy.float64) + vb = numpy.random.randn(5, 4).astype(numpy.float64) + res1 = va @ vb.T + for algo in range(0, 2): + with self.subTest(algo=algo): + res2 = dmul_cython_omp(va, vb, algo=algo, parallel=1, + b_trans=1) + assert_almost_equal(res1, res2) + + def test_matrix_mul_algo_para_t_big(self): + va = numpy.random.randn(300, 400).astype(numpy.float64) + vb = numpy.random.randn(500, 400).astype(numpy.float64) + res1 = va @ vb.T + for algo in range(0, 2): + with self.subTest(algo=algo): + res2 = dmul_cython_omp(va, vb, algo=algo, parallel=1, + b_trans=1) + assert_almost_equal(res1, res2) + + +if __name__ == '__main__': + unittest.main()