diff --git a/bin/flake8.bat b/bin/flake8.bat index 8d51465..7de095f 100644 --- a/bin/flake8.bat +++ b/bin/flake8.bat @@ -2,8 +2,7 @@ set current=%~dp0 set root=%current%.. cd %root% -set pythonexe="c:\Python387_x64\python.exe" -if not exist %pythonexe% set pythonexe="c:\Python370_x64\python.exe" +set pythonexe=python @echo running 'python -m flake8 td3a_cpp tests examples' %pythonexe% -m flake8 td3a_cpp tests examples setup.py doc/conf.py diff --git a/doc/conf.py b/doc/conf.py index 87e569a..10ac465 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -134,6 +134,7 @@ 'python': 'https://www.python.org/', 'py-spy': 'https://github.com/benfred/py-spy', 'RUST': 'https://www.rust-lang.org/', + 'scikit-learn': 'https://scikit-learn.org/stable/', 'sphinx-gallery': 'https://github.com/sphinx-gallery/sphinx-gallery', 'SSE': 'https://fr.wikipedia.org/wiki/Streaming_SIMD_Extensions', 'tqdm': 'https://github.com/tqdm/tqdm', diff --git a/examples/plot_benchmark_dot.py b/examples/plot_benchmark_dot.py index b1d54e0..4182bac 100644 --- a/examples/plot_benchmark_dot.py +++ b/examples/plot_benchmark_dot.py @@ -10,6 +10,12 @@ to be slower. This example looks into a couple of slower implementations. +Compared implementations: + +* :func:`pydot ` +* `cblas_ddot `_ + .. contents:: :local: """ diff --git a/examples/plot_benchmark_dot_cython.py b/examples/plot_benchmark_dot_cython.py index 7257ded..ff7cf2e 100644 --- a/examples/plot_benchmark_dot_cython.py +++ b/examples/plot_benchmark_dot_cython.py @@ -12,12 +12,24 @@ the following: * :func:`dot_product ` + `code `_ * :func:`ddot_cython_array ` + `code `_ * :func:`ddot_cython_array_optim ` + `code `_ * :func:`ddot_array ` + `code `_ * :func:`ddot_array_16 ` + `code `_ * :func:`ddot_array_16_sse ` + `code `_ .. 
contents:: :local: diff --git a/examples/plot_benchmark_dot_cython_omp.py b/examples/plot_benchmark_dot_cython_omp.py index cba93bf..7b20192 100644 --- a/examples/plot_benchmark_dot_cython_omp.py +++ b/examples/plot_benchmark_dot_cython_omp.py @@ -12,12 +12,20 @@ the following: * :func:`ddot_array_16_sse ` + `code `_ * :func:`ddot_cython_array_omp ` + `code `_ * :func:`ddot_array_openmp ` + `code `_ * :func:`ddot_array_openmp_16 ` + `code `_ .. contents:: :local: diff --git a/examples/plot_benchmark_dot_mul.py b/examples/plot_benchmark_dot_mul.py index 69c375c..0e56e2d 100644 --- a/examples/plot_benchmark_dot_mul.py +++ b/examples/plot_benchmark_dot_mul.py @@ -2,12 +2,18 @@ .. _l-example-mul: -Compares mul implementations -============================ +Compares matrix multiplication implementations +============================================== :epkg:`numpy` has a very fast implementation of matrix multiplication. There are many ways to be slower. +Compared implementations: + +* :func:`dmul_cython_omp ` + `code `_ + .. 
contents:: :local: """ @@ -52,7 +58,7 @@ for n in sets] res = list(measure_time_dim('mul(va, vb)', ctxs, verbose=1)) -pprint.pprint(dfs[-1].tail(n=2)) +pprint.pprint(res[-1]) ############################## @@ -131,18 +137,23 @@ cc = concat(dfs) cc['N'] = cc['x_name'] -fig, ax = plt.subplots(3, 2, figsize=(10, 8)) -cc[~cc.fct.str.contains('-T')].pivot('N', 'fct', 'average').plot( - logy=True, logx=True, ax=ax[0, 0]) -cc[~cc.fct.str.contains('-T') & (cc.fct != 'numpy')].pivot( +fig, ax = plt.subplots(3, 2, figsize=(10, 8), sharex=True, sharey=True) +ccnp = cc.fct == 'numpy' +cct = cc.fct.str.contains('-T') +cca0 = cc.fct.str.contains('a=0') +cc[ccnp | (~cct & cca0)].pivot( + 'N', 'fct', 'average').plot(logy=True, logx=True, ax=ax[0, 0]) +cc[ccnp | (~cct & ~cca0)].pivot( 'N', 'fct', 'average').plot(logy=True, logx=True, ax=ax[0, 1]) -cc[cc.fct.str.contains('-T') | (cc.fct == 'numpy')].pivot( +cc[ccnp | (cct & cca0)].pivot( 'N', 'fct', 'average').plot(logy=True, logx=True, ax=ax[1, 0]) -cc[cc.fct.str.contains('-T') & (cc.fct != 'numpy')].pivot( +cc[ccnp | (cct & ~cca0)].pivot( 'N', 'fct', 'average').plot(logy=True, logx=True, ax=ax[1, 1]) -cc[cc.fct.str.contains('a=0')].pivot('N', 'fct', 'average').plot( +cc[ccnp | cca0].pivot('N', 'fct', 'average').plot( + logy=True, logx=True, ax=ax[2, 0]) +cc[ccnp | ~cca0].pivot('N', 'fct', 'average').plot( logy=True, logx=True, ax=ax[2, 1]) -fig.suptitle("Comparison of multiplication implementations") +fig.suptitle("Comparison of matrix multiplication implementations") ################################# # The results depends on the machine, its diff --git a/examples/plot_benchmark_dot_mul_timeit.py b/examples/plot_benchmark_dot_mul_timeit.py index 3b43583..eae31f7 100644 --- a/examples/plot_benchmark_dot_mul_timeit.py +++ b/examples/plot_benchmark_dot_mul_timeit.py @@ -2,12 +2,29 @@ ..
_l-example-mul-timeit: -Compares mul implementations with timeit -======================================== +Compares matrix multiplication implementations with timeit +========================================================== :epkg:`numpy` has a very fast implementation of matrix multiplication. There are many ways to be slower. -The following uses :epkg:`timeit` to compare implementations. +The following uses :mod:`timeit` to compare implementations. + +Compared implementations: + +* :func:`multiply_matrix ` + `code `_ +* :func:`c_multiply_matrix ` + `code `_ +* :func:`c_multiply_matrix_parallel + ` + `code `_ +* :func:`c_multiply_matrix_parallel_transposed + ` + `code `_ .. contents:: :local: diff --git a/examples/plot_benchmark_filter.py b/examples/plot_benchmark_filter.py index 310bf5b..b057b05 100644 --- a/examples/plot_benchmark_filter.py +++ b/examples/plot_benchmark_filter.py @@ -11,15 +11,32 @@ to :epkg:`numpy`. * :func:`cfilter_dmax ` + `code `_ * :func:`cfilter_dmax2 ` + `code `_ * :func:`cfilter_dmax4 ` + `code `_ * :func:`cfilter_dmax16 ` + `code `_ * :func:`cyfilter_dmax ` + `code `_ * :func:`filter_dmax_cython ` + `code `_ * :func:`filter_dmax_cython_optim ` -* :func:`pyfilter_dmax ` + `code `_ +* :func:`pyfilter_dmax + ` + `code `_ """ import pprint diff --git a/examples/plot_long_parallel_process_joblib.py b/examples/plot_long_parallel_process_joblib.py index bb1b1e1..184659d 100644 --- a/examples/plot_long_parallel_process_joblib.py +++ b/examples/plot_long_parallel_process_joblib.py @@ -6,7 +6,7 @@ ======================================================== Uses processes to parallelize a dot product is not -a very solution becausep processes do not share memory, +a very good solution because processes do not share memory, they need to exchange data. This parallelisation is efficient if the ratio *exchanged data / computation time* is low. :epkg:`joblib` is used by :epkg:`scikit-learn`.
diff --git a/examples/plot_parallel_process_concurrent.py b/examples/plot_parallel_process_concurrent.py index b50db81..c977147 100644 --- a/examples/plot_parallel_process_concurrent.py +++ b/examples/plot_parallel_process_concurrent.py @@ -6,7 +6,7 @@ ==================================================================== Uses processes to parallelize a dot product is not -a very solution becausep processes do not share memory, +a very good solution because processes do not share memory, they need to exchange data. This parallelisation is efficient if the ratio *exchanged data / computation time* is low. This example uses :epkg:`concurrent.futures`. diff --git a/examples/plot_profile.py b/examples/plot_profile.py index 2afa1e2..6b803a2 100644 --- a/examples/plot_profile.py +++ b/examples/plot_profile.py @@ -11,7 +11,6 @@ it is usually helpful to call them multiple times before the cause becomes significant. - .. content:: :local: """ @@ -96,6 +95,8 @@ def f3_blas(va, vb, n=100000): ####################################### # See :ref:`l-appendix-example-dot-profile`. -# We see that :func:`cblas_ddot` and `numpy.dot` uses +# We see that :func:`cblas_ddot +# ` +# and :func:`numpy.dot` use # the same C function but the wrapping is not the same # and numpy is more efficient. diff --git a/td3a_cpp/tutorial/dot_cython_.cpp b/td3a_cpp/tutorial/dot_cython_.cpp index a6694d3..8da5a23 100644 --- a/td3a_cpp/tutorial/dot_cython_.cpp +++ b/td3a_cpp/tutorial/dot_cython_.cpp @@ -6,8 +6,7 @@ #define BYN 16 -double vector_ddot_product_pointer16(const double *p1, const double *p2) -{ +double vector_ddot_product_pointer16(const double *p1, const double *p2) { // Branching optimization must be done in a separate function.
double sum = 0; @@ -33,8 +32,7 @@ double vector_ddot_product_pointer16(const double *p1, const double *p2) } -double vector_ddot_product_pointer16(const double *p1, const double *p2, int size) -{ +double vector_ddot_product_pointer16(const double *p1, const double *p2, int size) { double sum = 0; int i = 0; if (size >= BYN) { @@ -49,8 +47,7 @@ double vector_ddot_product_pointer16(const double *p1, const double *p2, int siz } -float vector_sdot_product_pointer16(const float *p1, const float *p2) -{ +float vector_sdot_product_pointer16(const float *p1, const float *p2) { // Branching optimization must be done in a separate function. float sum = 0; @@ -76,8 +73,7 @@ float vector_sdot_product_pointer16(const float *p1, const float *p2) } -float vector_sdot_product_pointer16(const float *p1, const float *p2, int size) -{ +float vector_sdot_product_pointer16(const float *p1, const float *p2, int size) { float sum = 0; int i = 0; if (size >= BYN) { @@ -103,8 +99,7 @@ float vector_sdot_product_pointer16(const float *p1, const float *p2, int size) #include // double double m256d -double vector_ddot_product_pointer16_sse(const double *p1, const double *p2) -{ +double vector_ddot_product_pointer16_sse(const double *p1, const double *p2) { __m256d c1 = _mm256_load_pd(p1); __m256d c2 = _mm256_load_pd(p2); __m256d r1 = _mm256_mul_pd(c1, c2); @@ -141,8 +136,7 @@ double vector_ddot_product_pointer16_sse(const double *p1, const double *p2) #include // for double m128d -double vector_ddot_product_pointer16_sse(const double *p1, const double *p2) -{ +double vector_ddot_product_pointer16_sse(const double *p1, const double *p2) { __m128d c1 = _mm_load_pd(p1); __m128d c2 = _mm_load_pd(p2); __m128d r1 = _mm_mul_pd(c1, c2); @@ -207,8 +201,7 @@ double vector_ddot_product_pointer16_sse(const double *p1, const double *p2) #endif -double vector_ddot_product_pointer16_sse(const double *p1, const double *p2, int size) -{ +double vector_ddot_product_pointer16_sse(const double *p1, const double 
*p2, int size) { double sum = 0; int i = 0; if (size >= BYN) { @@ -224,8 +217,7 @@ double vector_ddot_product_pointer16_sse(const double *p1, const double *p2, int #include // for float m128 -float vector_sdot_product_pointer16_sse(const float *p1, const float *p2) -{ +float vector_sdot_product_pointer16_sse(const float *p1, const float *p2) { __m128 c1 = _mm_load_ps(p1); __m128 c2 = _mm_load_ps(p2); __m128 r1 = _mm_mul_ps(c1, c2); @@ -258,8 +250,7 @@ float vector_sdot_product_pointer16_sse(const float *p1, const float *p2) } -float vector_sdot_product_pointer16_sse(const float *p1, const float *p2, int size) -{ +float vector_sdot_product_pointer16_sse(const float *p1, const float *p2, int size) { float sum = 0; int i = 0; if (size >= BYN) { diff --git a/td3a_cpp/tutorial/dot_cython_omp_.cpp b/td3a_cpp/tutorial/dot_cython_omp_.cpp index ffb837e..59eedd4 100644 --- a/td3a_cpp/tutorial/dot_cython_omp_.cpp +++ b/td3a_cpp/tutorial/dot_cython_omp_.cpp @@ -14,8 +14,7 @@ int get_omp_max_threads_cpp() double vector_ddot_openmp(const double *p1, const double *p2, - int size, int nthreads) -{ + int size, int nthreads) { if (nthreads <= 0) nthreads = ::omp_get_max_threads(); double sum = 0; @@ -28,8 +27,7 @@ double vector_ddot_openmp(const double *p1, const double *p2, } -double vector_ddot_product_pointer16(const double *p1, const double *p2) -{ +double vector_ddot_product_pointer16(const double *p1, const double *p2) { // Branching optimization must be done in a separate function. 
double sum = 0; @@ -56,8 +54,7 @@ double vector_ddot_product_pointer16(const double *p1, const double *p2) double vector_ddot_openmp_16(const double *p1, const double *p2, - int size, int nthreads) -{ + int size, int nthreads) { if (nthreads <= 0) nthreads = ::omp_get_max_threads(); diff --git a/td3a_cpp/tutorial/experiment_cython_.cpp b/td3a_cpp/tutorial/experiment_cython_.cpp index 7b8a008..3648fad 100644 --- a/td3a_cpp/tutorial/experiment_cython_.cpp +++ b/td3a_cpp/tutorial/experiment_cython_.cpp @@ -1,7 +1,6 @@ #include "experiment_cython_.h" -void filter_dmax(double *p1, int size, double mx) -{ +void filter_dmax(double *p1, int size, double mx) { double * end = p1 + size; for(; p1 != end; ++p1) if (*p1 > mx) @@ -9,16 +8,14 @@ void filter_dmax(double *p1, int size, double mx) } -void filter_dmax2(double *p1, int size, double mx) -{ +void filter_dmax2(double *p1, int size, double mx) { double * end = p1 + size; for(; p1 != end; ++p1) *p1 = *p1 > mx ? mx : *p1; } -void _filter_dmax16(double *&p1, double mx) -{ +void _filter_dmax16(double *&p1, double mx) { *p1 = *p1 > mx ? mx : *p1; ++p1; *p1 = *p1 > mx ? mx : *p1; ++p1; *p1 = *p1 > mx ? mx : *p1; ++p1; @@ -41,8 +38,7 @@ void _filter_dmax16(double *&p1, double mx) } -void filter_dmax16(double *p1, int size, double mx) -{ +void filter_dmax16(double *p1, int size, double mx) { int size16 = size % 16; double * end = p1 + size - size16; for(; p1 != end; ) @@ -53,8 +49,7 @@ void filter_dmax16(double *p1, int size, double mx) } -void _filter_dmax4(double *p1, double mx) -{ +void _filter_dmax4(double *p1, double mx) { *p1 = *p1 > mx ? mx : *p1; ++p1; *p1 = *p1 > mx ? mx : *p1; ++p1; *p1 = *p1 > mx ? mx : *p1; ++p1; @@ -62,8 +57,7 @@ void _filter_dmax4(double *p1, double mx) } -void filter_dmax4(double *p1, int size, double mx) -{ +void filter_dmax4(double *p1, int size, double mx) { int size4 = size % 4; double * end = p1 + size - size4; for(; p1 != end; p1 += 4)