Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 1 addition & 2 deletions bin/flake8.bat
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,7 @@
set current=%~dp0
set root=%current%..
cd %root%
set pythonexe="c:\Python387_x64\python.exe"
if not exist %pythonexe% set pythonexe="c:\Python370_x64\python.exe"
set pythonexe=python

@echo running 'python -m flake8 td3a_cpp tests examples'
%pythonexe% -m flake8 td3a_cpp tests examples setup.py doc/conf.py
Expand Down
1 change: 1 addition & 0 deletions doc/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -134,6 +134,7 @@
'python': 'https://www.python.org/',
'py-spy': 'https://github.com/benfred/py-spy',
'RUST': 'https://www.rust-lang.org/',
'scikit-learn': 'https://scikit-learn.org/stable/',
'sphinx-gallery': 'https://github.com/sphinx-gallery/sphinx-gallery',
'SSE': 'https://fr.wikipedia.org/wiki/Streaming_SIMD_Extensions',
'tqdm': 'https://github.com/tqdm/tqdm',
Expand Down
6 changes: 6 additions & 0 deletions examples/plot_benchmark_dot.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,12 @@
to be slower. This example looks into a couple of slower
implementations.

Compared implementations:

* :func:`pydot <td3a_cpp.tutorial.dotpy.pydot>`
* `cblas_ddot <https://github.com/sdpython/td3a_cpp/
blob/master/td3a_cpp/tutorial/dot_blas_lapack.pyx#L39>`_

.. contents::
:local:
"""
Expand Down
12 changes: 12 additions & 0 deletions examples/plot_benchmark_dot_cython.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,12 +12,24 @@
the following:

* :func:`dot_product <td3a_cpp.tutorial.dot_cython.dot_product>`
`code <https://github.com/sdpython/td3a_cpp/blob/
master/td3a_cpp/tutorial/dot_cython.pyx#L15>`_
* :func:`ddot_cython_array <td3a_cpp.tutorial.dot_cython.ddot_cython_array>`
`code <https://github.com/sdpython/td3a_cpp/blob/
master/td3a_cpp/tutorial/dot_cython.pyx#L29>`_
* :func:`ddot_cython_array_optim
<td3a_cpp.tutorial.dot_cython.ddot_cython_array_optim>`
`code <https://github.com/sdpython/td3a_cpp/blob/
master/td3a_cpp/tutorial/dot_cython.pyx#L63>`_
* :func:`ddot_array <td3a_cpp.tutorial.dot_cython.ddot_array>`
`code <https://github.com/sdpython/td3a_cpp/blob/
master/td3a_cpp/tutorial/dot_cython.pyx#L141>`_
* :func:`ddot_array_16 <td3a_cpp.tutorial.dot_cython.ddot_array_16>`
`code <https://github.com/sdpython/td3a_cpp/blob/
master/td3a_cpp/tutorial/dot_cython.pyx#L188>`_
* :func:`ddot_array_16_sse <td3a_cpp.tutorial.dot_cython.ddot_array_16_sse>`
`code <https://github.com/sdpython/td3a_cpp/blob/
master/td3a_cpp/tutorial/dot_cython.pyx#L230>`_

.. contents::
:local:
Expand Down
8 changes: 8 additions & 0 deletions examples/plot_benchmark_dot_cython_omp.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,12 +12,20 @@
the following:

* :func:`ddot_array_16_sse <td3a_cpp.tutorial.dot_cython.ddot_array_16_sse>`
`code <https://github.com/sdpython/td3a_cpp/blob/master/
td3a_cpp/tutorial/dot_cython.pyx#L230>`_
* :func:`ddot_cython_array_omp
<td3a_cpp.tutorial.dot_cython_omp.ddot_cython_array_omp>`
`code <https://github.com/sdpython/td3a_cpp/blob/master/
td3a_cpp/tutorial/dot_cython_omp.pyx#L47>`_
* :func:`ddot_array_openmp
<td3a_cpp.tutorial.dot_cython_omp.ddot_array_openmp>`
`code <https://github.com/sdpython/td3a_cpp/blob/master/
td3a_cpp/tutorial/dot_cython_omp.pyx#L88>`_
* :func:`ddot_array_openmp_16
<td3a_cpp.tutorial.dot_cython_omp.ddot_array_openmp_16>`
`code <https://github.com/sdpython/td3a_cpp/blob/master/
td3a_cpp/tutorial/dot_cython_omp.pyx#L104>`_

.. contents::
:local:
Expand Down
33 changes: 22 additions & 11 deletions examples/plot_benchmark_dot_mul.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,18 @@

.. _l-example-mul:

Compares mul implementations
============================
Compares matrix multiplication implementations
==============================================

:epkg:`numpy` has a very fast implementation of
matrix multiplication. There are many ways to be slower.

Compared implementations:

* :func:`dmul_cython_omp <td3a_cpp.tutorial.mul_cython_omp.dmul_cython_omp>`
`code <https://github.com/sdpython/td3a_cpp/blob/master/
td3a_cpp/tutorial/mul_cython_omp.pyx#L171>`_

.. contents::
:local:
"""
Expand Down Expand Up @@ -52,7 +58,7 @@
for n in sets]

res = list(measure_time_dim('mul(va, vb)', ctxs, verbose=1))
pprint.pprint(dfs[-1].tail(n=2))
pprint.pprint(res[-1])


##############################
Expand Down Expand Up @@ -131,18 +137,23 @@
cc = concat(dfs)
cc['N'] = cc['x_name']

fig, ax = plt.subplots(3, 2, figsize=(10, 8))
cc[~cc.fct.str.contains('-T')].pivot('N', 'fct', 'average').plot(
logy=True, logx=True, ax=ax[0, 0])
cc[~cc.fct.str.contains('-T') & (cc.fct != 'numpy')].pivot(
fig, ax = plt.subplots(3, 2, figsize=(10, 8), sharex=True, sharey=True)
ccnp = cc.fct == 'numpy'
cct = cc.fct.str.contains('-T')
cca0 = cc.fct.str.contains('a=0')
cc[ccnp | (~cct & cca0)].pivot(
'N', 'fct', 'average').plot(logy=True, logx=True, ax=ax[0, 0])
cc[ccnp | (~cct & ~cca0)].pivot(
'N', 'fct', 'average').plot(logy=True, logx=True, ax=ax[0, 1])
cc[cc.fct.str.contains('-T') | (cc.fct == 'numpy')].pivot(
cc[ccnp | (cct & cca0)].pivot(
'N', 'fct', 'average').plot(logy=True, logx=True, ax=ax[1, 0])
cc[cc.fct.str.contains('-T') & (cc.fct != 'numpy')].pivot(
cc[ccnp | (~cct & ~cca0)].pivot(
'N', 'fct', 'average').plot(logy=True, logx=True, ax=ax[1, 1])
cc[cc.fct.str.contains('a=0')].pivot('N', 'fct', 'average').plot(
cc[ccnp | cca0].pivot('N', 'fct', 'average').plot(
logy=True, logx=True, ax=ax[2, 0])
cc[ccnp | ~cca0].pivot('N', 'fct', 'average').plot(
logy=True, logx=True, ax=ax[2, 1])
fig.suptitle("Comparison of multiplication implementations")
fig.suptitle("Comparison of matrix multiplication implementations")

#################################
# The results depend on the machine, its
Expand Down
23 changes: 20 additions & 3 deletions examples/plot_benchmark_dot_mul_timeit.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,29 @@

.. _l-example-mul-timeit:

Compares mul implementations with timeit
========================================
Compares matrix multiplication implementations with timeit
==========================================================

:epkg:`numpy` has a very fast implementation of
matrix multiplication. There are many ways to be slower.
The following uses :epkg:`timeit` to compare implementations.
The following uses :mod:`timeit` to compare implementations.

Compared implementations:

* :func:`multiply_matrix <td3a_cpp.tutorial.td_mul_cython.multiply_matrix>`
`code <https://github.com/sdpython/td3a_cpp/blob/master/
td3a_cpp/tutorial/td_mul_cython.pyx#L14>`_
* :func:`c_multiply_matrix <td3a_cpp.tutorial.td_mul_cython.c_multiply_matrix>`
`code <https://github.com/sdpython/td3a_cpp/blob/master/
td3a_cpp/tutorial/td_mul_cython.pyx#L69>`_
* :func:`c_multiply_matrix_parallel
<td3a_cpp.tutorial.td_mul_cython.c_multiply_matrix_parallel>`
`code <https://github.com/sdpython/td3a_cpp/blob/master/
td3a_cpp/tutorial/td_mul_cython.pyx#L49>`_
* :func:`c_multiply_matrix_parallel_transposed
<td3a_cpp.tutorial.td_mul_cython.c_multiply_matrix_parallel_transposed>`
`code <https://github.com/sdpython/td3a_cpp/blob/master/
td3a_cpp/tutorial/td_mul_cython.pyx#L106>`_

.. contents::
:local:
Expand Down
19 changes: 18 additions & 1 deletion examples/plot_benchmark_filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,15 +11,32 @@
to :epkg:`numpy`.

* :func:`cfilter_dmax <td3a_cpp.tutorial.experiment_cython.cfilter_dmax>`
`code <https://github.com/sdpython/td3a_cpp/blob/master/
td3a_cpp/tutorial/experiment_cython.pyx#L93>`_
* :func:`cfilter_dmax2 <td3a_cpp.tutorial.experiment_cython.cfilter_dmax2>`
`code <https://github.com/sdpython/td3a_cpp/blob/master/
td3a_cpp/tutorial/experiment_cython.pyx#L107>`_
* :func:`cfilter_dmax4 <td3a_cpp.tutorial.experiment_cython.cfilter_dmax4>`
`code <https://github.com/sdpython/td3a_cpp/blob/master/
td3a_cpp/tutorial/experiment_cython.pyx#L138>`_
* :func:`cfilter_dmax16 <td3a_cpp.tutorial.experiment_cython.cfilter_dmax16>`
`code <https://github.com/sdpython/td3a_cpp/blob/master/
td3a_cpp/tutorial/experiment_cython.pyx#L122>`_
* :func:`cyfilter_dmax <td3a_cpp.tutorial.experiment_cython.cyfilter_dmax>`
`code <https://github.com/sdpython/td3a_cpp/blob/master/
td3a_cpp/tutorial/experiment_cython.pyx#L72>`_
* :func:`filter_dmax_cython
<td3a_cpp.tutorial.experiment_cython.filter_dmax_cython>`
`code <https://github.com/sdpython/td3a_cpp/blob/master/
td3a_cpp/tutorial/experiment_cython.pyx#L28>`_
* :func:`filter_dmax_cython_optim
<td3a_cpp.tutorial.experiment_cython.filter_dmax_cython_optim>`
* :func:`pyfilter_dmax <td3a_cpp.tutorial.experiment_cython.pyfilter_dmax>`
`code <https://github.com/sdpython/td3a_cpp/blob/master/
td3a_cpp/tutorial/experiment_cython.pyx#L43>`_
* :func:`pyfilter_dmax
<td3a_cpp.tutorial.experiment_cython.pyfilter_dmax>`
`code <https://github.com/sdpython/td3a_cpp/blob/master/
td3a_cpp/tutorial/experiment_cython.pyx#L15>`_
"""

import pprint
Expand Down
2 changes: 1 addition & 1 deletion examples/plot_long_parallel_process_joblib.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
========================================================

Using processes to parallelize a dot product is not
a very solution becausep processes do not share memory,
a very good solution because processes do not share memory,
they need to exchange data. This parallelisation
is efficient if the ratio *exchanged data / computation time*
is low. :epkg:`joblib` is used by :epkg:`scikit-learn`.
Expand Down
2 changes: 1 addition & 1 deletion examples/plot_parallel_process_concurrent.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
====================================================================

Using processes to parallelize a dot product is not
a very solution becausep processes do not share memory,
a very good solution because processes do not share memory,
they need to exchange data. This parallelisation
is efficient if the ratio *exchanged data / computation time*
is low. This example uses :epkg:`concurrent.futures`.
Expand Down
5 changes: 3 additions & 2 deletions examples/plot_profile.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,6 @@
it is usually helpful to call them multiple times
before the cause becomes significant.


.. contents::
:local:
"""
Expand Down Expand Up @@ -96,6 +95,8 @@ def f3_blas(va, vb, n=100000):

#######################################
# See :ref:`l-appendix-example-dot-profile`.
# We see that :func:`cblas_ddot` and `numpy.dot` uses
# We see that :func:`cblas_ddot
# <td3a_cpp.tutorial.cblas_ddot>`
# and :func:`numpy.dot` use
# the same C function but the wrapping is not the same
# and numpy is more efficient.
27 changes: 9 additions & 18 deletions td3a_cpp/tutorial/dot_cython_.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,7 @@

#define BYN 16

double vector_ddot_product_pointer16(const double *p1, const double *p2)
{
double vector_ddot_product_pointer16(const double *p1, const double *p2) {
// Branching optimization must be done in a separate function.
double sum = 0;

Expand All @@ -33,8 +32,7 @@ double vector_ddot_product_pointer16(const double *p1, const double *p2)
}


double vector_ddot_product_pointer16(const double *p1, const double *p2, int size)
{
double vector_ddot_product_pointer16(const double *p1, const double *p2, int size) {
double sum = 0;
int i = 0;
if (size >= BYN) {
Expand All @@ -49,8 +47,7 @@ double vector_ddot_product_pointer16(const double *p1, const double *p2, int siz
}


float vector_sdot_product_pointer16(const float *p1, const float *p2)
{
float vector_sdot_product_pointer16(const float *p1, const float *p2) {
// Branching optimization must be done in a separate function.
float sum = 0;

Expand All @@ -76,8 +73,7 @@ float vector_sdot_product_pointer16(const float *p1, const float *p2)
}


float vector_sdot_product_pointer16(const float *p1, const float *p2, int size)
{
float vector_sdot_product_pointer16(const float *p1, const float *p2, int size) {
float sum = 0;
int i = 0;
if (size >= BYN) {
Expand All @@ -103,8 +99,7 @@ float vector_sdot_product_pointer16(const float *p1, const float *p2, int size)
#include <immintrin.h> // double double m256d


double vector_ddot_product_pointer16_sse(const double *p1, const double *p2)
{
double vector_ddot_product_pointer16_sse(const double *p1, const double *p2) {
__m256d c1 = _mm256_load_pd(p1);
__m256d c2 = _mm256_load_pd(p2);
__m256d r1 = _mm256_mul_pd(c1, c2);
Expand Down Expand Up @@ -141,8 +136,7 @@ double vector_ddot_product_pointer16_sse(const double *p1, const double *p2)

#include <emmintrin.h> // for double m128d

double vector_ddot_product_pointer16_sse(const double *p1, const double *p2)
{
double vector_ddot_product_pointer16_sse(const double *p1, const double *p2) {
__m128d c1 = _mm_load_pd(p1);
__m128d c2 = _mm_load_pd(p2);
__m128d r1 = _mm_mul_pd(c1, c2);
Expand Down Expand Up @@ -207,8 +201,7 @@ double vector_ddot_product_pointer16_sse(const double *p1, const double *p2)
#endif


double vector_ddot_product_pointer16_sse(const double *p1, const double *p2, int size)
{
double vector_ddot_product_pointer16_sse(const double *p1, const double *p2, int size) {
double sum = 0;
int i = 0;
if (size >= BYN) {
Expand All @@ -224,8 +217,7 @@ double vector_ddot_product_pointer16_sse(const double *p1, const double *p2, int

#include <xmmintrin.h> // for float m128

float vector_sdot_product_pointer16_sse(const float *p1, const float *p2)
{
float vector_sdot_product_pointer16_sse(const float *p1, const float *p2) {
__m128 c1 = _mm_load_ps(p1);
__m128 c2 = _mm_load_ps(p2);
__m128 r1 = _mm_mul_ps(c1, c2);
Expand Down Expand Up @@ -258,8 +250,7 @@ float vector_sdot_product_pointer16_sse(const float *p1, const float *p2)
}


float vector_sdot_product_pointer16_sse(const float *p1, const float *p2, int size)
{
float vector_sdot_product_pointer16_sse(const float *p1, const float *p2, int size) {
float sum = 0;
int i = 0;
if (size >= BYN) {
Expand Down
9 changes: 3 additions & 6 deletions td3a_cpp/tutorial/dot_cython_omp_.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,7 @@ int get_omp_max_threads_cpp()


double vector_ddot_openmp(const double *p1, const double *p2,
int size, int nthreads)
{
int size, int nthreads) {
if (nthreads <= 0)
nthreads = ::omp_get_max_threads();
double sum = 0;
Expand All @@ -28,8 +27,7 @@ double vector_ddot_openmp(const double *p1, const double *p2,
}


double vector_ddot_product_pointer16(const double *p1, const double *p2)
{
double vector_ddot_product_pointer16(const double *p1, const double *p2) {
// Branching optimization must be done in a separate function.
double sum = 0;

Expand All @@ -56,8 +54,7 @@ double vector_ddot_product_pointer16(const double *p1, const double *p2)


double vector_ddot_openmp_16(const double *p1, const double *p2,
int size, int nthreads)
{
int size, int nthreads) {
if (nthreads <= 0)
nthreads = ::omp_get_max_threads();

Expand Down
Loading