diff --git a/bin/flake8.bat b/bin/flake8.bat
index 8d51465..7de095f 100644
--- a/bin/flake8.bat
+++ b/bin/flake8.bat
@@ -2,8 +2,7 @@
 set current=%~dp0
 set root=%current%..
 cd %root%
-set pythonexe="c:\Python387_x64\python.exe"
-if not exist %pythonexe% set pythonexe="c:\Python370_x64\python.exe"
+set pythonexe=python
 
 @echo running 'python -m flake8 td3a_cpp tests examples'
 %pythonexe% -m flake8 td3a_cpp tests examples setup.py doc/conf.py
diff --git a/doc/conf.py b/doc/conf.py
index 87e569a..10ac465 100644
--- a/doc/conf.py
+++ b/doc/conf.py
@@ -134,6 +134,7 @@
     'python': 'https://www.python.org/',
     'py-spy': 'https://github.com/benfred/py-spy',
     'RUST': 'https://www.rust-lang.org/',
+    'scikit-learn': 'https://scikit-learn.org/stable/',
     'sphinx-gallery': 'https://github.com/sphinx-gallery/sphinx-gallery',
     'SSE': 'https://fr.wikipedia.org/wiki/Streaming_SIMD_Extensions',
     'tqdm': 'https://github.com/tqdm/tqdm',
diff --git a/examples/plot_benchmark_dot.py b/examples/plot_benchmark_dot.py
index b1d54e0..4182bac 100644
--- a/examples/plot_benchmark_dot.py
+++ b/examples/plot_benchmark_dot.py
@@ -10,6 +10,12 @@
 to be slower. This example looks into a couple of slower
 implementations.
 
+Compared implementations:
+
+* :func:`pydot <td3a_cpp.tutorial.dotpy.pydot>`
+* `cblas_ddot <https://github.com/sdpython/td3a_cpp/
+  blob/master/td3a_cpp/tutorial/dot_blas_lapack.pyx#L39>`_
+
 .. contents::
     :local:
 """
diff --git a/examples/plot_benchmark_dot_cython.py b/examples/plot_benchmark_dot_cython.py
index 7257ded..ff7cf2e 100644
--- a/examples/plot_benchmark_dot_cython.py
+++ b/examples/plot_benchmark_dot_cython.py
@@ -12,12 +12,24 @@
 the following:
 
 * :func:`dot_product <td3a_cpp.tutorial.dot_cython.dot_product>`
+  `code <https://github.com/sdpython/td3a_cpp/blob/
+  master/td3a_cpp/tutorial/dot_cython.pyx#L15>`_
 * :func:`ddot_cython_array <td3a_cpp.tutorial.dot_cython.ddot_cython_array>`
+  `code <https://github.com/sdpython/td3a_cpp/blob/
+  master/td3a_cpp/tutorial/dot_cython.pyx#L29>`_
 * :func:`ddot_cython_array_optim
   <td3a_cpp.tutorial.dot_cython.ddot_cython_array_optim>`
+  `code <https://github.com/sdpython/td3a_cpp/blob/
+  master/td3a_cpp/tutorial/dot_cython.pyx#L63>`_
 * :func:`ddot_array <td3a_cpp.tutorial.dot_cython.ddot_array>`
+  `code <https://github.com/sdpython/td3a_cpp/blob/
+  master/td3a_cpp/tutorial/dot_cython.pyx#L141>`_
 * :func:`ddot_array_16 <td3a_cpp.tutorial.dot_cython.ddot_array_16>`
+  `code <https://github.com/sdpython/td3a_cpp/blob/
+  master/td3a_cpp/tutorial/dot_cython.pyx#L188>`_
 * :func:`ddot_array_16_sse <td3a_cpp.tutorial.dot_cython.ddot_array_16_sse>`
+  `code <https://github.com/sdpython/td3a_cpp/blob/
+  master/td3a_cpp/tutorial/dot_cython.pyx#L230>`_
 
 .. contents::
     :local:
diff --git a/examples/plot_benchmark_dot_cython_omp.py b/examples/plot_benchmark_dot_cython_omp.py
index cba93bf..7b20192 100644
--- a/examples/plot_benchmark_dot_cython_omp.py
+++ b/examples/plot_benchmark_dot_cython_omp.py
@@ -12,12 +12,20 @@
 the following:
 
 * :func:`ddot_array_16_sse <td3a_cpp.tutorial.dot_cython.ddot_array_16_sse>`
+  `code <https://github.com/sdpython/td3a_cpp/blob/master/
+  td3a_cpp/tutorial/dot_cython.pyx#L230>`_
 * :func:`ddot_cython_array_omp
   <td3a_cpp.tutorial.dot_cython_omp.ddot_cython_array_omp>`
+  `code <https://github.com/sdpython/td3a_cpp/blob/master/
+  td3a_cpp/tutorial/dot_cython_omp.pyx#L47>`_
 * :func:`ddot_array_openmp
   <td3a_cpp.tutorial.dot_cython_omp.ddot_array_openmp>`
+  `code <https://github.com/sdpython/td3a_cpp/blob/master/
+  td3a_cpp/tutorial/dot_cython_omp.pyx#L88>`_
 * :func:`ddot_array_openmp_16
   <td3a_cpp.tutorial.dot_cython_omp.ddot_array_openmp_16>`
+  `code <https://github.com/sdpython/td3a_cpp/blob/master/
+  td3a_cpp/tutorial/dot_cython_omp.pyx#L104>`_
 
 .. contents::
     :local:
diff --git a/examples/plot_benchmark_dot_mul.py b/examples/plot_benchmark_dot_mul.py
index 69c375c..0e56e2d 100644
--- a/examples/plot_benchmark_dot_mul.py
+++ b/examples/plot_benchmark_dot_mul.py
@@ -2,12 +2,18 @@
 
 .. _l-example-mul:
 
-Compares mul implementations
-============================
+Compares matrix multiplication implementations
+==============================================
 
 :epkg:`numpy` has a very fast implementation of
 matrix multiplication. There are many ways to be slower.
 
+Compared implementations:
+
+* :func:`dmul_cython_omp <td3a_cpp.tutorial.mul_cython_omp.dmul_cython_omp>`
+  `code <https://github.com/sdpython/td3a_cpp/blob/master/
+  td3a_cpp/tutorial/mul_cython_omp.pyx#L171>`_
+
 .. contents::
     :local:
 """
@@ -52,7 +58,7 @@
         for n in sets]
 
 res = list(measure_time_dim('mul(va, vb)', ctxs, verbose=1))
-pprint.pprint(dfs[-1].tail(n=2))
+pprint.pprint(res[-1])
 
 
 ##############################
@@ -131,18 +137,23 @@
 cc = concat(dfs)
 cc['N'] = cc['x_name']
 
-fig, ax = plt.subplots(3, 2, figsize=(10, 8))
-cc[~cc.fct.str.contains('-T')].pivot('N', 'fct', 'average').plot(
-    logy=True, logx=True, ax=ax[0, 0])
-cc[~cc.fct.str.contains('-T') & (cc.fct != 'numpy')].pivot(
+fig, ax = plt.subplots(3, 2, figsize=(10, 8), sharex=True, sharey=True)
+ccnp = cc.fct == 'numpy'
+cct = cc.fct.str.contains('-T')
+cca0 = cc.fct.str.contains('a=0')
+cc[ccnp | (~cct & cca0)].pivot(
+    'N', 'fct', 'average').plot(logy=True, logx=True, ax=ax[0, 0])
+cc[ccnp | (~cct & ~cca0)].pivot(
     'N', 'fct', 'average').plot(logy=True, logx=True, ax=ax[0, 1])
-cc[cc.fct.str.contains('-T') | (cc.fct == 'numpy')].pivot(
+cc[ccnp | (cct & cca0)].pivot(
     'N', 'fct', 'average').plot(logy=True, logx=True, ax=ax[1, 0])
-cc[cc.fct.str.contains('-T') & (cc.fct != 'numpy')].pivot(
+cc[ccnp | (cct & ~cca0)].pivot(  # '-T' variants without 'a=0'
     'N', 'fct', 'average').plot(logy=True, logx=True, ax=ax[1, 1])
-cc[cc.fct.str.contains('a=0')].pivot('N', 'fct', 'average').plot(
+cc[ccnp | cca0].pivot('N', 'fct', 'average').plot(
+    logy=True, logx=True, ax=ax[2, 0])
+cc[ccnp | ~cca0].pivot('N', 'fct', 'average').plot(
     logy=True, logx=True, ax=ax[2, 1])
-fig.suptitle("Comparison of multiplication implementations")
+fig.suptitle("Comparison of matrix multiplication implementations")
 
 #################################
 # The results depends on the machine, its
diff --git a/examples/plot_benchmark_dot_mul_timeit.py b/examples/plot_benchmark_dot_mul_timeit.py
index 3b43583..eae31f7 100644
--- a/examples/plot_benchmark_dot_mul_timeit.py
+++ b/examples/plot_benchmark_dot_mul_timeit.py
@@ -2,12 +2,29 @@
 
 .. _l-example-mul-timeit:
 
-Compares mul implementations with timeit
-========================================
+Compares matrix multiplication implementations with timeit
+==========================================================
 
 :epkg:`numpy` has a very fast implementation of
 matrix multiplication. There are many ways to be slower.
-The following uses :epkg:`timeit` to compare implementations.
+The following uses :mod:`timeit` to compare implementations.
+
+Compared implementations:
+
+* :func:`multiply_matrix <td3a_cpp.tutorial.td_mul_cython.multiply_matrix>`
+  `code <https://github.com/sdpython/td3a_cpp/blob/master/
+  td3a_cpp/tutorial/td_mul_cython.pyx#L14>`_
+* :func:`c_multiply_matrix <td3a_cpp.tutorial.td_mul_cython.c_multiply_matrix>`
+  `code <https://github.com/sdpython/td3a_cpp/blob/master/
+  td3a_cpp/tutorial/td_mul_cython.pyx#L69>`_
+* :func:`c_multiply_matrix_parallel
+  <td3a_cpp.tutorial.td_mul_cython.c_multiply_matrix_parallel>`
+  `code <https://github.com/sdpython/td3a_cpp/blob/master/
+  td3a_cpp/tutorial/td_mul_cython.pyx#L49>`_
+* :func:`c_multiply_matrix_parallel_transposed
+  <td3a_cpp.tutorial.td_mul_cython.c_multiply_matrix_parallel_transposed>`
+  `code <https://github.com/sdpython/td3a_cpp/blob/master/
+  td3a_cpp/tutorial/td_mul_cython.pyx#L106>`_
 
 .. contents::
     :local:
diff --git a/examples/plot_benchmark_filter.py b/examples/plot_benchmark_filter.py
index 310bf5b..b057b05 100644
--- a/examples/plot_benchmark_filter.py
+++ b/examples/plot_benchmark_filter.py
@@ -11,15 +11,32 @@
 to :epkg:`numpy`.
 
 * :func:`cfilter_dmax <td3a_cpp.tutorial.experiment_cython.cfilter_dmax>`
+  `code <https://github.com/sdpython/td3a_cpp/blob/master/
+  td3a_cpp/tutorial/experiment_cython.pyx#L93>`_
 * :func:`cfilter_dmax2 <td3a_cpp.tutorial.experiment_cython.cfilter_dmax2>`
+  `code <https://github.com/sdpython/td3a_cpp/blob/master/
+  td3a_cpp/tutorial/experiment_cython.pyx#L107>`_
 * :func:`cfilter_dmax4 <td3a_cpp.tutorial.experiment_cython.cfilter_dmax4>`
+  `code <https://github.com/sdpython/td3a_cpp/blob/master/
+  td3a_cpp/tutorial/experiment_cython.pyx#L138>`_
 * :func:`cfilter_dmax16 <td3a_cpp.tutorial.experiment_cython.cfilter_dmax16>`
+  `code <https://github.com/sdpython/td3a_cpp/blob/master/
+  td3a_cpp/tutorial/experiment_cython.pyx#L122>`_
 * :func:`cyfilter_dmax <td3a_cpp.tutorial.experiment_cython.cyfilter_dmax>`
+  `code <https://github.com/sdpython/td3a_cpp/blob/master/
+  td3a_cpp/tutorial/experiment_cython.pyx#L72>`_
 * :func:`filter_dmax_cython
   <td3a_cpp.tutorial.experiment_cython.filter_dmax_cython>`
+  `code <https://github.com/sdpython/td3a_cpp/blob/master/
+  td3a_cpp/tutorial/experiment_cython.pyx#L28>`_
 * :func:`filter_dmax_cython_optim
   <td3a_cpp.tutorial.experiment_cython.filter_dmax_cython_optim>`
-* :func:`pyfilter_dmax <td3a_cpp.tutorial.experiment_cython.pyfilter_dmax>`
+  `code <https://github.com/sdpython/td3a_cpp/blob/master/
+  td3a_cpp/tutorial/experiment_cython.pyx#L43>`_
+* :func:`pyfilter_dmax
+  <td3a_cpp.tutorial.experiment_cython.pyfilter_dmax>`
+  `code <https://github.com/sdpython/td3a_cpp/blob/master/
+  td3a_cpp/tutorial/experiment_cython.pyx#L15>`_
 """
 
 import pprint
diff --git a/examples/plot_long_parallel_process_joblib.py b/examples/plot_long_parallel_process_joblib.py
index bb1b1e1..184659d 100644
--- a/examples/plot_long_parallel_process_joblib.py
+++ b/examples/plot_long_parallel_process_joblib.py
@@ -6,7 +6,7 @@
 ========================================================
 
 Uses processes to parallelize a dot product is not
-a very solution becausep processes do not share memory,
+a very good solution because processes do not share memory,
 they need to exchange data. This parallelisation
 is efficient if the ratio *exchanged data / computation time*
 is low. :epkg:`joblib` is used by :epkg:`scikit-learn`.
diff --git a/examples/plot_parallel_process_concurrent.py b/examples/plot_parallel_process_concurrent.py
index b50db81..c977147 100644
--- a/examples/plot_parallel_process_concurrent.py
+++ b/examples/plot_parallel_process_concurrent.py
@@ -6,7 +6,7 @@
 ====================================================================
 
 Uses processes to parallelize a dot product is not
-a very solution becausep processes do not share memory,
+a very good solution because processes do not share memory,
 they need to exchange data. This parallelisation
 is efficient if the ratio *exchanged data / computation time*
 is low. This example uses :epkg:`concurrent.futures`.
diff --git a/examples/plot_profile.py b/examples/plot_profile.py
index 2afa1e2..6b803a2 100644
--- a/examples/plot_profile.py
+++ b/examples/plot_profile.py
@@ -11,7 +11,6 @@
 it is usually helpful to call them multiple times
 before the cause becomes significant.
 
-
 .. content::
     :local:
 """
@@ -96,6 +95,8 @@ def f3_blas(va, vb, n=100000):
 
 #######################################
 # See :ref:`l-appendix-example-dot-profile`.
-# We see that :func:`cblas_ddot` and `numpy.dot` uses
+# We see that :func:`cblas_ddot
+# <td3a_cpp.tutorial.cblas_ddot>`
+# and :func:`numpy.dot` use
 # the same C function but the wrapping is not the same
 # and numpy is more efficient.
diff --git a/td3a_cpp/tutorial/dot_cython_.cpp b/td3a_cpp/tutorial/dot_cython_.cpp
index a6694d3..8da5a23 100644
--- a/td3a_cpp/tutorial/dot_cython_.cpp
+++ b/td3a_cpp/tutorial/dot_cython_.cpp
@@ -6,8 +6,7 @@
 
 #define BYN 16
 
-double vector_ddot_product_pointer16(const double *p1, const double *p2)
-{
+double vector_ddot_product_pointer16(const double *p1, const double *p2) {
     // Branching optimization must be done in a separate function.
     double sum = 0;
     
@@ -33,8 +32,7 @@ double vector_ddot_product_pointer16(const double *p1, const double *p2)
 }
 
 
-double vector_ddot_product_pointer16(const double *p1, const double *p2, int size)
-{
+double vector_ddot_product_pointer16(const double *p1, const double *p2, int size) {
     double sum = 0;
     int i = 0;
     if (size >= BYN) {
@@ -49,8 +47,7 @@ double vector_ddot_product_pointer16(const double *p1, const double *p2, int siz
 }
 
 
-float vector_sdot_product_pointer16(const float *p1, const float *p2)
-{
+float vector_sdot_product_pointer16(const float *p1, const float *p2) {
     // Branching optimization must be done in a separate function.
     float sum = 0;
     
@@ -76,8 +73,7 @@ float vector_sdot_product_pointer16(const float *p1, const float *p2)
 }
 
 
-float vector_sdot_product_pointer16(const float *p1, const float *p2, int size)
-{
+float vector_sdot_product_pointer16(const float *p1, const float *p2, int size) {
     float sum = 0;
     int i = 0;
     if (size >= BYN) {
@@ -103,8 +99,7 @@ float vector_sdot_product_pointer16(const float *p1, const float *p2, int size)
 #include <immintrin.h>  // double double m256d
 
 
-double vector_ddot_product_pointer16_sse(const double *p1, const double *p2)
-{
+double vector_ddot_product_pointer16_sse(const double *p1, const double *p2) {
     __m256d c1 = _mm256_load_pd(p1);
     __m256d c2 = _mm256_load_pd(p2);
     __m256d r1 = _mm256_mul_pd(c1, c2);
@@ -141,8 +136,7 @@ double vector_ddot_product_pointer16_sse(const double *p1, const double *p2)
 
 #include <emmintrin.h>  // for double m128d
 
-double vector_ddot_product_pointer16_sse(const double *p1, const double *p2)
-{
+double vector_ddot_product_pointer16_sse(const double *p1, const double *p2) {
     __m128d c1 = _mm_load_pd(p1);
     __m128d c2 = _mm_load_pd(p2);
     __m128d r1 = _mm_mul_pd(c1, c2);
@@ -207,8 +201,7 @@ double vector_ddot_product_pointer16_sse(const double *p1, const double *p2)
 #endif
 
 
-double vector_ddot_product_pointer16_sse(const double *p1, const double *p2, int size)
-{
+double vector_ddot_product_pointer16_sse(const double *p1, const double *p2, int size) {
     double sum = 0;
     int i = 0;
     if (size >= BYN) {
@@ -224,8 +217,7 @@ double vector_ddot_product_pointer16_sse(const double *p1, const double *p2, int
 
 #include <xmmintrin.h>  // for float m128
 
-float vector_sdot_product_pointer16_sse(const float *p1, const float *p2)
-{
+float vector_sdot_product_pointer16_sse(const float *p1, const float *p2) {
     __m128 c1 = _mm_load_ps(p1);
     __m128 c2 = _mm_load_ps(p2);
     __m128 r1 = _mm_mul_ps(c1, c2);
@@ -258,8 +250,7 @@ float vector_sdot_product_pointer16_sse(const float *p1, const float *p2)
 }
 
 
-float vector_sdot_product_pointer16_sse(const float *p1, const float *p2, int size)
-{
+float vector_sdot_product_pointer16_sse(const float *p1, const float *p2, int size) {
     float sum = 0;
     int i = 0;
     if (size >= BYN) {
diff --git a/td3a_cpp/tutorial/dot_cython_omp_.cpp b/td3a_cpp/tutorial/dot_cython_omp_.cpp
index ffb837e..59eedd4 100644
--- a/td3a_cpp/tutorial/dot_cython_omp_.cpp
+++ b/td3a_cpp/tutorial/dot_cython_omp_.cpp
@@ -14,8 +14,7 @@ int get_omp_max_threads_cpp()
 
 
 double vector_ddot_openmp(const double *p1, const double *p2,
-                          int size, int nthreads)
-{
+                          int size, int nthreads) {
     if (nthreads <= 0)
         nthreads = ::omp_get_max_threads();
     double sum = 0;
@@ -28,8 +27,7 @@ double vector_ddot_openmp(const double *p1, const double *p2,
 }
 
 
-double vector_ddot_product_pointer16(const double *p1, const double *p2)
-{
+double vector_ddot_product_pointer16(const double *p1, const double *p2) {
     // Branching optimization must be done in a separate function.
     double sum = 0;
     
@@ -56,8 +54,7 @@ double vector_ddot_product_pointer16(const double *p1, const double *p2)
 
 
 double vector_ddot_openmp_16(const double *p1, const double *p2,
-                             int size, int nthreads)
-{
+                             int size, int nthreads) {
     if (nthreads <= 0)
         nthreads = ::omp_get_max_threads();
     
diff --git a/td3a_cpp/tutorial/experiment_cython_.cpp b/td3a_cpp/tutorial/experiment_cython_.cpp
index 7b8a008..3648fad 100644
--- a/td3a_cpp/tutorial/experiment_cython_.cpp
+++ b/td3a_cpp/tutorial/experiment_cython_.cpp
@@ -1,7 +1,6 @@
 #include "experiment_cython_.h"
 
-void filter_dmax(double *p1, int size, double mx)
-{
+void filter_dmax(double *p1, int size, double mx) {
     double * end = p1 + size;
     for(; p1 != end; ++p1)
         if (*p1 > mx)
@@ -9,16 +8,14 @@ void filter_dmax(double *p1, int size, double mx)
 }
 
 
-void filter_dmax2(double *p1, int size, double mx)
-{
+void filter_dmax2(double *p1, int size, double mx) {
     double * end = p1 + size;
     for(; p1 != end; ++p1)
         *p1 = *p1 > mx ? mx : *p1;
 }
 
 
-void _filter_dmax16(double *&p1, double mx)
-{
+void _filter_dmax16(double *&p1, double mx) {
     *p1 = *p1 > mx ? mx : *p1; ++p1;
     *p1 = *p1 > mx ? mx : *p1; ++p1;
     *p1 = *p1 > mx ? mx : *p1; ++p1;
@@ -41,8 +38,7 @@ void _filter_dmax16(double *&p1, double mx)
 }
 
 
-void filter_dmax16(double *p1, int size, double mx)
-{
+void filter_dmax16(double *p1, int size, double mx) {
     int size16 = size % 16;
     double * end = p1 + size - size16;
     for(; p1 != end; )
@@ -53,8 +49,7 @@ void filter_dmax16(double *p1, int size, double mx)
 }
 
 
-void _filter_dmax4(double *p1, double mx)
-{
+void _filter_dmax4(double *p1, double mx) {
     *p1 = *p1 > mx ? mx : *p1; ++p1;
     *p1 = *p1 > mx ? mx : *p1; ++p1;
     *p1 = *p1 > mx ? mx : *p1; ++p1;
@@ -62,8 +57,7 @@ void _filter_dmax4(double *p1, double mx)
 }
 
 
-void filter_dmax4(double *p1, int size, double mx)
-{
+void filter_dmax4(double *p1, int size, double mx) {
     int size4 = size % 4;
     double * end = p1 + size - size4;
     for(; p1 != end; p1 += 4)