diff --git a/.gitignore b/.gitignore
index 78bb13d..98c85a9 100644
--- a/.gitignore
+++ b/.gitignore
@@ -13,3 +13,4 @@ td3a_cpp/tutorial/dot_cython.cpp
 td3a_cpp/tutorial/dot_blas_lapack.cpp
 td3a_cpp/tutorial/experiment_cython.cpp
 td3a_cpp/tutorial/dot_cython_omp.cpp
+td3a_cpp/tutorial/mul_cython_omp.cpp
diff --git a/bin/build.bat b/bin/build.bat
index e0ddc18..a872d95 100644
--- a/bin/build.bat
+++ b/bin/build.bat
@@ -5,7 +5,7 @@ cd %root%
 @echo ##################
 @echo Compile
 @echo ##################
-set pythonexe="c:\Python391_x64\python.exe"
+set pythonexe="c:\Python387_x64\python.exe"
 if not exist %pythonexe% set pythonexe="c:\Python370_x64\python.exe"
 
 @echo running %root%\setup.py build_ext --inplace
diff --git a/bin/doc.bat b/bin/doc.bat
index b199429..e7057f7 100644
--- a/bin/doc.bat
+++ b/bin/doc.bat
@@ -2,7 +2,7 @@
 set current=%~dp0
 set root=%current%..
 cd %root%
-set pythonexe="c:\Python391_x64\python.exe"
+set pythonexe="c:\Python387_x64\python.exe"
 if not exist %pythonexe% set pythonexe="c:\Python370_x64\python.exe"
 
 @echo running 'python -m sphinx -T -b html doc dist/html'
diff --git a/bin/flake8.bat b/bin/flake8.bat
index 2184057..ef1d989 100644
--- a/bin/flake8.bat
+++ b/bin/flake8.bat
@@ -2,7 +2,7 @@
 set current=%~dp0
 set root=%current%..
 cd %root%
-set pythonexe="c:\Python391_x64\python.exe"
+set pythonexe="c:\Python387_x64\python.exe"
 if not exist %pythonexe% set pythonexe="c:\Python370_x64\python.exe"
 
 @echo running 'python -m flake8 td3a_cpp tests examples'
diff --git a/bin/unittest.bat b/bin/unittest.bat
index 54576c8..45edd80 100644
--- a/bin/unittest.bat
+++ b/bin/unittest.bat
@@ -2,7 +2,7 @@
 set current=%~dp0
 set root=%current%..
 cd %root%
-set pythonexe="c:\Python391_x64\python.exe"
+set pythonexe="c:\Python387_x64\python.exe"
 if not exist %pythonexe% set pythonexe="c:\Python370_x64\python.exe"
 
 @echo running 'python -m unittest discover tests'
diff --git a/doc/api.rst b/doc/api.rst
index 3d2b2f4..b37adda 100644
--- a/doc/api.rst
+++ b/doc/api.rst
@@ -60,7 +60,6 @@ dot
 
 .. autofunction:: td3a_cpp.tutorial.dot_cython_omp.ddot_array_openmp_16
 
-
 filter
 ^^^^^^
 
@@ -80,3 +79,7 @@ filter
 
 .. autofunction:: td3a_cpp.tutorial.experiment_cython.cfilter_dmax16
 
+matrix multiplication
+^^^^^^^^^^^^^^^^^^^^^
+
+.. autofunction:: td3a_cpp.tutorial.mul_cython_omp.dmul_cython_omp
diff --git a/examples/plot_benchmark_dot_mul.py b/examples/plot_benchmark_dot_mul.py
new file mode 100644
index 0000000..c43e382
--- /dev/null
+++ b/examples/plot_benchmark_dot_mul.py
@@ -0,0 +1,152 @@
+"""
+
+.. _l-example-mul:
+
+Compares mul implementations
+============================
+
+:epkg:`numpy` has a very fast implementation of
+matrix multiplication. There are many ways to be slower.
+
+.. contents::
+    :local:
+"""
+
+import pprint
+import numpy
+from numpy.testing import assert_almost_equal
+import matplotlib.pyplot as plt
+from pandas import DataFrame, concat
+from td3a_cpp.tutorial.mul_cython_omp import dmul_cython_omp
+from td3a_cpp.tools import measure_time_dim
+
+dfs = []
+sets = list(range(2, 145, 20))
+
+##############################
+# numpy mul
+# +++++++++
+#
+
+ctxs = [dict(va=numpy.random.randn(n, n).astype(numpy.float64),
+             vb=numpy.random.randn(n, n).astype(numpy.float64),
+             mul=lambda x, y: x @ y,
+             x_name=n)
+        for n in sets]
+
+res = list(measure_time_dim('mul(va, vb)', ctxs, verbose=1))
+dfs.append(DataFrame(res))
+dfs[-1]['fct'] = 'numpy'
+pprint.pprint(dfs[-1].tail(n=2))
+
+
+##############################
+# Simple multiplication
+# +++++++++++++++++++++
+#
+
+ctxs = [dict(va=numpy.random.randn(n, n).astype(numpy.float64),
+             vb=numpy.random.randn(n, n).astype(numpy.float64),
+             mul=dmul_cython_omp,
+             x_name=n)
+        for n in sets]
+
+res = list(measure_time_dim('mul(va, vb)', ctxs, verbose=1))
+pprint.pprint(dfs[-1].tail(n=2))
+
+
+##############################
+# Other scenarios
+# +++++++++++++++
+#
+# 3 differents algorithms, each of them parallelized.
+# See :func:`dmul_cython_omp
+# <td3a_cpp.tutorial.mul_cython_omp.dmul_cython_omp>`.
+
+for algo in range(0, 2):
+    for parallel in (0, 1):
+        print("algo=%d parallel=%d" % (algo, parallel))
+        ctxs = [dict(va=numpy.random.randn(n, n).astype(numpy.float64),
+                     vb=numpy.random.randn(n, n).astype(numpy.float64),
+                     mul=lambda x, y: dmul_cython_omp(
+                        x, y, algo=algo, parallel=parallel),
+                     x_name=n)
+                for n in sets]
+
+        res = list(measure_time_dim('mul(va, vb)', ctxs, verbose=1))
+        dfs.append(DataFrame(res))
+        dfs[-1]['fct'] = 'a=%d-p=%d' % (algo, parallel)
+        pprint.pprint(dfs[-1].tail(n=2))
+
+########################################
+# One left issue
+# ++++++++++++++
+#
+# Will you find it in :func:`dmul_cython_omp
+# <td3a_cpp.tutorial.mul_cython_omp.dmul_cython_omp>`.
+
+
+va = numpy.random.randn(3, 4).astype(numpy.float64)
+vb = numpy.random.randn(4, 5).astype(numpy.float64)
+numpy_mul = va @ vb
+
+try:
+    for a in range(0, 50):
+        wrong_mul = dmul_cython_omp(va, vb, algo=2, parallel=1)
+        assert_almost_equal(numpy_mul, wrong_mul)
+        print("Iteration %d is Ok" % a)
+    print("All iterations are unexpectedly Ok. Don't push your luck.")
+except AssertionError as e:
+    print(e)
+
+
+##############################
+# Other scenarios but transposed
+# ++++++++++++++++++++++++++++++
+#
+# Same differents algorithms but the second matrix
+# is transposed first: ``b_trans=1``.
+
+
+for algo in range(0, 2):
+    for parallel in (0, 1):
+        print("algo=%d parallel=%d transposed" % (algo, parallel))
+        ctxs = [dict(va=numpy.random.randn(n, n).astype(numpy.float64),
+                     vb=numpy.random.randn(n, n).astype(numpy.float64),
+                     mul=lambda x, y: dmul_cython_omp(
+                        x, y, algo=algo, parallel=parallel, b_trans=1),
+                     x_name=n)
+                for n in sets]
+
+        res = list(measure_time_dim('mul(va, vb)', ctxs, verbose=2))
+        dfs.append(DataFrame(res))
+        dfs[-1]['fct'] = 'a=%d-p=%d-T' % (algo, parallel)
+        pprint.pprint(dfs[-1].tail(n=2))
+
+
+#############################
+# Let's display the results
+# +++++++++++++++++++++++++
+
+cc = concat(dfs)
+cc['N'] = cc['x_name']
+
+fig, ax = plt.subplots(3, 2, figsize=(10, 8))
+cc[~cc.fct.str.contains('-T')].pivot('N', 'fct', 'average').plot(
+    logy=True, logx=True, ax=ax[0, 0])
+cc[~cc.fct.str.contains('-T') & (cc.fct != 'numpy')].pivot(
+    'N', 'fct', 'average').plot(logy=True, logx=True, ax=ax[0, 1])
+cc[cc.fct.str.contains('-T') | (cc.fct == 'numpy')].pivot(
+    'N', 'fct', 'average').plot(logy=True, logx=True, ax=ax[1, 0])
+cc[cc.fct.str.contains('-T') & (cc.fct != 'numpy')].pivot(
+    'N', 'fct', 'average').plot(logy=True, logx=True, ax=ax[1, 1])
+cc[cc.fct.str.contains('a=0')].pivot('N', 'fct', 'average').plot(
+    logy=True, logx=True, ax=ax[2, 1])
+fig.suptitle("Comparison of multiplication implementations")
+
+#################################
+# The results depends on the machine, its
+# number of cores, the compilation settings
+# of :epkg:`numpy` or this module.
+
+plt.show()
diff --git a/setup.py b/setup.py
index b41dd94..9cd6808 100644
--- a/setup.py
+++ b/setup.py
@@ -58,7 +58,8 @@ def get_extension_tutorial(name):
     pattern1 = "td3a_cpp.tutorial.%s"
     srcs = ['td3a_cpp/tutorial/%s.pyx' % name]
     args = get_defined_args()
-    if name in ['dot_cython', 'experiment_cython', 'dot_cython_omp']:
+    if name in ['dot_cython', 'experiment_cython', 'dot_cython_omp',
+                'mul_cython_omp']:
         srcs.extend(['td3a_cpp/tutorial/%s_.cpp' % name])
         args['language'] = "c++"
 
@@ -115,7 +116,8 @@ def get_extension_tutorial(name):
 
 ext_modules = []
 for ext in ['dot_blas_lapack', 'dot_cython',
-            'experiment_cython', 'dot_cython_omp']:
+            'experiment_cython', 'dot_cython_omp',
+            'mul_cython_omp']:
     ext_modules.extend(get_extension_tutorial(ext))
 
 
diff --git a/td3a_cpp/tutorial/dot_cython_omp.pyx b/td3a_cpp/tutorial/dot_cython_omp.pyx
index ca8c360..0a0fe1f 100644
--- a/td3a_cpp/tutorial/dot_cython_omp.pyx
+++ b/td3a_cpp/tutorial/dot_cython_omp.pyx
@@ -23,6 +23,9 @@ cdef double _ddot_cython_array_omp(const double[::1] va, const double[::1] vb,
     
     :param va: first vector, dtype must be float64
     :param vb: second vector, dtype must be float64
+    :param chunksize: see :epkg:`prange`
+    :param schedule: 0 no parallelization, 1 for `'static'`,
+        2 for `'dynamic'`
     :return: dot product
     """
     cdef int n = va.shape[0]
@@ -50,7 +53,8 @@ def ddot_cython_array_omp(const double[::1] va, const double[::1] vb,
     :param va: first vector, dtype must be float64
     :param vb: second vector, dtype must be float64
     :param chunksize: see :epkg:`prange`
-    :param schedule: see :epkg:`prange`
+    :param schedule: 0 simple :epkg:`prange`,
+        1 for `'static'`, 2 for `'dynamic'`
     :return: dot product
     """
     if va.shape[0] != vb.shape[0]:        
diff --git a/td3a_cpp/tutorial/mul_cython_omp.pyx b/td3a_cpp/tutorial/mul_cython_omp.pyx
new file mode 100644
index 0000000..56aa77a
--- /dev/null
+++ b/td3a_cpp/tutorial/mul_cython_omp.pyx
@@ -0,0 +1,204 @@
+"""
+Many implementations of the dot product.
+See `Cython documentation <http://docs.cython.org/en/latest/>`_.
+"""
+from cython.parallel import prange
+cimport numpy as cnumpy
+import numpy as pynumpy
+cimport cython
+from cpython cimport array
+import array
+cimport openmp
+cnumpy.import_array()
+
+
+cdef int _dmul_cython_omp(const double* va, const double* vb, double* res,
+                          Py_ssize_t ni, Py_ssize_t nj, Py_ssize_t nk,
+                          cython.int algo, cython.int parallel) nogil:
+    """
+    matrix multiplication product implemented with cython and C types
+    using :epkg:`prange` (:epkg:`openmp` in :epkg:`cython`).
+
+    :param va: first matrix, dtype must be float64
+    :param vb: second matrix, dtype must be float64
+    :param res: result of the matrix multiplication
+    :param algo: algorithm (see below)
+    :param parallel: kind of parallelization (see below)
+    :return: matrix multiplication
+    """
+    cdef double s
+    cdef Py_ssize_t p, i, j, k
+    if parallel == 0:
+        if algo == 0:
+            for i in range(0, ni):
+                for j in range(0, nj):
+                    s = 0
+                    for k in range(0, nk):
+                        s += va[i * nk + k] * vb[k * nj + j]
+                    res[i * nj + j] = s
+            return 1
+
+        if algo == 1:
+            for j in range(0, nj):
+                for i in range(0, ni):
+                    s = 0
+                    for k in range(0, nk):
+                        s += va[i * nk + k] * vb[k * nj + j]
+                    res[i * nj + j] = s
+            return 1
+
+        if algo == 2:
+            for k in range(0, nk):
+                for i in range(0, ni):
+                    for j in range(0, nj):
+                        res[i * nj + j] += va[i * nk + k] * vb[k * nj + j]
+            return 1
+
+    if parallel == 1:
+        if algo == 0:
+            for p in prange(0, ni):
+                for j in range(0, nj):
+                    res[p * nj + j] = 0
+                    for k in range(0, nk):
+                        res[p * nj + j] += va[p * nk + k] * vb[k * nj + j]
+            return 1
+
+        if algo == 1:
+            for p in prange(0, nj):
+                for i in range(0, ni):
+                    res[i * nj + p] = 0
+                    for k in range(0, nk):
+                        res[i * nj + p] += va[i * nk + k] * vb[k * nj + p]
+            return 1
+
+        if algo == 2:
+            for p in prange(0, nk):
+                for i in range(0, ni):
+                    for j in range(0, nj):
+                        res[i * nj + j] += va[i * nk + p] * vb[p * nj + j]
+            return 1
+    
+    return 0
+
+
+cdef extern from "mul_cython_omp_.h":
+    cdef double vector_ddot_product_pointer16_sse(const double *p1, const double *p2, cython.int size) nogil
+
+
+cdef int _dmul_cython_omp_t(const double* va, const double* vb, double* res,
+                            Py_ssize_t ni, Py_ssize_t nj, Py_ssize_t nk,
+                            cython.int algo, cython.int parallel) nogil:
+    """
+    matrix multiplication product implemented with cython and C types
+    using :epkg:`prange` (:epkg:`openmp` in :epkg:`cython`).
+
+    :param va: first matrix, dtype must be float64
+    :param vb: second matrix, dtype must be float64
+    :param res: result of the matrix multiplication
+    :param algo: algorithm (see below)
+    :param parallel: kind of parallelization (see below)
+    :return: matrix multiplication
+    """
+    cdef double s
+    cdef Py_ssize_t p, i, j, k
+    if parallel == 0:
+        if algo == 0:
+            for i in range(0, ni):
+                for j in range(0, nj):
+                    res[i * nj + j] = vector_ddot_product_pointer16_sse(&va[i * nk], &vb[j * nk], nk)
+            return 1
+
+        if algo == 1:
+            for j in range(0, nj):
+                for i in range(0, ni):
+                    res[i * nj + j] = vector_ddot_product_pointer16_sse(&va[i * nk], &vb[j * nk], nk)
+            return 1
+
+        if algo == 2:
+            for k in range(0, nk):
+                for i in range(0, ni):
+                    for j in range(0, nj):
+                        res[i * nj + j] += va[i * nk + k] * vb[j * nk + k]
+            return 1
+
+    if parallel == 1:
+        if algo == 0:
+            for p in prange(0, ni):
+                for j in range(0, nj):
+                    res[p * nj + j] = vector_ddot_product_pointer16_sse(&va[p * nk], &vb[j * nk], nk)
+                        
+            return 1
+
+        if algo == 1:
+            for p in prange(0, nj):
+                for i in range(0, ni):
+                    res[i * nj + p] = vector_ddot_product_pointer16_sse(&va[i * nk], &vb[p * nk], nk)
+                        
+            return 1
+
+        if algo == 2:
+            for p in prange(0, nk):
+                for i in range(0, ni):
+                    for j in range(0, nj):
+                        res[i * nj + j] += va[i * nk + p] * vb[j * nk + p]
+            return 1
+    
+    return 0
+
+
+@cython.boundscheck(False)
+@cython.cdivision(True)
+@cython.wraparound(False)
+def dmul_cython_omp(const double[:, :] va, const double[:, :] vb,
+                    cython.int algo=0, cython.int parallel=0,
+                    cython.int b_trans=0):
+    """
+    matrix multiplication product implemented with cython and C types
+    using :epkg:`prange` (:epkg:`openmp` in :epkg:`cython`).
+
+    :param va: first matrix, dtype must be float64
+    :param vb: second matrix, dtype must be float64
+    :param algo: algorithm (see below)
+    :param parallel: kind of parallelization (see below)
+    :return: matrix multiplication
+    """    
+    cdef double[:, :] pres
+    cdef cython.int r
+    cdef const double *pva
+    cdef const double *pvb
+    cdef double *ppres
+    if b_trans:
+        if va.shape[1] != vb.shape[1]:
+            raise ValueError(
+                "Shape mismatch, cannot multiply %r and %r" % (va.shape, vb.shape))
+        res = pynumpy.zeros((va.shape[0], vb.shape[0]), dtype=pynumpy.double)    
+        pres = res
+        pva = &va[0, 0]
+        pvb = &vb[0, 0]
+        ppres = &pres[0, 0]
+        with nogil:
+            r = _dmul_cython_omp_t(pva, pvb, ppres,
+                                   va.shape[0], vb.shape[0], va.shape[1],
+                                   algo, parallel)
+        if r != 1:
+            raise RuntimeError(
+                "Unknown value for algo=%r or parallel=%d r=%d" % (algo, parallel, r))
+        return res
+    else:
+        if va.shape[1] != vb.shape[0]:
+            raise ValueError(
+                "Shape mismatch, cannot multiply %r and %r" % (va.shape, vb.shape))
+        res = pynumpy.zeros((va.shape[0], vb.shape[1]), dtype=pynumpy.double)    
+        pres = res
+        pva = &va[0, 0]
+        pvb = &vb[0, 0]
+        ppres = &pres[0, 0]
+        with nogil:
+            r = _dmul_cython_omp(pva, pvb, ppres,
+                                 va.shape[0], vb.shape[1], va.shape[1],
+                                 algo, parallel)
+        if r != 1:
+            raise RuntimeError(
+                "Unknown value for algo=%r or parallel=%d r=%r" % (algo, parallel, r))
+        return res
+
diff --git a/td3a_cpp/tutorial/mul_cython_omp_.cpp b/td3a_cpp/tutorial/mul_cython_omp_.cpp
new file mode 100644
index 0000000..8301a81
--- /dev/null
+++ b/td3a_cpp/tutorial/mul_cython_omp_.cpp
@@ -0,0 +1,33 @@
+#include "mul_cython_omp_.h"
+#include <stdio.h>
+#include <emmintrin.h>  // for double m128d
+
+
+double vector_ddot_product_pointer16_sse(const double *p1, const double *p2, int size) {
+    if (size == 0)
+        return 0;
+    if (size == 1)
+        return *p1 * *p2;
+    double sum = 0;
+    const double* end = p1 + size;
+    if (size % 2 == 1) {
+        sum += *p1 * *p2;
+        ++p1;
+        ++p2;
+        --size;
+    }
+    __m128d c1 = _mm_loadu_pd(p1);
+    __m128d c2 = _mm_loadu_pd(p2);
+    __m128d r1 = _mm_mul_pd(c1, c2);
+    p1 += 2;
+    p2 += 2;
+    for( ; p1 != end; p1 += 2, p2 += 2) {
+        c1 = _mm_loadu_pd(p1);
+        c2 = _mm_loadu_pd(p2);
+        r1 = _mm_add_pd(r1, _mm_mul_pd(c1, c2));
+    }
+    double r[2];  // r is not necessary aligned.
+    _mm_storeu_pd(r, r1);
+    sum += r[0] + r[1];
+    return sum;
+}
diff --git a/td3a_cpp/tutorial/mul_cython_omp_.h b/td3a_cpp/tutorial/mul_cython_omp_.h
new file mode 100644
index 0000000..5604a62
--- /dev/null
+++ b/td3a_cpp/tutorial/mul_cython_omp_.h
@@ -0,0 +1,13 @@
+#pragma once
+
+
+#if !defined(_CRT_SECURE_NO_WARNINGS)
+#define _CRT_SECURE_NO_WARNINGS
+#define undef__CRT_SECURE_NO_WARNINGS 1
+#endif
+
+double vector_ddot_product_pointer16_sse(const double *p1, const double *p2, int size);
+
+#if defined(undef_CRT_SECURE_NO_WARNINGS)
+#undef _CRT_SECURE_NO_WARNINGS
+#endif
diff --git a/tests/test_tutorial_mul.py b/tests/test_tutorial_mul.py
new file mode 100644
index 0000000..60ba39d
--- /dev/null
+++ b/tests/test_tutorial_mul.py
@@ -0,0 +1,95 @@
+"""
+Unit tests for ``random_strategy``.
+"""
+import unittest
+import numpy
+from numpy.testing import assert_almost_equal
+from td3a_cpp.tutorial.mul_cython_omp import dmul_cython_omp
+
+
+class TestTutorialMul(unittest.TestCase):
+
+    def test_matrix_mul(self):
+        va = numpy.random.randn(3, 4).astype(numpy.float64)
+        vb = numpy.random.randn(4, 5).astype(numpy.float64)
+        res1 = va @ vb
+        res2 = dmul_cython_omp(va, vb)
+        assert_almost_equal(res1, res2)
+
+    def test_matrix_mul_fail(self):
+        va = numpy.random.randn(3, 4).astype(numpy.float64)
+        vb = numpy.random.randn(4, 5).astype(numpy.float64)
+        with self.assertRaises(RuntimeError):
+            dmul_cython_omp(va, vb, algo=4)
+
+    def test_matrix_mul_algo(self):
+        va = numpy.random.randn(3, 4).astype(numpy.float64)
+        vb = numpy.random.randn(4, 5).astype(numpy.float64)
+        res1 = va @ vb
+        for algo in range(0, 3):
+            with self.subTest(algo=algo):
+                res2 = dmul_cython_omp(va, vb, algo=algo)
+                assert_almost_equal(res1, res2)
+
+    def test_matrix_mul_algo_para(self):
+        va = numpy.random.randn(3, 4).astype(numpy.float64)
+        vb = numpy.random.randn(4, 5).astype(numpy.float64)
+        res1 = va @ vb
+        for algo in range(0, 2):
+            with self.subTest(algo=algo):
+                res2 = dmul_cython_omp(va, vb, algo=algo, parallel=1)
+                assert_almost_equal(res1, res2)
+
+    def test_matrix_mul_algo_t(self):
+        va = numpy.random.randn(3, 4).astype(numpy.float64)
+        vb = numpy.random.randn(5, 4).astype(numpy.float64)
+        res1 = va @ vb.T
+        for algo in range(0, 3):
+            with self.subTest(algo=algo):
+                res2 = dmul_cython_omp(va, vb, algo=algo,
+                                       b_trans=1)
+                assert_almost_equal(res1, res2)
+
+    def test_matrix_mul_algo_t_big(self):
+        va = numpy.random.randn(300, 400).astype(numpy.float64)
+        vb = numpy.random.randn(500, 400).astype(numpy.float64)
+        res1 = va @ vb.T
+        for algo in range(0, 3):
+            with self.subTest(algo=algo):
+                res2 = dmul_cython_omp(va, vb, algo=algo,
+                                       b_trans=1)
+                assert_almost_equal(res1, res2)
+
+    def test_matrix_mul_algo_t_big_odd(self):
+        va = numpy.random.randn(30, 41).astype(numpy.float64)
+        vb = numpy.random.randn(50, 41).astype(numpy.float64)
+        res1 = va @ vb.T
+        for algo in range(0, 3):
+            with self.subTest(algo=algo):
+                res2 = dmul_cython_omp(va, vb, algo=algo,
+                                       b_trans=1)
+                assert_almost_equal(res1, res2)
+
+    def test_matrix_mul_algo_para_t(self):
+        va = numpy.random.randn(3, 4).astype(numpy.float64)
+        vb = numpy.random.randn(5, 4).astype(numpy.float64)
+        res1 = va @ vb.T
+        for algo in range(0, 2):
+            with self.subTest(algo=algo):
+                res2 = dmul_cython_omp(va, vb, algo=algo, parallel=1,
+                                       b_trans=1)
+                assert_almost_equal(res1, res2)
+
+    def test_matrix_mul_algo_para_t_big(self):
+        va = numpy.random.randn(300, 400).astype(numpy.float64)
+        vb = numpy.random.randn(500, 400).astype(numpy.float64)
+        res1 = va @ vb.T
+        for algo in range(0, 2):
+            with self.subTest(algo=algo):
+                res2 = dmul_cython_omp(va, vb, algo=algo, parallel=1,
+                                       b_trans=1)
+                assert_almost_equal(res1, res2)
+
+
+if __name__ == '__main__':
+    unittest.main()