Skip to content
This repository has been archived by the owner on Jan 13, 2024. It is now read-only.

Commit

Permalink
fix compilation
Browse files Browse the repository at this point in the history
  • Loading branch information
sdpython committed Jan 18, 2019
1 parent b807082 commit 9f4f201
Show file tree
Hide file tree
Showing 7 changed files with 2,445 additions and 2,146 deletions.
2,299 changes: 2,299 additions & 0 deletions _doc/notebooks/cbenchmark_branching.ipynb

Large diffs are not rendered by default.

2,110 changes: 0 additions & 2,110 deletions _doc/notebooks/cbenchmark_sorted_arrays.ipynb

This file was deleted.

4 changes: 2 additions & 2 deletions _doc/sphinxdoc/source/api/numbers.rst
Original file line number Diff line number Diff line change
Expand Up @@ -79,10 +79,10 @@ it calls a function which does the 16 products in one sequence.

.. autosignature:: cpyquickhelper.numbers.vector_dot_product16

The following use :epkg:`AVX` instructions.
The following use :epkg:`SSE` instructions.
See documentation on `Intel website <https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=4895,152,3895,3886,3877,5554,5559,5554,152,127,3895,127&text=_mm_add_ps>`_.

.. autosignature:: cpyquickhelper.numbers.vector_dot_product16_avx
.. autosignature:: cpyquickhelper.numbers.vector_dot_product16_sse

The next one uses AVX instructions with 512 bits.

Expand Down
1 change: 1 addition & 0 deletions _doc/sphinxdoc/source/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,4 +71,5 @@
'pybind11': 'https://github.com/pybind/pybind11',
'pythran': 'https://pythran.readthedocs.io/en/latest/',
'Series': 'https://pandas.pydata.org/pandas-docs/stable/generated/pandas.Series.html',
'SSE': 'https://fr.wikipedia.org/wiki/Streaming_SIMD_Extensions',
})
31 changes: 20 additions & 11 deletions _unittests/ut_numbers/test_cbenchmark.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,8 @@
from src.cpyquickhelper.numbers.cbenchmark import measure_scenario_G, measure_scenario_H # pylint: disable=W0611, E0611
from src.cpyquickhelper.numbers.cbenchmark import measure_scenario_I, measure_scenario_J # pylint: disable=W0611, E0611
from src.cpyquickhelper.numbers.cbenchmark import vector_dot_product, empty_vector_dot_product # pylint: disable=W0611, E0611
from src.cpyquickhelper.numbers.cbenchmark import vector_dot_product16, vector_dot_product16_avx # pylint: disable=W0611, E0611
from src.cpyquickhelper.numbers.cbenchmark import vector_dot_product16, vector_dot_product16_sse # pylint: disable=W0611, E0611
from src.cpyquickhelper.numbers.cbenchmark import vector_dot_product16_avx512, get_simd_available_option # pylint: disable=W0611, E0611


class TestCBenchmark(ExtTestCase):
Expand Down Expand Up @@ -78,29 +79,37 @@ def test_vector_dot_product(self):
d1 = numpy.dot(a, b)
d2 = vector_dot_product(a, b)
d3 = vector_dot_product16(a, b)
d4 = vector_dot_product16_avx(a, b)
d5 = empty_vector_dot_product(a, b)
self.assertEqual(d5, 0)
res = [d1, d2, d3, d4]
d4 = vector_dot_product16_sse(a, b)
d5 = vector_dot_product16_avx512(a, b)
d6 = empty_vector_dot_product(a, b)
self.assertEqual(d6, 0)
res = [d1, d2, d3, d4, d5]
self.assertEqual(d1, d2)
self.assertEqual(d1, d3)
self.assertEqual(d1, d4)
self.assertEqual(len(res), 4)
self.assertEqual(len(res), 5)

def test_vector_dot_product18(self):
a = numpy.array([3, 4, 5] * 6, dtype=numpy.float32)
b = numpy.array([3.1, 4.1, 5.1] * 6, dtype=numpy.float32)
d1 = numpy.dot(a, b)
d2 = vector_dot_product(a, b)
d3 = vector_dot_product16(a, b)
d4 = vector_dot_product16_avx(a, b)
d5 = empty_vector_dot_product(a, b)
self.assertEqual(d5, 0)
res = [d1, d2, d3, d4]
d4 = vector_dot_product16_sse(a, b)
d5 = vector_dot_product16_avx512(a, b)
d6 = empty_vector_dot_product(a, b)
self.assertEqual(d6, 0)
res = [d1, d2, d3, d4, d5]
self.assertAlmostEqual(d1, d2, places=4)
self.assertAlmostEqual(d1, d3, places=4)
self.assertAlmostEqual(d1, d4, places=4)
self.assertEqual(len(res), 4)
self.assertAlmostEqual(d1, d5, places=4)
self.assertEqual(len(res), 5)

def test_get_simd_available_option(self):
    """Check the compiled SIMD option report mentions SSE support."""
    report = get_simd_available_option()
    # The report must contain the generic prefix and at least the SSE flag.
    for fragment in ("options", "__SSE__"):
        self.assertIn(fragment, report)


if __name__ == "__main__":
Expand Down
6 changes: 4 additions & 2 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -180,7 +180,7 @@ def __str__(self):
if sys.platform.startswith("win"):
libraries_thread = ['kernel32']
extra_compile_args_thread = None
extra_compile_args_numbers = ['/EHsc', '-std=c++11']
extra_compile_args_numbers = ['/EHsc']
elif sys.platform.startswith("darwin"):
libraries_thread = None
extra_compile_args_thread = ['-lpthread', '-stdlib=libc++', '-std=c++11',
Expand All @@ -190,7 +190,9 @@ def __str__(self):
else:
libraries_thread = None
extra_compile_args_thread = ['-lpthread', '-std=c++11']
extra_compile_args_numbers = ['-std=c++11']
# option -mavx512f enables AVX 512 instructions
# see https://blog.qiqitori.com/?p=390
extra_compile_args_numbers = ['-std=c++11', '-mavx512f']

ext_thread = Extension('src.cpyquickhelper.parallel.threader',
[os.path.join(root, 'src/cpyquickhelper/parallel/threaderc.cpp'),
Expand Down
140 changes: 119 additions & 21 deletions src/cpyquickhelper/numbers/cbenchmark.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@

namespace py = pybind11;


template<typename DTYPE>
class FunctionMeasureVectorCount : FunctionMeasure
{
Expand Down Expand Up @@ -235,7 +236,7 @@ class FunctionMeasureVectorCountJ : FunctionMeasureVectorCount<DTYPE>
#endif


float vector_dot_product_pointer(const float *p1, const float *p2, int size)
float vector_dot_product_pointer(const float *p1, const float *p2, size_t size)
{
float sum = 0;
const float * end1 = p1 + size;
Expand Down Expand Up @@ -288,13 +289,15 @@ float vector_dot_product_pointer16(const float *p1, const float *p2)

#define BYN 16

float vector_dot_product_pointer16(const float *p1, const float *p2, int size)
float vector_dot_product_pointer16(const float *p1, const float *p2, size_t size)
{
float sum = 0;
int i = 0;
int size_ = size - BYN;
for(; i < size_; i += BYN, p1 += BYN, p2 += BYN)
sum += vector_dot_product_pointer16(p1, p2);
size_t i = 0;
if (size >= BYN) {
size_t size_ = size - BYN;
for(; i < size_; i += BYN, p1 += BYN, p2 += BYN)
sum += vector_dot_product_pointer16(p1, p2);
}
for(; i < size; ++p1, ++p2, ++i)
sum += *p1 * *p2;
return sum;
Expand All @@ -311,7 +314,7 @@ float vector_dot_product16(py::array_t<float> v1, py::array_t<float> v2)

#include <xmmintrin.h>

float vector_dot_product_pointer16_avx(const float *p1, const float *p2)
float vector_dot_product_pointer16_sse(const float *p1, const float *p2)
{
float sum = 0;

Expand Down Expand Up @@ -348,29 +351,33 @@ float vector_dot_product_pointer16_avx(const float *p1, const float *p2)

#define BYN 16

float vector_dot_product_pointer16_avx(const float *p1, const float *p2, int size)
float vector_dot_product_pointer16_sse(const float *p1, const float *p2, size_t size)
{
float sum = 0;
int i = 0;
int size_ = size - BYN;
for(; i < size_; i += BYN, p1 += BYN, p2 += BYN)
sum += vector_dot_product_pointer16_avx(p1, p2);
size_t i = 0;
if (size >= BYN) {
size_t size_ = size - BYN;
for(; i < size_; i += BYN, p1 += BYN, p2 += BYN)
sum += vector_dot_product_pointer16_sse(p1, p2);
}
for(; i < size; ++p1, ++p2, ++i)
sum += *p1 * *p2;
return sum;
}

float vector_dot_product16_avx(py::array_t<float> v1, py::array_t<float> v2)
float vector_dot_product16_sse(py::array_t<float> v1, py::array_t<float> v2)
{
if (v1.ndim() != v2.ndim())
throw std::runtime_error("Vector v1 and v2 must have the same dimension.");
if (v1.ndim() != 1)
throw std::runtime_error("Vector v1 and v2 must be vectors.");
return vector_dot_product_pointer16_avx(v1.data(0), v2.data(0), v1.shape(0));
return vector_dot_product_pointer16_sse(v1.data(0), v2.data(0), v1.shape(0));
}

#include <immintrin.h>

#if defined(__AVX512F__)

float vector_dot_product_pointer16_avx512(const float *p1, const float *p2)
{
float sum = 0;
Expand All @@ -390,13 +397,15 @@ float vector_dot_product_pointer16_avx512(const float *p1, const float *p2)

#define BYN 16

float vector_dot_product_pointer16_avx512(const float *p1, const float *p2, int size)
float vector_dot_product_pointer16_avx512(const float *p1, const float *p2, size_t size)
{
float sum = 0;
int i = 0;
int size_ = size - BYN;
for(; i < size_; i += BYN, p1 += BYN, p2 += BYN)
sum += vector_dot_product_pointer16_avx512(p1, p2);
if (size >= BYN) {
size_t size_ = size - BYN;
for(; i < size_; i += BYN, p1 += BYN, p2 += BYN)
sum += vector_dot_product_pointer16_avx512(p1, p2);
}
for(; i < size; ++p1, ++p2, ++i)
sum += *p1 * *p2;
return sum;
Expand All @@ -411,8 +420,74 @@ float vector_dot_product16_avx512(py::array_t<float> v1, py::array_t<float> v2)
return vector_dot_product_pointer16_avx512(v1.data(0), v2.data(0), v1.shape(0));
}

#endif


std::string get_simd_available_option()
{
    // Reports which SIMD-related macros the compiler defined when this
    // translation unit was built, as a human-readable string.
    std::string opts;

#if defined(__SSE__) || (defined(_MSC_VER) && !defined(_M_CEE_PURE))
    // __SSE__ not defined by Visual Studio.
    opts += " __SSE__";
#endif
#if defined(__SSE2__) || (defined(_MSC_VER) && !defined(_M_CEE_PURE))
    // __SSE2__ not defined by Visual Studio.
    opts += " __SSE2__";
#endif
#if defined(__SSE3__) || (defined(_MSC_VER) && !defined(_M_CEE_PURE))
    // __SSE3__ not defined by Visual Studio.
    opts += " __SSE3__";
#endif
#if defined(__SSE4_1__) || (defined(_MSC_VER) && !defined(_M_CEE_PURE))
    // __SSE4_1__ not defined by Visual Studio.
    opts += " __SSE4_1__";
#endif
#if defined(__AVX__)
    opts += " __AVX__";
#endif
#if defined(__AVX2__)
    opts += " __AVX2__";
#endif
#if defined(__AVX512F__)
    opts += " __AVX512F__";
#endif
#if defined(__AVX512DQ__)
    opts += " __AVX512DQ__";
#endif
#if defined(__AVX512PF__)
    opts += " __AVX512PF__";
#endif
#if defined(__AVX512ER__)
    opts += " __AVX512ER__";
#endif
#if defined(__AVX512CD__)
    opts += " __AVX512CD__";
#endif
#if defined(__AVX512BW__)
    opts += " __AVX512BW__";
#endif
#if defined(__AVX512VL__)
    opts += " __AVX512VL__";
#endif
#if defined(__FMA__)
    opts += " __FMA__";
#endif
#if defined(__AVX512IFMA__)
    opts += " __AVX512IFMA__";
#endif
#if defined(__F16C__)
    opts += " __F16C__";
#endif
#if defined(__ARM_NEON__)
    opts += " __ARM_NEON__";
#endif

    // Early returns instead of a ternary; strings are unchanged.
    if (opts.empty())
        return "No available options.";
    return std::string("Available options: ") + opts;
}


PYBIND11_MODULE(cbenchmark, m) {
m.doc() =
Expand Down Expand Up @@ -472,8 +547,31 @@ also implemented in C.)pbdoc"
"Empty measure to have an idea about the processing due to python binding.");
m.def("vector_dot_product16", &vector_dot_product16,
"Computes a dot product in C++ with vectors of floats. Goes 16 by 16.");
m.def("vector_dot_product16_avx", &vector_dot_product16_avx,
"Computes a dot product in C++ with vectors of floats. Goes 16 by 16. Use AVX instructions.");
m.def("vector_dot_product16_sse", &vector_dot_product16_sse,
"Computes a dot product in C++ with vectors of floats. Goes 16 by 16. Use SSE instructions.");

#if defined(__AVX512F__)
m.def("vector_dot_product16_avx512", &vector_dot_product16_avx512,
"Computes a dot product in C++ with vectors of floats. Goes 16 by 16. Use AVX 512 instructions.");
"Computes a dot product in C++ with vectors of floats. Goes 16 by 16. Use AVX 512 instructions because ``__AVX512F__`` is defined.");
#else
m.def("vector_dot_product16_avx512", &vector_dot_product16_sse,
"Computes a dot product in C++ with vectors of floats. Goes 16 by 16. Use SSE instructions because ``__AVX512F__`` is not defined.");
#endif

m.def ("get_simd_available_option", &get_simd_available_option,
#if defined(__APPLE__)
"Returns the available compilation options for SIMD."
#else
R"pbdoc(Returns the available compilation options for SIMD.
It can simply be called with the following example:
.. runpython::
:showcode:
from cpyquickhelper.numbers.cbenchmark import get_simd_available_option
print(get_simd_available_option())
)pbdoc"
#endif
);

}

0 comments on commit 9f4f201

Please sign in to comment.