Skip to content
This repository has been archived by the owner on Jan 13, 2024. It is now read-only.

Commit

Permalink
fix compilation
Browse files Browse the repository at this point in the history
  • Loading branch information
sdpython committed Jan 18, 2019
1 parent b807082 commit 9f4f201
Show file tree
Hide file tree
Showing 7 changed files with 2,445 additions and 2,146 deletions.
2,299 changes: 2,299 additions & 0 deletions _doc/notebooks/cbenchmark_branching.ipynb

Large diffs are not rendered by default.

2,110 changes: 0 additions & 2,110 deletions _doc/notebooks/cbenchmark_sorted_arrays.ipynb

This file was deleted.

4 changes: 2 additions & 2 deletions _doc/sphinxdoc/source/api/numbers.rst
Original file line number Diff line number Diff line change
Expand Up @@ -79,10 +79,10 @@ it calls a function which does the 16 products in one sequence.

.. autosignature:: cpyquickhelper.numbers.vector_dot_product16

The following use :epkg:`AVX` instructions.
The following use :epkg:`SSE` instructions.
See documentation on `Intel website <https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=4895,152,3895,3886,3877,5554,5559,5554,152,127,3895,127&text=_mm_add_ps>`_.

.. autosignature:: cpyquickhelper.numbers.vector_dot_product16_avx
.. autosignature:: cpyquickhelper.numbers.vector_dot_product16_sse

The next one uses AVX instructions with 512 bits.

Expand Down
1 change: 1 addition & 0 deletions _doc/sphinxdoc/source/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,4 +71,5 @@
'pybind11': 'https://github.com/pybind/pybind11',
'pythran': 'https://pythran.readthedocs.io/en/latest/',
'Series': 'https://pandas.pydata.org/pandas-docs/stable/generated/pandas.Series.html',
'SSE': 'https://fr.wikipedia.org/wiki/Streaming_SIMD_Extensions',
})
31 changes: 20 additions & 11 deletions _unittests/ut_numbers/test_cbenchmark.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,8 @@
from src.cpyquickhelper.numbers.cbenchmark import measure_scenario_G, measure_scenario_H # pylint: disable=W0611, E0611
from src.cpyquickhelper.numbers.cbenchmark import measure_scenario_I, measure_scenario_J # pylint: disable=W0611, E0611
from src.cpyquickhelper.numbers.cbenchmark import vector_dot_product, empty_vector_dot_product # pylint: disable=W0611, E0611
from src.cpyquickhelper.numbers.cbenchmark import vector_dot_product16, vector_dot_product16_avx # pylint: disable=W0611, E0611
from src.cpyquickhelper.numbers.cbenchmark import vector_dot_product16, vector_dot_product16_sse # pylint: disable=W0611, E0611
from src.cpyquickhelper.numbers.cbenchmark import vector_dot_product16_avx512, get_simd_available_option # pylint: disable=W0611, E0611


class TestCBenchmark(ExtTestCase):
Expand Down Expand Up @@ -78,29 +79,37 @@ def test_vector_dot_product(self):
d1 = numpy.dot(a, b)
d2 = vector_dot_product(a, b)
d3 = vector_dot_product16(a, b)
d4 = vector_dot_product16_avx(a, b)
d5 = empty_vector_dot_product(a, b)
self.assertEqual(d5, 0)
res = [d1, d2, d3, d4]
d4 = vector_dot_product16_sse(a, b)
d5 = vector_dot_product16_avx512(a, b)
d6 = empty_vector_dot_product(a, b)
self.assertEqual(d6, 0)
res = [d1, d2, d3, d4, d5]
self.assertEqual(d1, d2)
self.assertEqual(d1, d3)
self.assertEqual(d1, d4)
self.assertEqual(len(res), 4)
self.assertEqual(len(res), 5)

def test_vector_dot_product18(self):
a = numpy.array([3, 4, 5] * 6, dtype=numpy.float32)
b = numpy.array([3.1, 4.1, 5.1] * 6, dtype=numpy.float32)
d1 = numpy.dot(a, b)
d2 = vector_dot_product(a, b)
d3 = vector_dot_product16(a, b)
d4 = vector_dot_product16_avx(a, b)
d5 = empty_vector_dot_product(a, b)
self.assertEqual(d5, 0)
res = [d1, d2, d3, d4]
d4 = vector_dot_product16_sse(a, b)
d5 = vector_dot_product16_avx512(a, b)
d6 = empty_vector_dot_product(a, b)
self.assertEqual(d6, 0)
res = [d1, d2, d3, d4, d5]
self.assertAlmostEqual(d1, d2, places=4)
self.assertAlmostEqual(d1, d3, places=4)
self.assertAlmostEqual(d1, d4, places=4)
self.assertEqual(len(res), 4)
self.assertAlmostEqual(d1, d5, places=4)
self.assertEqual(len(res), 5)

def test_get_simd_available_option(self):
    """Check the compiled SIMD option report mentions SSE support."""
    report = get_simd_available_option()
    # The report must contain the generic prefix and at least the SSE flag.
    for fragment in ("options", "__SSE__"):
        self.assertIn(fragment, report)


if __name__ == "__main__":
Expand Down
6 changes: 4 additions & 2 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -180,7 +180,7 @@ def __str__(self):
if sys.platform.startswith("win"):
libraries_thread = ['kernel32']
extra_compile_args_thread = None
extra_compile_args_numbers = ['/EHsc', '-std=c++11']
extra_compile_args_numbers = ['/EHsc']
elif sys.platform.startswith("darwin"):
libraries_thread = None
extra_compile_args_thread = ['-lpthread', '-stdlib=libc++', '-std=c++11',
Expand All @@ -190,7 +190,9 @@ def __str__(self):
else:
libraries_thread = None
extra_compile_args_thread = ['-lpthread', '-std=c++11']
extra_compile_args_numbers = ['-std=c++11']
# option -mavx512f enables AVX 512 instructions
# see https://blog.qiqitori.com/?p=390
extra_compile_args_numbers = ['-std=c++11', '-mavx512f']

ext_thread = Extension('src.cpyquickhelper.parallel.threader',
[os.path.join(root, 'src/cpyquickhelper/parallel/threaderc.cpp'),
Expand Down
140 changes: 119 additions & 21 deletions src/cpyquickhelper/numbers/cbenchmark.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@

namespace py = pybind11;


template<typename DTYPE>
class FunctionMeasureVectorCount : FunctionMeasure
{
Expand Down Expand Up @@ -235,7 +236,7 @@ class FunctionMeasureVectorCountJ : FunctionMeasureVectorCount<DTYPE>
#endif


float vector_dot_product_pointer(const float *p1, const float *p2, int size)
float vector_dot_product_pointer(const float *p1, const float *p2, size_t size)
{
float sum = 0;
const float * end1 = p1 + size;
Expand Down Expand Up @@ -288,13 +289,15 @@ float vector_dot_product_pointer16(const float *p1, const float *p2)

#define BYN 16

float vector_dot_product_pointer16(const float *p1, const float *p2, int size)
float vector_dot_product_pointer16(const float *p1, const float *p2, size_t size)
{
float sum = 0;
int i = 0;
int size_ = size - BYN;
for(; i < size_; i += BYN, p1 += BYN, p2 += BYN)
sum += vector_dot_product_pointer16(p1, p2);
size_t i = 0;
if (size >= BYN) {
size_t size_ = size - BYN;
for(; i < size_; i += BYN, p1 += BYN, p2 += BYN)
sum += vector_dot_product_pointer16(p1, p2);
}
for(; i < size; ++p1, ++p2, ++i)
sum += *p1 * *p2;
return sum;
Expand All @@ -311,7 +314,7 @@ float vector_dot_product16(py::array_t<float> v1, py::array_t<float> v2)

#include <xmmintrin.h>

float vector_dot_product_pointer16_avx(const float *p1, const float *p2)
float vector_dot_product_pointer16_sse(const float *p1, const float *p2)
{
float sum = 0;

Expand Down Expand Up @@ -348,29 +351,33 @@ float vector_dot_product_pointer16_avx(const float *p1, const float *p2)

#define BYN 16

float vector_dot_product_pointer16_avx(const float *p1, const float *p2, int size)
float vector_dot_product_pointer16_sse(const float *p1, const float *p2, size_t size)
{
float sum = 0;
int i = 0;
int size_ = size - BYN;
for(; i < size_; i += BYN, p1 += BYN, p2 += BYN)
sum += vector_dot_product_pointer16_avx(p1, p2);
size_t i = 0;
if (size >= BYN) {
size_t size_ = size - BYN;
for(; i < size_; i += BYN, p1 += BYN, p2 += BYN)
sum += vector_dot_product_pointer16_sse(p1, p2);
}
for(; i < size; ++p1, ++p2, ++i)
sum += *p1 * *p2;
return sum;
}

float vector_dot_product16_avx(py::array_t<float> v1, py::array_t<float> v2)
float vector_dot_product16_sse(py::array_t<float> v1, py::array_t<float> v2)
{
if (v1.ndim() != v2.ndim())
throw std::runtime_error("Vector v1 and v2 must have the same dimension.");
if (v1.ndim() != 1)
throw std::runtime_error("Vector v1 and v2 must be vectors.");
return vector_dot_product_pointer16_avx(v1.data(0), v2.data(0), v1.shape(0));
return vector_dot_product_pointer16_sse(v1.data(0), v2.data(0), v1.shape(0));
}

#include <immintrin.h>

#if defined(__AVX512F__)

float vector_dot_product_pointer16_avx512(const float *p1, const float *p2)
{
float sum = 0;
Expand All @@ -390,13 +397,15 @@ float vector_dot_product_pointer16_avx512(const float *p1, const float *p2)

#define BYN 16

float vector_dot_product_pointer16_avx512(const float *p1, const float *p2, int size)
float vector_dot_product_pointer16_avx512(const float *p1, const float *p2, size_t size)
{
float sum = 0;
int i = 0;
int size_ = size - BYN;
for(; i < size_; i += BYN, p1 += BYN, p2 += BYN)
sum += vector_dot_product_pointer16_avx512(p1, p2);
if (size >= BYN) {
size_t size_ = size - BYN;
for(; i < size_; i += BYN, p1 += BYN, p2 += BYN)
sum += vector_dot_product_pointer16_avx512(p1, p2);
}
for(; i < size; ++p1, ++p2, ++i)
sum += *p1 * *p2;
return sum;
Expand All @@ -411,8 +420,74 @@ float vector_dot_product16_avx512(py::array_t<float> v1, py::array_t<float> v2)
return vector_dot_product_pointer16_avx512(v1.data(0), v2.data(0), v1.shape(0));
}

#endif


std::string get_simd_available_option()
{
    // Reports which SIMD-related macros the compiler defined when this
    // translation unit was built, as a human-readable string.
    std::string opts;

#if defined(__SSE__) || (defined(_MSC_VER) && !defined(_M_CEE_PURE))
    // __SSE__ not defined by Visual Studio.
    opts += " __SSE__";
#endif
#if defined(__SSE2__) || (defined(_MSC_VER) && !defined(_M_CEE_PURE))
    // __SSE2__ not defined by Visual Studio.
    opts += " __SSE2__";
#endif
#if defined(__SSE3__) || (defined(_MSC_VER) && !defined(_M_CEE_PURE))
    // __SSE3__ not defined by Visual Studio.
    opts += " __SSE3__";
#endif
#if defined(__SSE4_1__) || (defined(_MSC_VER) && !defined(_M_CEE_PURE))
    // __SSE4_1__ not defined by Visual Studio.
    opts += " __SSE4_1__";
#endif
#if defined(__AVX__)
    opts += " __AVX__";
#endif
#if defined(__AVX2__)
    opts += " __AVX2__";
#endif
#if defined(__AVX512F__)
    opts += " __AVX512F__";
#endif
#if defined(__AVX512DQ__)
    opts += " __AVX512DQ__";
#endif
#if defined(__AVX512PF__)
    opts += " __AVX512PF__";
#endif
#if defined(__AVX512ER__)
    opts += " __AVX512ER__";
#endif
#if defined(__AVX512CD__)
    opts += " __AVX512CD__";
#endif
#if defined(__AVX512BW__)
    opts += " __AVX512BW__";
#endif
#if defined(__AVX512VL__)
    opts += " __AVX512VL__";
#endif
#if defined(__FMA__)
    opts += " __FMA__";
#endif
#if defined(__AVX512IFMA__)
    opts += " __AVX512IFMA__";
#endif
#if defined(__F16C__)
    opts += " __F16C__";
#endif
#if defined(__ARM_NEON__)
    opts += " __ARM_NEON__";
#endif

    // Early returns instead of a ternary; strings are unchanged.
    if (opts.empty())
        return "No available options.";
    return std::string("Available options: ") + opts;
}


PYBIND11_MODULE(cbenchmark, m) {
m.doc() =
Expand Down Expand Up @@ -472,8 +547,31 @@ also implemented in C.)pbdoc"
"Empty measure to have an idea about the processing due to python binding.");
m.def("vector_dot_product16", &vector_dot_product16,
"Computes a dot product in C++ with vectors of floats. Goes 16 by 16.");
m.def("vector_dot_product16_avx", &vector_dot_product16_avx,
"Computes a dot product in C++ with vectors of floats. Goes 16 by 16. Use AVX instructions.");
m.def("vector_dot_product16_sse", &vector_dot_product16_sse,
"Computes a dot product in C++ with vectors of floats. Goes 16 by 16. Use SSE instructions.");

#if defined(__AVX512F__)
m.def("vector_dot_product16_avx512", &vector_dot_product16_avx512,
"Computes a dot product in C++ with vectors of floats. Goes 16 by 16. Use AVX 512 instructions.");
"Computes a dot product in C++ with vectors of floats. Goes 16 by 16. Use AVX 512 instructions because ``__AVX512F__`` is defined.");
#else
m.def("vector_dot_product16_avx512", &vector_dot_product16_sse,
"Computes a dot product in C++ with vectors of floats. Goes 16 by 16. Use SSE instructions because ``__AVX512F__`` is not defined.");
#endif

m.def ("get_simd_available_option", &get_simd_available_option,
#if defined(__APPLE__)
"Returns the available compilation options for SIMD."
#else
R"pbdoc(Returns the available compilation options for SIMD.
It can simply be called with the following example:
.. runpython::
:showcode:
from cpyquickhelper.numbers.cbenchmark import get_simd_available_option
print(get_simd_available_option())
)pbdoc"
#endif
);

}

0 comments on commit 9f4f201

Please sign in to comment.