Merge pull request #4186 into maintenance/0.14.x

BUG: Workaround for SGEMV segfault in Accelerate
scipy · Nov 23, 2014 · 720f091 · 720f091
2 parents 52fb336 + 63d5c2b
commit 720f091
Show file tree

Hide file tree

Showing 6 changed files with 319 additions and 16 deletions.
diff --git a/scipy/_build_utils/_fortran.py b/scipy/_build_utils/_fortran.py
@@ -6,27 +6,34 @@
 
 
 __all__ = ['needs_g77_abi_wrapper', 'split_fortran_files',
-           'get_g77_abi_wrappers']
+           'get_g77_abi_wrappers',
+           'needs_sgemv_fix', 'get_sgemv_fix']
 
 
-def _uses_veclib(info):
-    r_accelerate = re.compile("Accelerate|vecLib")
-
+def uses_veclib(info):
+    if sys.platform != "darwin":
+        return False
+    r_accelerate = re.compile("vecLib")
     extra_link_args = info.get('extra_link_args', '')
     for arg in extra_link_args:
         if r_accelerate.search(arg):
             return True
-
     return False
 
 
 def uses_accelerate(info):
-    return _uses_veclib(info)
+    if sys.platform != "darwin":
+        return False
+    r_accelerate = re.compile("Accelerate")
+    extra_link_args = info.get('extra_link_args', '')
+    for arg in extra_link_args:
+        if r_accelerate.search(arg):
+            return True
+    return False
 
 
 def uses_mkl(info):
     r_mkl = re.compile("mkl_core")
-
     libraries = info.get('libraries', '')
     for library in libraries:
         if r_mkl.search(library):
@@ -36,8 +43,8 @@ def uses_mkl(info):
 
 
 def needs_g77_abi_wrapper(info):
-    """Returns true if g77 ABI wrapper must be used."""
-    if uses_accelerate(info):
+    """Returns True if g77 ABI wrapper must be used."""
+    if uses_accelerate(info) or uses_veclib(info):
         return True
     # XXX: is this really true only on Mac OS X ?
     elif uses_mkl(info) and sys.platform == "darwin":
@@ -78,6 +85,23 @@ def get_g77_abi_wrappers(info):
     return wrapper_sources
 
 
+def needs_sgemv_fix(info):
+    """Returns True if SGEMV must be fixed."""
+    if uses_accelerate(info):
+        return True
+    else:
+        return False
+
+
+def get_sgemv_fix(info):
+    """ Returns source file needed to correct SGEMV """
+    path = os.path.abspath(os.path.dirname(__file__))
+    if needs_sgemv_fix(info):
+        return [os.path.join(path, 'src', 'apple_sgemv_fix.c')]
+    else:
+        return []
+
+
 def split_fortran_files(source_dir, subroutines=None):
     """Split each file in `source_dir` into separate files per subroutine.
 

diff --git a/scipy/_build_utils/src/apple_sgemv_fix.c b/scipy/_build_utils/src/apple_sgemv_fix.c
@@ -0,0 +1,229 @@
+/* This is a collection of ugly hacks to circumvent a bug in 
+ * Apple Accelerate framework's SGEMV subroutine.
+ *
+ * See: https://github.com/numpy/numpy/issues/4007
+ *
+ * SGEMV in Accelerate framework will segfault on MacOS X version 10.9
+ * (aka Mavericks) if arrays are not aligned to 32 byte boundaries
+ * and the CPU supports AVX instructions. This can produce segfaults
+ * in parts of SciPy whcih uses the SGEMV subroutine. As of October
+ * 2014 that is scipy.linalg.blas, SuperLU and ARPACK. This patch 
+ * overshadows the symbols cblas_sgemv, sgemv_ and sgemv exported by 
+ * Accelerate to produce the correct behavior. The MacOS X version and 
+ * CPU specs are checked on module import. If Mavericks and AVX are 
+ * detected the call to SGEMV is emulated with a call to SGEMM if the 
+ * arrays are not 32 byte aligned. If the exported symbols cannot be
+ * overshadowed on module import, a fatal error is produced and the
+ * process aborts. All the fixes are in a self-contained C file. 
+ * The patch is not applied unless SciPy is configured to link with 
+ * Apple's Accelerate framework. 
+ *
+ */
+
+#define NPY_NO_DEPRECATED_API NPY_API_VERSION
+#include "Python.h"
+#include "numpy/arrayobject.h"
+
+#include <string.h>
+#include <dlfcn.h>
+#include <stdlib.h>
+#include <stdio.h>
+
+/* ----------------------------------------------------------------- */
+/* Original cblas_sgemv */
+
+#define VECLIB_FILE "/System/Library/Frameworks/Accelerate.framework/Versions/A/Frameworks/vecLib.framework/vecLib"
+
+enum CBLAS_ORDER {CblasRowMajor=101, CblasColMajor=102};
+enum CBLAS_TRANSPOSE {CblasNoTrans=111, CblasTrans=112, CblasConjTrans=113};
+extern void cblas_xerbla(int info, const char *rout, const char *form, ...);
+
+typedef void cblas_sgemv_t(const enum CBLAS_ORDER order,
+                 const enum CBLAS_TRANSPOSE TransA, const int M, const int N,
+                 const float alpha, const float  *A, const int lda,
+                 const float  *X, const int incX, 
+                 const float beta, float  *Y, const int incY);
+
+typedef void cblas_sgemm_t(const enum CBLAS_ORDER order,
+                 const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_TRANSPOSE TransB,
+                 const int M, const int N, const int K,
+                 const float alpha, const float  *A, const int lda,
+                 const float  *B, const int ldb, 
+                 const float beta, float  *C, const int incC); 
+
+typedef void fortran_sgemv_t( const char* trans, const int* m, const int* n,
+             const float* alpha, const float* A, const int* ldA,
+             const float* X, const int* incX,
+             const float* beta, float* Y, const int* incY );
+
+static void *veclib = NULL;
+static cblas_sgemv_t *accelerate_cblas_sgemv = NULL;
+static cblas_sgemm_t *accelerate_cblas_sgemm = NULL;
+static fortran_sgemv_t *accelerate_sgemv = NULL;
+static int AVX_and_10_9 = 0;
+
+/* Dynamic check for AVX support
+ * __builtin_cpu_supports("avx") is available in gcc 4.8,
+ * but clang and icc do not currently support it. */
+#define cpu_supports_avx()\
+(system("sysctl -n machdep.cpu.features | grep -q AVX") == 0)
+
+/* Check if we are using MacOS X version 10.9 */
+#define using_mavericks()\
+(system("sw_vers -productVersion | grep -q 10\\.9\\.") == 0) 
+
+__attribute__((destructor))
+static void unloadlib(void)
+{
+   if (veclib) dlclose(veclib);
+}
+
+__attribute__((constructor))
+static void loadlib()
+/* automatically executed on module import */
+{
+    char errormsg[1024];
+    int AVX, MAVERICKS;
+    memset((void*)errormsg, 0, sizeof(errormsg));
+    /* check if the CPU supports AVX */
+    AVX = cpu_supports_avx();
+    /* check if the OS is MacOS X Mavericks */
+    MAVERICKS = using_mavericks();    
+    /* we need the workaround when the CPU supports
+     * AVX and the OS version is Mavericks */
+    AVX_and_10_9 = AVX && MAVERICKS;
+    /* load vecLib */
+    veclib = dlopen(VECLIB_FILE, RTLD_LOCAL | RTLD_FIRST);
+    if (!veclib) {
+        veclib = NULL;
+        sprintf(errormsg,"Failed to open vecLib from location '%s'.", VECLIB_FILE);
+        Py_FatalError(errormsg); /* calls abort() and dumps core */
+    }
+    /* resolve Fortran SGEMV from Accelerate */
+    accelerate_sgemv = (fortran_sgemv_t*) dlsym(veclib, "sgemv_");
+    if (!accelerate_sgemv) {
+        unloadlib();
+        sprintf(errormsg,"Failed to resolve symbol 'sgemv_'.");
+        Py_FatalError(errormsg);
+    }
+    /* resolve cblas_sgemv from Accelerate */
+    accelerate_cblas_sgemv = (cblas_sgemv_t*) dlsym(veclib, "cblas_sgemv");
+    if (!accelerate_cblas_sgemv) {
+        unloadlib();
+        sprintf(errormsg,"Failed to resolve symbol 'cblas_sgemv'.");
+        Py_FatalError(errormsg);
+    }
+    /* resolve cblas_sgemm from Accelerate */
+    accelerate_cblas_sgemm = (cblas_sgemm_t*) dlsym(veclib, "cblas_sgemm");
+    if (!accelerate_cblas_sgemm) {
+        unloadlib();
+        sprintf(errormsg,"Failed to resolve symbol 'cblas_sgemm'.");
+        Py_FatalError(errormsg);
+    }
+}
+
+/* ----------------------------------------------------------------- */
+/* Fortran SGEMV override */
+
+void sgemv_( const char* trans, const int* m, const int* n,
+             const float* alpha, const float* A, const int* ldA,
+             const float* X, const int* incX,
+             const float* beta, float* Y, const int* incY )
+{    
+    /* It is safe to use the original SGEMV if we are not using AVX on Mavericks
+     * or the input arrays A, X and Y are all aligned on 32 byte boundaries. */
+    #define BADARRAY(x) (((npy_intp)(void*)x) % 32)
+    const int use_sgemm = AVX_and_10_9 && (BADARRAY(A) || BADARRAY(X) || BADARRAY(Y));
+    if (!use_sgemm) {
+        accelerate_sgemv(trans,m,n,alpha,A,ldA,X,incX,beta,Y,incY);
+        return;
+    }
+
+    /* Arrays are misaligned, the CPU supports AVX, and we are running 
+     * Mavericks.
+     *
+     * Emulation of SGEMV with SGEMM: 
+     *
+     * SGEMV allows vectors to be strided. SGEMM requires all arrays to be
+     * contiguous along the leading dimension. To emulate striding in SGEMV
+     * with the leading dimension arguments in SGEMM we compute
+     *
+     *    Y = alpha * op(A) @ X + beta * Y
+     *
+     * as
+     *
+     *    Y.T = alpha * X.T @ op(A).T + beta * Y.T
+     *
+     * Because Fortran uses column major order and X.T and Y.T are row vectors,
+     * the leading dimensions of X.T and Y.T in SGEMM become equal to the 
+     * strides of the the column vectors X and Y in SGEMV. */
+
+    switch (*trans) {
+        case 'T':
+        case 't':
+        case 'C':
+        case 'c':
+            accelerate_cblas_sgemm( CblasColMajor, CblasNoTrans, CblasNoTrans,
+                1, *n, *m, *alpha, X, *incX, A, *ldA, *beta, Y, *incY );
+            break;
+        case 'N':
+        case 'n':
+            accelerate_cblas_sgemm( CblasColMajor, CblasNoTrans, CblasTrans,
+                1, *m, *n, *alpha, X, *incX, A, *ldA, *beta, Y, *incY );
+            break;
+        default:
+            cblas_xerbla(1, "SGEMV", "Illegal transpose setting: %c\n", *trans);
+    }
+}
+
+/* ----------------------------------------------------------------- */
+/* Override for an alias symbol for sgemv_ in Accelerate */
+
+void sgemv (char *trans,
+            const int *m, const int *n,
+            const float *alpha,
+            const float *A, const int *lda,
+            const float *B, const int *incB,
+            const float *beta,
+            float *C, const int *incC)
+{
+    sgemv_(trans,m,n,alpha,A,lda,B,incB,beta,C,incC);
+}
+
+/* ----------------------------------------------------------------- */
+/* cblas_sgemv override, based on Netlib CBLAS code */
+
+void cblas_sgemv(const enum CBLAS_ORDER order,
+                 const enum CBLAS_TRANSPOSE TransA, const int M, const int N,
+                 const float alpha, const float  *A, const int lda,
+                 const float  *X, const int incX, const float beta,
+                 float  *Y, const int incY)
+{
+   char TA;
+   if (order == CblasColMajor)
+   {
+      if (TransA == CblasNoTrans) TA = 'N';
+      else if (TransA == CblasTrans) TA = 'T';
+      else if (TransA == CblasConjTrans) TA = 'C';
+      else 
+      {
+         cblas_xerbla(2, "cblas_sgemv","Illegal TransA setting, %d\n", TransA);
+      }
+      sgemv_(&TA, &M, &N, &alpha, A, &lda, X, &incX, &beta, Y, &incY);
+   }
+   else if (order == CblasRowMajor)
+   {
+      if (TransA == CblasNoTrans) TA = 'T';
+      else if (TransA == CblasTrans) TA = 'N';
+      else if (TransA == CblasConjTrans) TA = 'N';
+      else
+      {
+         cblas_xerbla(2, "cblas_sgemv", "Illegal TransA setting, %d\n", TransA);
+         return;
+      }
+      sgemv_(&TA, &N, &M, &alpha, A, &lda, X, &incX, &beta, Y, &incY);
+   }
+   else
+      cblas_xerbla(1, "cblas_sgemv", "Illegal Order setting, %d\n", order);
+}
+
diff --git a/scipy/linalg/setup.py b/scipy/linalg/setup.py
@@ -8,7 +8,7 @@
 def configuration(parent_package='',top_path=None):
     from numpy.distutils.system_info import get_info, NotFoundError
     from numpy.distutils.misc_util import Configuration
-    from scipy._build_utils import get_g77_abi_wrappers, split_fortran_files
+    from scipy._build_utils import get_sgemv_fix, get_g77_abi_wrappers, split_fortran_files
 
     config = Configuration('linalg',parent_package,top_path)
 
@@ -25,6 +25,7 @@ def configuration(parent_package='',top_path=None):
     # fblas:
     sources = ['fblas.pyf.src']
     sources += get_g77_abi_wrappers(lapack_opt)
+    sources += get_sgemv_fix(lapack_opt)
 
     config.add_extension('_fblas',
                          sources=sources,

diff --git a/scipy/linalg/tests/test_fblas.py b/scipy/linalg/tests/test_fblas.py
@@ -15,7 +15,7 @@
 from scipy.lib.six import xrange
 
 from numpy.testing import TestCase, run_module_suite, assert_array_equal, \
-    assert_array_almost_equal, assert_
+    assert_allclose, assert_array_almost_equal, assert_
 
 
 # decimal accuracy to require between Python and LAPACK/BLAS calculations
@@ -463,6 +463,49 @@ def test_y_stride_assert(self):
     class TestSgemv(TestCase, BaseGemv):
         blas_func = fblas.sgemv
         dtype = float32
+
+        def test_sgemv_on_osx(self):
+            from itertools import product
+            import sys
+            import numpy as np
+
+            if sys.platform != 'darwin':
+                return
+
+            def aligned_array(shape, align, dtype, order='C'):
+                # Make array shape `shape` with aligned at `align` bytes
+                d = dtype()
+                # Make array of correct size with `align` extra bytes
+                N = np.prod(shape)
+                tmp = np.zeros(N * d.nbytes + align, dtype=np.uint8)
+                address = tmp.__array_interface__["data"][0]
+                # Find offset into array giving desired alignment
+                for offset in range(align):
+                    if (address + offset) % align == 0: 
+                        break
+                tmp = tmp[offset:offset+N*d.nbytes].view(dtype=dtype)
+                return tmp.reshape(shape, order=order)
+
+            def as_aligned(arr, align, dtype, order='C'):
+                # Copy `arr` into an aligned array with same shape
+                aligned = aligned_array(arr.shape, align, dtype, order)
+                aligned[:] = arr[:]
+                return aligned
+
+            def assert_dot_close(A, X, desired):
+                assert_allclose(self.blas_func(1.0,A,X), desired,
+                    rtol=1e-5, atol=1e-7)
+
+            testdata = product((15,32), (10000,), (200,89), ('C','F'))
+            for align, m, n, a_order in testdata:
+                A_d = np.random.rand(m, n)
+                X_d = np.random.rand(n)
+                desired = np.dot(A_d, X_d)
+                # Calculation with aligned single precision
+                A_f = as_aligned(A_d, align, np.float32, order=a_order)
+                X_f = as_aligned(X_d, align, np.float32, order=a_order)
+                assert_dot_close(A_f, X_f, desired)
+
 except AttributeError:
     class TestSgemv:
         pass