Skip to content

Commit

Permalink
Merge pull request #4186 into maintenance/0.14.x
Browse files Browse the repository at this point in the history
BUG: Workaround for SGEMV segfault in Accelerate
  • Loading branch information
pv committed Nov 23, 2014
2 parents 52fb336 + 63d5c2b commit 720f091
Show file tree
Hide file tree
Showing 6 changed files with 319 additions and 16 deletions.
42 changes: 33 additions & 9 deletions scipy/_build_utils/_fortran.py
Expand Up @@ -6,27 +6,34 @@


__all__ = ['needs_g77_abi_wrapper', 'split_fortran_files',
'get_g77_abi_wrappers']
'get_g77_abi_wrappers',
'needs_sgemv_fix', 'get_sgemv_fix']


def _uses_veclib(info):
r_accelerate = re.compile("Accelerate|vecLib")

def uses_veclib(info):
if sys.platform != "darwin":
return False
r_accelerate = re.compile("vecLib")
extra_link_args = info.get('extra_link_args', '')
for arg in extra_link_args:
if r_accelerate.search(arg):
return True

return False


def uses_accelerate(info):
return _uses_veclib(info)
if sys.platform != "darwin":
return False
r_accelerate = re.compile("Accelerate")
extra_link_args = info.get('extra_link_args', '')
for arg in extra_link_args:
if r_accelerate.search(arg):
return True
return False


def uses_mkl(info):
r_mkl = re.compile("mkl_core")

libraries = info.get('libraries', '')
for library in libraries:
if r_mkl.search(library):
Expand All @@ -36,8 +43,8 @@ def uses_mkl(info):


def needs_g77_abi_wrapper(info):
"""Returns true if g77 ABI wrapper must be used."""
if uses_accelerate(info):
"""Returns True if g77 ABI wrapper must be used."""
if uses_accelerate(info) or uses_veclib(info):
return True
# XXX: is this really true only on Mac OS X ?
elif uses_mkl(info) and sys.platform == "darwin":
Expand Down Expand Up @@ -78,6 +85,23 @@ def get_g77_abi_wrappers(info):
return wrapper_sources


def needs_sgemv_fix(info):
"""Returns True if SGEMV must be fixed."""
if uses_accelerate(info):
return True
else:
return False


def get_sgemv_fix(info):
""" Returns source file needed to correct SGEMV """
path = os.path.abspath(os.path.dirname(__file__))
if needs_sgemv_fix(info):
return [os.path.join(path, 'src', 'apple_sgemv_fix.c')]
else:
return []


def split_fortran_files(source_dir, subroutines=None):
"""Split each file in `source_dir` into separate files per subroutine.
Expand Down
229 changes: 229 additions & 0 deletions scipy/_build_utils/src/apple_sgemv_fix.c
@@ -0,0 +1,229 @@
/* This is a collection of ugly hacks to circumvent a bug in
* Apple Accelerate framework's SGEMV subroutine.
*
* See: https://github.com/numpy/numpy/issues/4007
*
* SGEMV in Accelerate framework will segfault on MacOS X version 10.9
* (aka Mavericks) if arrays are not aligned to 32 byte boundaries
* and the CPU supports AVX instructions. This can produce segfaults
* in parts of SciPy whcih uses the SGEMV subroutine. As of October
* 2014 that is scipy.linalg.blas, SuperLU and ARPACK. This patch
* overshadows the symbols cblas_sgemv, sgemv_ and sgemv exported by
* Accelerate to produce the correct behavior. The MacOS X version and
* CPU specs are checked on module import. If Mavericks and AVX are
* detected the call to SGEMV is emulated with a call to SGEMM if the
* arrays are not 32 byte aligned. If the exported symbols cannot be
* overshadowed on module import, a fatal error is produced and the
* process aborts. All the fixes are in a self-contained C file.
* The patch is not applied unless SciPy is configured to link with
* Apple's Accelerate framework.
*
*/

#define NPY_NO_DEPRECATED_API NPY_API_VERSION
#include "Python.h"
#include "numpy/arrayobject.h"

#include <string.h>
#include <dlfcn.h>
#include <stdlib.h>
#include <stdio.h>

/* ----------------------------------------------------------------- */
/* Original cblas_sgemv */

#define VECLIB_FILE "/System/Library/Frameworks/Accelerate.framework/Versions/A/Frameworks/vecLib.framework/vecLib"

enum CBLAS_ORDER {CblasRowMajor=101, CblasColMajor=102};
enum CBLAS_TRANSPOSE {CblasNoTrans=111, CblasTrans=112, CblasConjTrans=113};
extern void cblas_xerbla(int info, const char *rout, const char *form, ...);

typedef void cblas_sgemv_t(const enum CBLAS_ORDER order,
const enum CBLAS_TRANSPOSE TransA, const int M, const int N,
const float alpha, const float *A, const int lda,
const float *X, const int incX,
const float beta, float *Y, const int incY);

typedef void cblas_sgemm_t(const enum CBLAS_ORDER order,
const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_TRANSPOSE TransB,
const int M, const int N, const int K,
const float alpha, const float *A, const int lda,
const float *B, const int ldb,
const float beta, float *C, const int incC);

typedef void fortran_sgemv_t( const char* trans, const int* m, const int* n,
const float* alpha, const float* A, const int* ldA,
const float* X, const int* incX,
const float* beta, float* Y, const int* incY );

static void *veclib = NULL;
static cblas_sgemv_t *accelerate_cblas_sgemv = NULL;
static cblas_sgemm_t *accelerate_cblas_sgemm = NULL;
static fortran_sgemv_t *accelerate_sgemv = NULL;
static int AVX_and_10_9 = 0;

/* Dynamic check for AVX support
* __builtin_cpu_supports("avx") is available in gcc 4.8,
* but clang and icc do not currently support it. */
#define cpu_supports_avx()\
(system("sysctl -n machdep.cpu.features | grep -q AVX") == 0)

/* Check if we are using MacOS X version 10.9 */
#define using_mavericks()\
(system("sw_vers -productVersion | grep -q 10\\.9\\.") == 0)

__attribute__((destructor))
static void unloadlib(void)
{
if (veclib) dlclose(veclib);
}

__attribute__((constructor))
static void loadlib()
/* automatically executed on module import */
{
char errormsg[1024];
int AVX, MAVERICKS;
memset((void*)errormsg, 0, sizeof(errormsg));
/* check if the CPU supports AVX */
AVX = cpu_supports_avx();
/* check if the OS is MacOS X Mavericks */
MAVERICKS = using_mavericks();
/* we need the workaround when the CPU supports
* AVX and the OS version is Mavericks */
AVX_and_10_9 = AVX && MAVERICKS;
/* load vecLib */
veclib = dlopen(VECLIB_FILE, RTLD_LOCAL | RTLD_FIRST);
if (!veclib) {
veclib = NULL;
sprintf(errormsg,"Failed to open vecLib from location '%s'.", VECLIB_FILE);
Py_FatalError(errormsg); /* calls abort() and dumps core */
}
/* resolve Fortran SGEMV from Accelerate */
accelerate_sgemv = (fortran_sgemv_t*) dlsym(veclib, "sgemv_");
if (!accelerate_sgemv) {
unloadlib();
sprintf(errormsg,"Failed to resolve symbol 'sgemv_'.");
Py_FatalError(errormsg);
}
/* resolve cblas_sgemv from Accelerate */
accelerate_cblas_sgemv = (cblas_sgemv_t*) dlsym(veclib, "cblas_sgemv");
if (!accelerate_cblas_sgemv) {
unloadlib();
sprintf(errormsg,"Failed to resolve symbol 'cblas_sgemv'.");
Py_FatalError(errormsg);
}
/* resolve cblas_sgemm from Accelerate */
accelerate_cblas_sgemm = (cblas_sgemm_t*) dlsym(veclib, "cblas_sgemm");
if (!accelerate_cblas_sgemm) {
unloadlib();
sprintf(errormsg,"Failed to resolve symbol 'cblas_sgemm'.");
Py_FatalError(errormsg);
}
}

/* ----------------------------------------------------------------- */
/* Fortran SGEMV override */

void sgemv_( const char* trans, const int* m, const int* n,
const float* alpha, const float* A, const int* ldA,
const float* X, const int* incX,
const float* beta, float* Y, const int* incY )
{
/* It is safe to use the original SGEMV if we are not using AVX on Mavericks
* or the input arrays A, X and Y are all aligned on 32 byte boundaries. */
#define BADARRAY(x) (((npy_intp)(void*)x) % 32)
const int use_sgemm = AVX_and_10_9 && (BADARRAY(A) || BADARRAY(X) || BADARRAY(Y));
if (!use_sgemm) {
accelerate_sgemv(trans,m,n,alpha,A,ldA,X,incX,beta,Y,incY);
return;
}

/* Arrays are misaligned, the CPU supports AVX, and we are running
* Mavericks.
*
* Emulation of SGEMV with SGEMM:
*
* SGEMV allows vectors to be strided. SGEMM requires all arrays to be
* contiguous along the leading dimension. To emulate striding in SGEMV
* with the leading dimension arguments in SGEMM we compute
*
* Y = alpha * op(A) @ X + beta * Y
*
* as
*
* Y.T = alpha * X.T @ op(A).T + beta * Y.T
*
* Because Fortran uses column major order and X.T and Y.T are row vectors,
* the leading dimensions of X.T and Y.T in SGEMM become equal to the
* strides of the the column vectors X and Y in SGEMV. */

switch (*trans) {
case 'T':
case 't':
case 'C':
case 'c':
accelerate_cblas_sgemm( CblasColMajor, CblasNoTrans, CblasNoTrans,
1, *n, *m, *alpha, X, *incX, A, *ldA, *beta, Y, *incY );
break;
case 'N':
case 'n':
accelerate_cblas_sgemm( CblasColMajor, CblasNoTrans, CblasTrans,
1, *m, *n, *alpha, X, *incX, A, *ldA, *beta, Y, *incY );
break;
default:
cblas_xerbla(1, "SGEMV", "Illegal transpose setting: %c\n", *trans);
}
}

/* ----------------------------------------------------------------- */
/* Override for an alias symbol for sgemv_ in Accelerate */

void sgemv (char *trans,
const int *m, const int *n,
const float *alpha,
const float *A, const int *lda,
const float *B, const int *incB,
const float *beta,
float *C, const int *incC)
{
sgemv_(trans,m,n,alpha,A,lda,B,incB,beta,C,incC);
}

/* ----------------------------------------------------------------- */
/* cblas_sgemv override, based on Netlib CBLAS code */

void cblas_sgemv(const enum CBLAS_ORDER order,
const enum CBLAS_TRANSPOSE TransA, const int M, const int N,
const float alpha, const float *A, const int lda,
const float *X, const int incX, const float beta,
float *Y, const int incY)
{
char TA;
if (order == CblasColMajor)
{
if (TransA == CblasNoTrans) TA = 'N';
else if (TransA == CblasTrans) TA = 'T';
else if (TransA == CblasConjTrans) TA = 'C';
else
{
cblas_xerbla(2, "cblas_sgemv","Illegal TransA setting, %d\n", TransA);
}
sgemv_(&TA, &M, &N, &alpha, A, &lda, X, &incX, &beta, Y, &incY);
}
else if (order == CblasRowMajor)
{
if (TransA == CblasNoTrans) TA = 'T';
else if (TransA == CblasTrans) TA = 'N';
else if (TransA == CblasConjTrans) TA = 'N';
else
{
cblas_xerbla(2, "cblas_sgemv", "Illegal TransA setting, %d\n", TransA);
return;
}
sgemv_(&TA, &N, &M, &alpha, A, &lda, X, &incX, &beta, Y, &incY);
}
else
cblas_xerbla(1, "cblas_sgemv", "Illegal Order setting, %d\n", order);
}

3 changes: 2 additions & 1 deletion scipy/linalg/setup.py
Expand Up @@ -8,7 +8,7 @@
def configuration(parent_package='',top_path=None):
from numpy.distutils.system_info import get_info, NotFoundError
from numpy.distutils.misc_util import Configuration
from scipy._build_utils import get_g77_abi_wrappers, split_fortran_files
from scipy._build_utils import get_sgemv_fix, get_g77_abi_wrappers, split_fortran_files

config = Configuration('linalg',parent_package,top_path)

Expand All @@ -25,6 +25,7 @@ def configuration(parent_package='',top_path=None):
# fblas:
sources = ['fblas.pyf.src']
sources += get_g77_abi_wrappers(lapack_opt)
sources += get_sgemv_fix(lapack_opt)

config.add_extension('_fblas',
sources=sources,
Expand Down
45 changes: 44 additions & 1 deletion scipy/linalg/tests/test_fblas.py
Expand Up @@ -15,7 +15,7 @@
from scipy.lib.six import xrange

from numpy.testing import TestCase, run_module_suite, assert_array_equal, \
assert_array_almost_equal, assert_
assert_allclose, assert_array_almost_equal, assert_


# decimal accuracy to require between Python and LAPACK/BLAS calculations
Expand Down Expand Up @@ -463,6 +463,49 @@ def test_y_stride_assert(self):
class TestSgemv(TestCase, BaseGemv):
blas_func = fblas.sgemv
dtype = float32

def test_sgemv_on_osx(self):
from itertools import product
import sys
import numpy as np

if sys.platform != 'darwin':
return

def aligned_array(shape, align, dtype, order='C'):
# Make array shape `shape` with aligned at `align` bytes
d = dtype()
# Make array of correct size with `align` extra bytes
N = np.prod(shape)
tmp = np.zeros(N * d.nbytes + align, dtype=np.uint8)
address = tmp.__array_interface__["data"][0]
# Find offset into array giving desired alignment
for offset in range(align):
if (address + offset) % align == 0:
break
tmp = tmp[offset:offset+N*d.nbytes].view(dtype=dtype)
return tmp.reshape(shape, order=order)

def as_aligned(arr, align, dtype, order='C'):
# Copy `arr` into an aligned array with same shape
aligned = aligned_array(arr.shape, align, dtype, order)
aligned[:] = arr[:]
return aligned

def assert_dot_close(A, X, desired):
assert_allclose(self.blas_func(1.0,A,X), desired,
rtol=1e-5, atol=1e-7)

testdata = product((15,32), (10000,), (200,89), ('C','F'))
for align, m, n, a_order in testdata:
A_d = np.random.rand(m, n)
X_d = np.random.rand(n)
desired = np.dot(A_d, X_d)
# Calculation with aligned single precision
A_f = as_aligned(A_d, align, np.float32, order=a_order)
X_f = as_aligned(X_d, align, np.float32, order=a_order)
assert_dot_close(A_f, X_f, desired)

except AttributeError:
class TestSgemv:
pass
Expand Down

0 comments on commit 720f091

Please sign in to comment.