Skip to content

Commit

Permalink
PERF: introduce SSE2 implementations of allnan/anynan, ~30% speedup
Browse files Browse the repository at this point in the history
  • Loading branch information
qwhelan committed Feb 18, 2020
1 parent cd72634 commit cd32416
Show file tree
Hide file tree
Showing 5 changed files with 137 additions and 40 deletions.
4 changes: 2 additions & 2 deletions .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ jobs:

- os: linux
arch: ppc64le
env: TEST_DEPS="numpy pytest hypothesis"
env: TEST_DEPS="numpy pytest hypothesis gfortran_linux-ppc64le=8.2.0"
PYTHON_VERSION="3.7"
PYTHON_ARCH="64"

Expand Down Expand Up @@ -98,7 +98,7 @@ install:
- source "tools/travis/conda_install.sh"

script:
- source "tools/travis/bn_setup.sh"
- ./tools/travis/bn_setup.sh

notifications:
email:
Expand Down
12 changes: 12 additions & 0 deletions bottleneck/src/bn_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@
("HAVE_ATTRIBUTE_OPTIMIZE_OPT_3", '__attribute__((optimize("O3")))')
]

OPTIONAL_HEADERS = [("HAVE_SSE2", "emmintrin.h")]

OPTIONAL_INTRINSICS = [
("HAVE___BUILTIN_ISNAN", "__builtin_isnan", "0."),
("HAVE_ISNAN", "isnan", "0."),
Expand Down Expand Up @@ -101,6 +103,10 @@ def check_gcc_function_attribute(cmd, attribute, name):
return False


def check_gcc_header(cmd, header):
    """Probe for ``header`` by delegating to ``cmd.check_header``.

    The value produced by ``cmd.check_header(header)`` is returned unchanged.
    """
    header_found = cmd.check_header(header)
    return header_found


def check_gcc_intrinsic(cmd, intrinsic, value) -> bool:
"""Return True if the given intrinsic is supported."""
body = (
Expand Down Expand Up @@ -142,6 +148,12 @@ def create_config_h(config):
else:
output.append((config_attr, "0"))

for config_attr, header in OPTIONAL_HEADERS:
if check_gcc_header(config, header):
output.append((config_attr, "1"))
else:
output.append((config_attr, "0"))

for config_attr, intrinsic, value in OPTIONAL_INTRINSICS:
if check_gcc_intrinsic(config, intrinsic, value):
output.append((config_attr, "1"))
Expand Down
136 changes: 105 additions & 31 deletions bottleneck/src/reduce_template.c
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,12 @@
#include "bottleneck.h"
#include "iterators.h"

#ifdef _MSC_VER
#include "intrin.h"
#elif HAVE_SSE2
#include "x86intrin.h"
#endif

/* init macros ----------------------------------------------------------- */

#define INIT_ALL \
Expand Down Expand Up @@ -1137,6 +1143,40 @@ nanmedian(PyObject *self, PyObject *args, PyObject *kwds) {
0);
}

/* SSE2 anynan/allnan ---------------------------------------------------- */

#if HAVE_SSE2

/* Test one 128-bit vector (2 x float64) loaded from `pa` for a non-NaN value.
 *
 * Returns 0 when both values are NaN and 1 when at least one is not NaN.
 *
 * `pa` must be 16-byte aligned: _mm_load_pd is an aligned load.
 *
 * Declared `static inline`: a plain C99 `inline` definition provides no
 * external definition, so a call the compiler chooses not to inline (e.g. in
 * an unoptimized build) could otherwise fail to link. */
static inline int sse2_allnan_float64(const npy_float64* pa) {
    const __m128d values = _mm_load_pd(pa);
    /* x != x is true only for NaN, so each NaN lane becomes all ones. */
    const __m128d is_nan = _mm_cmpneq_pd(values, values);
    /* movemask packs one bit per lane; both lanes NaN gives binary 11 = 3. */
    return _mm_movemask_pd(is_nan) != 3;
}

/* Test one 128-bit vector (4 x float32) loaded from `pa` for a non-NaN value.
 *
 * Returns 0 when all four values are NaN and 1 when at least one is not NaN.
 *
 * `pa` must be 16-byte aligned: _mm_load_ps is an aligned load.
 *
 * Declared `static inline`: a plain C99 `inline` definition provides no
 * external definition, so a call the compiler chooses not to inline (e.g. in
 * an unoptimized build) could otherwise fail to link. */
static inline int sse2_allnan_float32(const npy_float32* pa) {
    const __m128 values = _mm_load_ps(pa);
    /* x != x is true only for NaN, so each NaN lane becomes all ones. */
    const __m128 is_nan = _mm_cmpneq_ps(values, values);
    /* movemask packs one bit per lane; all four NaN gives binary 1111 = 15. */
    return _mm_movemask_ps(is_nan) != 15;
}

/* Test one 128-bit vector (2 x float64) loaded from `pa` for a NaN value.
 *
 * Returns 1 when at least one of the two values is NaN and 0 otherwise.
 *
 * `pa` must be 16-byte aligned: _mm_load_pd is an aligned load.
 *
 * Declared `static inline`: a plain C99 `inline` definition provides no
 * external definition, so a call the compiler chooses not to inline (e.g. in
 * an unoptimized build) could otherwise fail to link. */
static inline int sse2_anynan_float64(const npy_float64* pa) {
    const __m128d values = _mm_load_pd(pa);
    /* x != x is true only for NaN, so each NaN lane becomes all ones. */
    const __m128d is_nan = _mm_cmpneq_pd(values, values);
    /* Any set bit in the lane mask means some lane held a NaN. */
    return _mm_movemask_pd(is_nan) != 0;
}

/* Test one 128-bit vector (4 x float32) loaded from `pa` for a NaN value.
 *
 * Returns 1 when at least one of the four values is NaN and 0 otherwise.
 *
 * `pa` must be 16-byte aligned: _mm_load_ps is an aligned load.
 *
 * Declared `static inline`: a plain C99 `inline` definition provides no
 * external definition, so a call the compiler chooses not to inline (e.g. in
 * an unoptimized build) could otherwise fail to link. */
static inline int sse2_anynan_float32(const npy_float32* pa) {
    const __m128 values = _mm_load_ps(pa);
    /* x != x is true only for NaN, so each NaN lane becomes all ones. */
    const __m128 is_nan = _mm_cmpneq_ps(values, values);
    /* Any set bit in the lane mask means some lane held a NaN. */
    return _mm_movemask_ps(is_nan) != 0;
}

#endif

/* anynan ---------------------------------------------------------------- */

/* dtype = [['float64'], ['float32']] */
Expand All @@ -1146,32 +1186,49 @@ REDUCE_ALL(anynan, DTYPE0) {
INIT_ALL
BN_BEGIN_ALLOW_THREADS
if (REDUCE_CONTIGUOUS) {
const npy_intp LOOP_SIZE = 512 / sizeof(npy_DTYPE0);
const npy_intp count = it.nits * it.length;
const npy_intp loop_count = count / LOOP_SIZE;
const npy_intp residual = count % LOOP_SIZE;
const int VECTOR_ALIGN = 16;
const npy_DTYPE0* pa = PA(DTYPE0);
npy_bool* f_arr = malloc(LOOP_SIZE * sizeof(npy_bool));
for (npy_intp j=0; j < LOOP_SIZE; j++) {
f_arr[j] = 0;
const npy_intp count = it.nits * it.length;
npy_intp vector_offset = (((npy_intp) pa) % VECTOR_ALIGN);
vector_offset = vector_offset > 0 ? VECTOR_ALIGN - vector_offset : 0;
vector_offset /= sizeof(npy_DTYPE0);
vector_offset = vector_offset <= count ? vector_offset : count;

const npy_intp LOOP_SIZE = 16 / sizeof(npy_DTYPE0);
const npy_intp loop_count = (count - vector_offset) / LOOP_SIZE;
const npy_intp residual = (count - vector_offset) % LOOP_SIZE;

for (npy_intp i=0; (i < vector_offset) && (f == 0); i++) {
const npy_DTYPE0 ai = pa[i];
if (bn_isnan(ai)) {
f = 1;
}
}

for (npy_intp i=0; (i < loop_count) && (f == 0); i++) {
for (npy_intp j=0; j < LOOP_SIZE; j++) {
f_arr[j] = bn_isnan(pa[i * LOOP_SIZE + j]);
#if HAVE_SSE2
for (npy_intp i=0; (i < loop_count) && (f == 0); i++) {
f = sse2_anynan_DTYPE0(&pa[vector_offset + i * LOOP_SIZE]);
}
#else
npy_bool* f_arr = malloc(LOOP_SIZE * sizeof(npy_bool));
for (npy_intp i=0; (i < loop_count) && (f == 0); i++) {
for (npy_intp j=0; j < LOOP_SIZE; j++) {
f_arr[j] = bn_isnan(pa[vector_offset + i * LOOP_SIZE + j]);
}

for (npy_intp j=0; j < LOOP_SIZE; j++) {
f += f_arr[j];
for (npy_intp j=0; j < LOOP_SIZE; j++) {
f += f_arr[j];
}
}
}
free(f_arr);
#endif

for (npy_intp j=0; (j < residual) && (f == 0); j++) {
const npy_DTYPE0 ai = pa[loop_count * LOOP_SIZE + j];
const npy_DTYPE0 ai = pa[vector_offset + loop_count * LOOP_SIZE + j];
if (bn_isnan(ai)) {
f = 1;
}
}
free(f_arr);
} else {
WHILE {
const npy_DTYPE0* pa = PA(DTYPE0);
Expand Down Expand Up @@ -1244,37 +1301,54 @@ REDUCE_MAIN(anynan, 0)
/* dtype = [['float64'], ['float32']] */
BN_OPT_3
REDUCE_ALL(allnan, DTYPE0) {
npy_bool f = 0;
int f = 0;
npy_DTYPE0 ai;
INIT_ALL
BN_BEGIN_ALLOW_THREADS
if (REDUCE_CONTIGUOUS) {
const npy_intp LOOP_SIZE = 512 / sizeof(npy_DTYPE0);
const npy_intp count = it.nits * it.length;
const npy_intp loop_count = count / LOOP_SIZE;
const npy_intp residual = count % LOOP_SIZE;
const int VECTOR_ALIGN = 16;
const npy_DTYPE0* pa = PA(DTYPE0);
npy_bool* f_arr = malloc(LOOP_SIZE * sizeof(npy_bool));
for (npy_intp j=0; j < LOOP_SIZE; j++) {
f_arr[j] = 0;
const npy_intp count = it.nits * it.length;
npy_intp vector_offset = (((npy_intp) pa) % VECTOR_ALIGN);
vector_offset = vector_offset > 0 ? VECTOR_ALIGN - vector_offset : 0;
vector_offset /= sizeof(npy_DTYPE0);
vector_offset = vector_offset <= count ? vector_offset : count;

const npy_intp LOOP_SIZE = 16 / sizeof(npy_DTYPE0);
const npy_intp loop_count = (count - vector_offset) / LOOP_SIZE;
const npy_intp residual = (count - vector_offset) % LOOP_SIZE;

for (npy_intp i=0; (i < vector_offset) && (f == 0); i++) {
const npy_DTYPE0 ai = pa[i];
if (!bn_isnan(ai)) {
f = 1;
}
}

for (npy_intp i=0; (i < loop_count) && (f == 0); i++) {
for (npy_intp j=0; j < LOOP_SIZE; j++) {
f_arr[j] = !bn_isnan(pa[i * LOOP_SIZE + j]);
#if HAVE_SSE2
for (npy_intp i=0; (i < loop_count) && (f == 0); i++) {
f = sse2_allnan_DTYPE0(&pa[vector_offset + i * LOOP_SIZE]);
}
#else
npy_bool* f_arr = malloc(LOOP_SIZE * sizeof(npy_bool));
for (npy_intp i=0; (i < loop_count) && (f == 0); i++) {
for (npy_intp j=0; j < LOOP_SIZE; j++) {
f_arr[j] = !bn_isnan(pa[vector_offset + i * LOOP_SIZE + j]);
}

for (npy_intp j=0; j < LOOP_SIZE; j++) {
f += f_arr[j];
for (npy_intp j=0; j < LOOP_SIZE; j++) {
f += f_arr[j];
}
}
}
free(f_arr);
#endif

for (npy_intp j=0; (j < residual) && (f == 0); j++) {
const npy_DTYPE0 ai = pa[loop_count * LOOP_SIZE + j];
const npy_DTYPE0 ai = pa[vector_offset + loop_count * LOOP_SIZE + j];
if (!bn_isnan(ai)) {
f = 1;
}
}
free(f_arr);
} else {
WHILE {
FOR {
Expand Down
20 changes: 16 additions & 4 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
from distutils.command.config import config as _config
import versioneer
import shutil
import platform


class config(_config):
Expand Down Expand Up @@ -97,18 +98,29 @@ def build_extensions(self):
sys.path.append(os.path.join(os.path.dirname(__file__), "bottleneck/src"))


def get_cpu_arch_flags():
    """Return extra compiler flags required by the host CPU architecture.

    Returns ``["-DNO_WARN_X86_INTRINSICS"]`` on ppc64le, where the define is
    needed to support SSE2 intrinsics, and an empty list everywhere else.
    """
    # platform.processor() returns an empty string when the value cannot be
    # determined, so also consult platform.machine() to detect ppc64le.
    if "ppc64le" in (platform.machine(), platform.processor()):
        # Needed to support SSE2 intrinsics
        return ["-DNO_WARN_X86_INTRINSICS"]
    return []


def prepare_modules():
base_includes = [
"bottleneck/src/bottleneck.h",
"bottleneck/src/bn_config.h",
"bottleneck/src/iterators.h",
]

arch_flags = get_cpu_arch_flags()

ext = [
Extension(
"bottleneck.reduce",
sources=["bottleneck/src/reduce.c"],
depends=base_includes,
extra_compile_args=["-O2"],
extra_compile_args=["-O2"] + arch_flags,
)
]
ext += [
Expand All @@ -119,23 +131,23 @@ def prepare_modules():
"bottleneck/src/move_median/move_median.c",
],
depends=base_includes + ["bottleneck/src/move_median/move_median.h"],
extra_compile_args=["-O2"],
extra_compile_args=["-O2"] + arch_flags,
)
]
ext += [
Extension(
"bottleneck.nonreduce",
sources=["bottleneck/src/nonreduce.c"],
depends=base_includes,
extra_compile_args=["-O2"],
extra_compile_args=["-O2"] + arch_flags,
)
]
ext += [
Extension(
"bottleneck.nonreduce_axis",
sources=["bottleneck/src/nonreduce_axis.c"],
depends=base_includes,
extra_compile_args=["-O2"],
extra_compile_args=["-O2"] + arch_flags,
)
]
return ext
Expand Down
5 changes: 2 additions & 3 deletions tools/travis/bn_setup.sh
100644 → 100755
Original file line number Diff line number Diff line change
Expand Up @@ -19,10 +19,9 @@ else
pip install "${ARCHIVE[0]}"
elif [ "${TEST_RUN}" != "coverage" ]; then
# CFLAGS gets ignored by PEP 518, so do coverage from inplace build
pip install --upgrade --user pip
pip install --user "."
pip install "."
fi
python setup.py build_ext --user --inplace
python setup.py build_ext --inplace
if [ "${TEST_RUN}" == "doc" ]; then
make doc
elif [ "${TEST_RUN}" == "coverage" ]; then
Expand Down

0 comments on commit cd32416

Please sign in to comment.