Skip to content

Commit

Permalink
PERF: optimize for contiguous 2D nanmin/nanmax, speedups of up to 12x
Browse files Browse the repository at this point in the history
  • Loading branch information
qwhelan committed Jan 20, 2020
1 parent 37110cf commit 98b59df
Show file tree
Hide file tree
Showing 2 changed files with 148 additions and 20 deletions.
2 changes: 2 additions & 0 deletions bottleneck/src/iterators.h
Original file line number Diff line number Diff line change
Expand Up @@ -386,6 +386,8 @@ init_iter3(iter3 *it, PyArrayObject *a, PyObject *y, PyObject *z, int axis) {
}

#define REDUCE_CONTIGUOUS (it.stride == 1 && ((it.ndim_m2 < 0) || (C_CONTIGUOUS(a) || F_CONTIGUOUS(a))))
#define ONE_CONTIGUOUS (it.stride == 1 && ((it.ndim_m2 >= 0) && ((C_CONTIGUOUS(a) || F_CONTIGUOUS(a)) && (it.axis == axis))))
#define ONE_TRANSPOSE(dtype) ((it.ndim_m2 == 0) && it.astrides[it.ndim_m2] == sizeof(dtype) && ((C_CONTIGUOUS(a) && axis == 0) || (F_CONTIGUOUS(a) && axis == 1)))

#define REDUCE_SPECIALIZE(code) \
if (REDUCE_CONTIGUOUS) { \
Expand Down
166 changes: 146 additions & 20 deletions bottleneck/src/reduce_template.c
Original file line number Diff line number Diff line change
Expand Up @@ -510,29 +510,123 @@ REDUCE_ALL(NAME, DTYPE0) {

BN_OPT_3
REDUCE_ONE(NAME, DTYPE0) {
npy_DTYPE0 extreme;
int allnan;
INIT_ONE(DTYPE0, DTYPE0)
if (LENGTH == 0) {
VALUE_ERR("numpy.NAME raises on a.shape[axis]==0; "
"So Bottleneck too.");
return NULL;
}
BN_BEGIN_ALLOW_THREADS
WHILE {
npy_DTYPE0 extreme = BIG_FLOAT;
npy_bool allnan = 1;
if (ONE_TRANSPOSE(npy_DTYPE0)) {
npy_intp LOOP_SIZE = it.nits;
npy_DTYPE0* extremes = malloc(LOOP_SIZE * sizeof(npy_DTYPE0));
npy_bool* allnans = malloc(LOOP_SIZE * sizeof(npy_bool));
for (npy_intp i=0; i < LOOP_SIZE; i++) {
extremes[i] = BIG_FLOAT;
allnans[i] = 1;
}

const npy_DTYPE0* pa = PA(DTYPE0);
FOR {
const npy_DTYPE0 ai = pa[it.i * it.stride];
if (ai COMPARE extreme) {
extreme = ai;
allnan = 0;
npy_intp i=0;
for (; i < LENGTH; i++) {
for (npy_intp j=0; j < LOOP_SIZE; j++) {
const npy_DTYPE0 ai = pa[i * LOOP_SIZE + j];
if (ai COMPARE extremes[j]) {
allnans[j] = 0;
extremes[j] = ai;
}
}
if (i % 8 == 0) {
npy_intp nan_count = 0;
for (npy_intp j = 0; j < LOOP_SIZE; j++) {
nan_count += allnans[j];
}
if (nan_count == 0) {
break;
}
}
}
if (allnan) {
extreme = BN_NAN;
for (; i < LENGTH; i++) {
for (npy_intp j=0; j < LOOP_SIZE; j++) {
const npy_DTYPE0 ai = pa[i * LOOP_SIZE + j];
extremes[j] = ai INTCOMP extremes[j] ? ai : extremes[j];
}
}
for (npy_intp i=0; i < LOOP_SIZE; i++) {
if (allnans[i]) {
py[i] = BN_NAN;
} else {
py[i] = extremes[i];
}
}
free(extremes);
free(allnans);
} else if (ONE_CONTIGUOUS) {
const npy_intp LOOP_SIZE = DTYPE1;
npy_DTYPE0 extremes[DTYPE1];
const npy_intp loops = LENGTH / LOOP_SIZE;
const npy_intp residual = LENGTH % LOOP_SIZE;
WHILE {
const npy_DTYPE0* pa = PA(DTYPE0);

for (npy_intp i=0; i < LOOP_SIZE; i++) {
extremes[i] = BIG_FLOAT;
}
allnan = 1;
extreme = BIG_FLOAT;

npy_intp i=0;
for (; i < loops && allnan; i++) {
for (npy_intp j=0; j < LOOP_SIZE; j++) {
const npy_DTYPE0 ai = pa[i * LOOP_SIZE + j];
if (ai COMPARE extremes[j]) {
allnan = 0;
extremes[j] = ai;
}
}
}
for (; i < loops; i++) {
for (npy_intp j=0; j < LOOP_SIZE; j++) {
const npy_DTYPE0 ai = pa[i * LOOP_SIZE + j];
extremes[j] = ai INTCOMP extremes[j] ? ai : extremes[j];
}
}
for (npy_intp j=0; j < residual; j++) {
const npy_DTYPE0 ai = pa[i * LOOP_SIZE + j];
if (ai COMPARE extremes[j]) {
allnan = 0;
extremes[j] = ai;
}
}
for (npy_intp j=0; j < LOOP_SIZE; j++) {
if (extremes[j] COMPARE extreme) {
extreme = extremes[j];
}
}

if (allnan) {
extreme = BN_NAN;
}
YPP = extreme;
NEXT
}
} else {
WHILE {
extreme = BIG_FLOAT;
allnan = 1;
FOR {
const npy_DTYPE0 ai = AI(DTYPE0);
if (ai COMPARE extreme) {
extreme = ai;
allnan = 0;
}
}
if (allnan) extreme = BN_NAN;
YPP = extreme;
NEXT
}
YPP = extreme;
NEXT
}
BN_END_ALLOW_THREADS
return y;
Expand Down Expand Up @@ -575,23 +669,55 @@ REDUCE_ALL(NAME, DTYPE0) {

BN_OPT_3
REDUCE_ONE(NAME, DTYPE0) {
npy_DTYPE0 ai, extreme;
npy_DTYPE0 extreme;
INIT_ONE(DTYPE0, DTYPE0)
if (LENGTH == 0) {
VALUE_ERR("numpy.NAME raises on a.shape[axis]==0; "
"So Bottleneck too.");
return NULL;
}
BN_BEGIN_ALLOW_THREADS
WHILE {
extreme = BIG_INT;
if (ONE_TRANSPOSE(npy_DTYPE0)) {
npy_intp LOOP_SIZE = it.nits;
npy_DTYPE0* extremes = malloc(LOOP_SIZE * sizeof(npy_DTYPE0));
for (npy_intp i=0; i < LOOP_SIZE; i++) {
extremes[i] = BIG_INT;
}

const npy_DTYPE0* pa = PA(DTYPE0);
FOR {
ai = SI(pa);
if (ai COMPARE extreme) extreme = ai;
for (npy_intp i=0; i<LENGTH; i++) {
for (npy_intp j=0; j < LOOP_SIZE; j++) {
const npy_DTYPE0 ai = pa[i * LOOP_SIZE + j];
extremes[j] = ai INTCOMP extremes[j] ? ai : extremes[j];
}
}
for (npy_intp i=0; i < LOOP_SIZE; i++) {
py[i] = extremes[i];
}
free(extremes);
} else if (ONE_CONTIGUOUS) {
WHILE {
extreme = BIG_INT;
const npy_DTYPE0* pa = PA(DTYPE0);
FOR {
const npy_DTYPE0 ai = pa[it.i];
extreme = ai INTCOMP extreme ? ai : extreme;
}
YPP = extreme;
NEXT
}
} else {
WHILE {
extreme = BIG_INT;
FOR {
const npy_DTYPE0 ai = AI(DTYPE0);
if (ai INTCOMP extreme) {
extreme = ai;
}
}
YPP = extreme;
NEXT
}
YPP = extreme;
NEXT
}
BN_END_ALLOW_THREADS
return y;
Expand Down

0 comments on commit 98b59df

Please sign in to comment.