diff --git a/doc/articles/block_index.py b/doc/articles/block_index.py index 395c9953..6eb5766a 100644 --- a/doc/articles/block_index.py +++ b/doc/articles/block_index.py @@ -8,54 +8,22 @@ from itertools import repeat import pickle +import matplotlib.pyplot as plt +import numpy as np +import pandas as pd + from arraykit import BlockIndex # from arraykit import ErrorInitTypeBlocks -from arraykit import shape_filter -from arraykit import resolve_dtype import arraykit as ak -import matplotlib.pyplot as plt -import numpy as np -import pandas as pd - sys.path.append(os.getcwd()) +from performance.reference.block_index import from_blocks + -def from_blocks( - raw_blocks: tp.Iterable[np.ndarray], - ): - index: tp.List[tp.Tuple[int, int]] = [] # columns position to blocks key - block_count = 0 - row_count = None - column_count = 0 - dtype = None - - for block in raw_blocks: - if not block.__class__ is np.ndarray: - raise ErrorInitTypeBlocks(f'found non array block: {block}') - if block.ndim > 2: - raise ErrorInitTypeBlocks(f'cannot include array with {block.ndim} dimensions') - - r, c = shape_filter(block) - - if row_count is not None and r != row_count: #type: ignore [unreachable] - raise ErrorInitTypeBlocks(f'mismatched row count: {r}: {row_count}') - else: - row_count = r - if c == 0: - continue - - if dtype is None: - dtype = block.dtype - else: - dtype = resolve_dtype(dtype, block.dtype) - - for i in range(c): - index.append((block_count, i)) - column_count += c - block_count += 1 - return (row_count, column_count), index + +#------------------------------------------------------------------------------- class ArrayProcessor: NAME = '' @@ -78,6 +46,7 @@ def __init__(self, arrays: tp.Iterable[np.ndarray]): self.selector_bool_array = (np.arange(len(self.bi)) % 2) == 0 self.selector_slice = slice(0, len(self.bi), 2) + #------------------------------------------------------------------------------- class BlockIndexLoad(ArrayProcessor): NAME = 'BlockIndex: load' @@ -223,13 
from arraykit import shape_filter
from arraykit import resolve_dtype

import typing as tp
import numpy as np

try:
    from arraykit import ErrorInitTypeBlocks
except ImportError:
    # NOTE(review): arraykit is expected to export this exception; the fallback
    # only keeps the error paths below from failing with NameError if it does not.
    class ErrorInitTypeBlocks(RuntimeError):
        '''Raised when a block cannot be used to initialize a block index.
        '''

#-------------------------------------------------------------------------------
def from_blocks(
        raw_blocks: tp.Iterable[np.ndarray],
        ) -> tp.Tuple[tp.Tuple[tp.Optional[int], int], tp.List[tp.Tuple[int, int]]]:
    '''Simulation of legacy routine within TypeBlocks.

    Walk `raw_blocks`, validating each block, and build a list mapping every column position to a (block index, column-within-block) pair.

    Returns:
        A pair of ((row_count, column_count), index). `row_count` is None if `raw_blocks` is empty.

    Raises:
        ErrorInitTypeBlocks: if a block is not exactly an `np.ndarray`, has more than two dimensions, or has a row count that differs from the preceding block.
    '''
    index: tp.List[tp.Tuple[int, int]] = [] # columns position to blocks key
    block_count = 0
    row_count = None
    column_count = 0
    dtype = None

    for block in raw_blocks:
        # exact class check: subclasses of ndarray are rejected
        if block.__class__ is not np.ndarray:
            raise ErrorInitTypeBlocks(f'found non array block: {block}')
        if block.ndim > 2:
            raise ErrorInitTypeBlocks(f'cannot include array with {block.ndim} dimensions')

        r, c = shape_filter(block)

        if row_count is not None and r != row_count: #type: ignore [unreachable]
            raise ErrorInitTypeBlocks(f'mismatched row count: {r}: {row_count}')
        else:
            row_count = r
        if c == 0:
            # zero-width blocks are validated for row alignment but not registered
            continue

        # dtype is resolved only to reproduce the work of the legacy routine; it is not returned
        if dtype is None:
            dtype = block.dtype
        else:
            dtype = resolve_dtype(dtype, block.dtype)

        index.extend((block_count, i) for i in range(c))
        column_count += c
        block_count += 1
    return (row_count, column_count), index

#-------------------------------------------------------------------------------
def cols_to_slice(indices: tp.Sequence[int]) -> slice:
    '''Translate a sequence of contiguous integers into a slice.

    Integers are assumed to be non-empty, ordered (ascending or descending), and contiguous.
    '''
    start_idx = indices[0]
    # single column as a single slice
    if len(indices) == 1:
        return slice(start_idx, start_idx + 1)

    stop_idx = indices[-1]
    if stop_idx > start_idx: # ascending indices
        return slice(start_idx, stop_idx + 1)

    if stop_idx == 0:
        # a stop of -1 would be read as "one before the end"; None is needed to include index 0
        return slice(start_idx, None, -1)
    # stop is less than start, need to reduce by 1 to cover range
    return slice(start_idx, stop_idx - 1, -1)


def indices_to_contiguous_pairs(indices: tp.Iterable[tp.Tuple[int, int]]
        ) -> tp.Iterator[tp.Tuple[int, slice]]:
    '''Indices are pairs of (block_idx, value); convert these to pairs of (block_idx, slice) when we identify contiguous indices within a block (these are block slices).

    A run is extended only while the column keeps moving by one in the same direction; a change of block, a gap, or a change of direction closes the run. An empty input yields nothing.
    '''
    bundle: tp.List[int] = [] # columns of the run being accumulated
    last_block = -1
    step = 0 # direction of the current run: 0 until a second column fixes it at +1 or -1

    for block_idx, col in indices:
        if bundle:
            delta = col - bundle[-1]
            if block_idx == last_block and delta in (1, -1) and step in (0, delta):
                # contiguous and still moving in the same direction
                step = delta
                bundle.append(col)
                continue
            # either new block, not contiguous, or direction reversed
            yield last_block, cols_to_slice(bundle)
        # start a new bundle
        bundle = [col]
        last_block = block_idx
        step = 0

    if bundle:
        yield last_block, cols_to_slice(bundle)


class IterContiguous:
    '''Stateful equivalent of `indices_to_contiguous_pairs`, written as a repeatedly called `getter()` that keeps its position in instance attributes.

    Sentinels: a `last_block` of -1 means no pair has been read yet; a `next_block` of -1 means no pending pair; a `next_block` of -2 means iteration is finished.
    '''
    def __init__(self, indices):
        self.indices = iter(indices)
        self.last_block = -1
        self.last_column = -1
        self.next_block = -1
        self.next_column = -1

    @staticmethod
    def build_slice(start, end_inclusive):
        '''Return a slice covering `start` through `end_inclusive`, ascending or descending.
        '''
        # single-width regions are retained as slices to force 2D selections
        if start <= end_inclusive:
            return slice(start, end_inclusive + 1, None) # can be 1
        # reverse slice
        if end_inclusive == 0:
            return slice(start, None, -1)
        return slice(start, end_inclusive - 1, -1)

    def getter(self) -> tp.Optional[tp.Tuple[int, slice]]:
        '''Return the next (block, slice) pair, or None when no pairs remain.
        '''
        if self.next_block == -2:
            return None # terminated on a previous call

        slice_start = -1
        step = 0 # direction of the run built by this call; 0 until established

        if self.next_block != -1:
            # discontinuity found on last call, set new start
            self.last_block = self.next_block
            self.last_column = self.next_column
            slice_start = self.last_column
            self.next_block = -1 # clear next state
            self.next_column = -1

        while True:
            try:
                block, column = next(self.indices)
            except StopIteration:
                # flag for end on next call
                self.next_block = -2
                if self.last_block == -1:
                    return None # no pairs were ever read: nothing to emit
                # emit the region that has not yet been returned
                return self.last_block, self.build_slice(slice_start, self.last_column)

            if self.last_block == -1:
                # initialization
                self.last_block = block
                self.last_column = column
                slice_start = column
                continue

            delta = column - self.last_column
            if self.last_block == block and delta in (1, -1) and step in (0, delta):
                # contiguous, and not reversing the direction of the run
                step = delta
                self.last_column = column
                continue

            # not contiguous: emit a slice for the previous region and
            # store this block, column as the start of the next region
            self.next_block = block
            self.next_column = column
            return self.last_block, self.build_slice(slice_start, self.last_column)

    def iter(self) -> tp.Iterator[tp.Tuple[int, slice]]:
        '''Yield pairs from `getter()` until it signals the end with None.
        '''
        while True:
            post = self.getter()
            if post is None:
                break
            yield post

#-------------------------------------------------------------------------------

if __name__ == '__main__':
    samples = (
        [(0, 0), (0, 1), (0, 2), (1, 1), (1, 3), (2, 0), (3, 0), (3, 1), (3, 2)],
        [(0, 0), (2, 1), (3, 5), (10, 1)],
        [(0, 0), (2, 1), (2, 2), (2, 5), (2, 6), (10, 1)],
        [(10, 1)],
        [(0, 1), (0, 2), (0, 3), (0, 4)],
        [(0, 0), (2, 3), (2, 2), (2, 1), (2, 6), (10, 1)],
        [(2, 3), (0, 0), (2, 2), (2, 1), (2, 6), (2, 7)],
        [(2, 3), (2, 2), (5, 2), (5, 1), (5, 0), (2, 1), (2, 0)],
    )
    for sample in samples:
        p1 = list(indices_to_contiguous_pairs(sample))
        print(sample)
        print(p1)

        iterc = IterContiguous(sample)
        p2 = list(iterc.iter())
        print(p2)
list(indices_to_contiguous_pairs(sample)) + print(sample) + print(p1) + + + iterc = IterContiguous(sample) + p2 = list(iterc.iter()) + print(p2) \ No newline at end of file diff --git a/src/__init__.pyi b/src/__init__.pyi index 4e3cf22d..afb6a5db 100644 --- a/src/__init__.pyi +++ b/src/__init__.pyi @@ -46,6 +46,12 @@ class BlockIndex: def iter_select(self, __key: tp.Union[slice, np.ndarray, tp.List[int]], ) -> tp.Iterator[tp.Tuple[int, int]]: ... + def iter_contiguous(self, + __key: tp.Union[slice, np.ndarray, tp.List[int]], + ascending: bool = False, + reduce: bool = False, + ) -> tp.Iterator[tp.Tuple[int, int]]: ... + def iterable_str_to_array_1d( iterable: tp.Iterable[str], diff --git a/src/_arraykit.c b/src/_arraykit.c index 06aa706d..566f0788 100644 --- a/src/_arraykit.c +++ b/src/_arraykit.c @@ -3267,7 +3267,7 @@ shape_filter(PyObject *Py_UNUSED(m), PyObject *a) { npy_intp size0 = PyArray_DIM(array, 0); // If 1D array, set size for axis 1 at 1, else use 2D array to get the size of axis 1 npy_intp size1 = PyArray_NDIM(array) == 1 ? 1 : PyArray_DIM(array, 1); - return Py_BuildValue("ii", size0, size1); + return Py_BuildValue("nn", size0, size1); } // Reshape if necessary a flat ndim 1 array into a 2D array with one columns and rows of length. @@ -3324,19 +3324,57 @@ row_1d_filter(PyObject *Py_UNUSED(m), PyObject *a) } -// Convert any slice to an ascending slice that covers the same values. -static PyObject * -slice_to_ascending_slice(PyObject *Py_UNUSED(m), PyObject *args) { +// Returns a new ref; returns NULL on error. Any start or stop less than 0 will be set to NULL. 
+static inline PyObject* +AK_build_slice(Py_ssize_t start, Py_ssize_t stop, Py_ssize_t step) +{ + PyObject* py_start = NULL; + PyObject* py_stop = NULL; + PyObject* py_step = NULL; - PyObject* slice; - PyObject* size; - if (!PyArg_ParseTuple(args, - "O!O!:slice_to_ascending_slice", - &PySlice_Type, &slice, - &PyLong_Type, &size)) { - return NULL; + if (start >= 0) { + py_start = PyLong_FromSsize_t(start); + if (py_start == NULL) {return NULL;} + } + if (stop >= 0) { + py_stop = PyLong_FromSsize_t(stop); + if (py_stop == NULL) {return NULL;} } + // do not set a step if not necessary + if (step != 0 && step != 1) { + py_step = PyLong_FromSsize_t(step); + if (py_step == NULL) {return NULL;} + } + + // might be NULL, let return + PyObject* new = PySlice_New(py_start, py_stop, py_step); + + Py_XDECREF(py_start); + Py_XDECREF(py_stop); + Py_XDECREF(py_step); + + return new; +} +// Given inclusive start, end indices, returns a new reference to a slice. Returns NULL on error. If `reduce` is True, single width slices return an integer. +static inline PyObject* +AK_build_slice_inclusive(Py_ssize_t start, Py_ssize_t end, bool reduce) +{ + if (reduce && start == end) { + return PyLong_FromSsize_t(start); // new ref + } + // assert(start >= 0); + if (start <= end) { + return AK_build_slice(start, end + 1, 1); + } + // end of 0 goes to -1, gets converted to None + return AK_build_slice(start, end - 1, -1); +} + +// Utility function for converting slices; returns NULL on error; returns a new reference. 
+static inline PyObject* +AK_slice_to_ascending_slice(PyObject* slice, Py_ssize_t size) +{ Py_ssize_t step_count = -1; Py_ssize_t start = 0; Py_ssize_t stop = 0; @@ -3350,24 +3388,32 @@ slice_to_ascending_slice(PyObject *Py_UNUSED(m), PyObject *args) { return slice; } step_count = PySlice_AdjustIndices( - PyLong_AsSsize_t(size), + size, &start, &stop, step); - PyObject* asc_stop = PyLong_FromSsize_t(start + 1); // step will be negative; shift original start value down to find new start - PyObject* asc_start = PyLong_FromSsize_t(start + (step * (step_count - 1))); - PyObject* asc_step = PyLong_FromSsize_t(-step); - - // might be NULL, let return - PyObject* asc = PySlice_New(asc_start, asc_stop, asc_step); + return AK_build_slice( + start + (step * (step_count - 1)), + start + 1, + -step); +} - Py_DECREF(asc_start); - Py_DECREF(asc_stop); - Py_DECREF(asc_step); +// Convert any slice to an ascending slice that covers the same values. +static PyObject * +slice_to_ascending_slice(PyObject *Py_UNUSED(m), PyObject *args) { - return asc; + PyObject* slice; + PyObject* size; + if (!PyArg_ParseTuple(args, + "O!O!:slice_to_ascending_slice", + &PySlice_Type, &slice, + &PyLong_Type, &size)) { + return NULL; + } + // will delegate NULL on eroror + return AK_slice_to_ascending_slice(slice, PyLong_AsSsize_t(size)); } //------------------------------------------------------------------------------ @@ -3443,8 +3489,7 @@ first_true_1d(PyObject *Py_UNUSED(m), PyObject *args, PyObject *kwargs) if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O!|$p:first_true_1d", first_true_1d_kwarg_names, - &PyArray_Type, - &array, + &PyArray_Type, &array, &forward )) { return NULL; @@ -3458,7 +3503,7 @@ first_true_1d(PyObject *Py_UNUSED(m), PyObject *args, PyObject *kwargs) return NULL; } if (!PyArray_IS_C_CONTIGUOUS(array)) { - PyErr_SetString(PyExc_ValueError, "Array must be continguous"); + PyErr_SetString(PyExc_ValueError, "Array must be contiguous"); return NULL; } @@ -3700,8 +3745,6 @@ 
first_true_2d(PyObject *Py_UNUSED(m), PyObject *args, PyObject *kwargs) } - - static PyObject * dtype_from_element(PyObject *Py_UNUSED(m), PyObject *arg) { @@ -4178,7 +4221,7 @@ typedef struct BlockIndexObject { Py_ssize_t bir_capacity; BlockIndexRecord* bir; PyArray_Descr* dtype; - int8_t shape_recache; + bool shape_recache; PyObject* shape; } BlockIndexObject; @@ -4194,16 +4237,6 @@ AK_BI_item(BlockIndexObject* self, Py_ssize_t i) { return Py_BuildValue("nn", biri->block, biri->column); // maybe NULL } -// Returns a new reference to tuple. Returns NULL on error. Supports negative numbers up to negative length. -static inline PyObject* -AK_BI_item_wraps(BlockIndexObject* self, Py_ssize_t i) -{ - if (i < 0) { - i = self->bir_count + i; - } - return AK_BI_item(self, i); -} - //------------------------------------------------------------------------------ // BI Iterator static PyTypeObject BIIterType; @@ -4291,13 +4324,13 @@ static PyTypeObject BIIterType = { // BI Iterator sequence selection static PyTypeObject BIIterSeqType; static PyTypeObject BIIterSliceType; -static PyTypeObject BIIterBooleanType; +static PyTypeObject BIIterBoolType; typedef enum BIIterSelectorKind { - BIIS_SEQUENCE, // BIIterSeqType + BIIS_SEQ, // BIIterSeqType BIIS_SLICE, - BIIS_BOOLEAN, // BIIterBooleanType + BIIS_BOOLEAN, // BIIterBoolType BIIS_UNKNOWN } BIIterSelectorKind; @@ -4306,7 +4339,8 @@ static PyObject * BIIterSelector_new(BlockIndexObject *bi, PyObject* selector, int8_t reversed, - BIIterSelectorKind kind + BIIterSelectorKind kind, + int8_t ascending ); typedef struct BIIterSeqObject { @@ -4332,20 +4366,22 @@ BIIterSeq_iter(BIIterSeqObject *self) { return self; } -static PyObject * -BIIterSeq_iternext(BIIterSeqObject *self) { +// Returns -1 on end of sequence; return -1 with exception set on +static inline Py_ssize_t +BIIterSeq_iternext_core(BIIterSeqObject *self) +{ Py_ssize_t i; if (self->reversed) { i = self->len - ++self->pos; if (i < 0) { - return NULL; + return -1; } } else { 
i = self->pos++; } if (self->len <= i) { - return NULL; + return -1; } // use i to get index from selector Py_ssize_t t = 0; @@ -4385,15 +4421,34 @@ BIIterSeq_iternext(BIIterSeqObject *self) { } else { PyErr_SetString(PyExc_TypeError, "element type not suitable for indexing"); - return NULL; + return -1; } } - return AK_BI_item_wraps(self->bi, t); // return new ref + if (t < 0) { + t = self->bi->bir_count + t; + } + // we have to ensure valid range here to set an index error and distinguish from end of iteration + if (!((size_t)t < (size_t)self->bi->bir_count)) { + PyErr_SetString(PyExc_IndexError, "index out of range"); + return -1; + } + return t; +} + + +static PyObject * +BIIterSeq_iternext(BIIterSeqObject *self) +{ + Py_ssize_t i = BIIterSeq_iternext_core(self); + if (i == -1) { + return NULL; // an error is set + } + return AK_BI_item(self->bi, i); // return new ref } static PyObject * BIIterSeq_reversed(BIIterSeqObject *self) { - return BIIterSelector_new(self->bi, self->selector, !self->reversed, BIIS_SEQUENCE); + return BIIterSelector_new(self->bi, self->selector, !self->reversed, BIIS_SEQ, 0); } static PyObject * @@ -4447,20 +4502,32 @@ BIIterSlice_iter(BIIterSliceObject *self) { return self; } -static PyObject * -BIIterSlice_iternext(BIIterSliceObject *self) { +static inline Py_ssize_t +BIIterSlice_iternext_core(BIIterSliceObject *self) +{ if (self->len == 0 || self->count >= self->len) { - return NULL; + return -1; } Py_ssize_t i = self->pos; self->pos += self->step; self->count++; // by counting index we we do not need to compare to stop + // i will never be out of range + return i; +} + +// NOTE: this does not use `reversed`, as pos, step, and count are set in BIIterSelector_new +static PyObject * +BIIterSlice_iternext(BIIterSliceObject *self) { + Py_ssize_t i = BIIterSlice_iternext_core(self); + if (i == -1) { + return NULL; + } return AK_BI_item(self->bi, i); // return new ref } static PyObject * BIIterSlice_reversed(BIIterSliceObject *self) { - 
return BIIterSelector_new(self->bi, self->selector, !self->reversed, BIIS_SLICE); + return BIIterSelector_new(self->bi, self->selector, !self->reversed, BIIS_SLICE, 0); } static PyObject * @@ -4511,8 +4578,9 @@ BIIterBoolean_iter(BIIterBooleanObject *self) { return self; } -static PyObject * -BIIterBoolean_iternext(BIIterBooleanObject *self) { +static inline Py_ssize_t +BIIterBoolean_iternext_core(BIIterBooleanObject *self) +{ npy_bool v = 0; Py_ssize_t i = -1; PyArrayObject* a = (PyArrayObject*) self->selector; @@ -4540,24 +4608,32 @@ BIIterBoolean_iternext(BIIterBooleanObject *self) { } } if (i != -1) { - return AK_BI_item(self->bi, i); // return new ref + return i; } - return NULL; // no True remain + return -1; // no True remain +} + +static PyObject * +BIIterBoolean_iternext(BIIterBooleanObject *self) { + Py_ssize_t i = BIIterBoolean_iternext_core(self); + if (i == -1) { + return NULL; + } + return AK_BI_item(self->bi, i); // return new ref } static PyObject * BIIterBoolean_reversed(BIIterBooleanObject *self) { - return BIIterSelector_new(self->bi, self->selector, !self->reversed, BIIS_BOOLEAN); + return BIIterSelector_new(self->bi, self->selector, !self->reversed, BIIS_BOOLEAN, 0); } // NOTE: no length hint given as we would have to traverse whole array and count True... not sure it is worht it. 
static PyMethodDef BIiterBoolean_methods[] = { - // {"__length_hint__", (PyCFunction)BIIterBoolean_length_hint, METH_NOARGS, NULL}, {"__reversed__", (PyCFunction)BIIterBoolean_reversed, METH_NOARGS, NULL}, {NULL}, }; -static PyTypeObject BIIterBooleanType = { +static PyTypeObject BIIterBoolType = { PyVarObject_HEAD_INIT(NULL, 0) .tp_basicsize = sizeof(BIIterBooleanObject), .tp_dealloc = (destructor) BIIterBoolean_dealloc, @@ -4567,17 +4643,194 @@ static PyTypeObject BIIterBooleanType = { .tp_name = "arraykit.BlockIndexIteratorBoolean", }; +//------------------------------------------------------------------------------ +// BI Iterator Contigous +static PyTypeObject BIIterContiguousType; + +typedef struct BIIterContiguousObject { + PyObject_VAR_HEAD + BlockIndexObject *bi; + PyObject* iter; // own refernce to core iterator + int8_t reversed; + Py_ssize_t last_block; + Py_ssize_t last_column; + Py_ssize_t next_block; + Py_ssize_t next_column; + bool reduce; // optionally reduce slices to integers +} BIIterContiguousObject; + +static PyObject * +BIIterContiguous_new(BlockIndexObject *bi, + int8_t reversed, + PyObject* iter, + bool reduce) +{ + BIIterContiguousObject *bii = PyObject_New(BIIterContiguousObject, &BIIterContiguousType); + if (!bii) { + return NULL; + } + Py_INCREF(bi); + bii->bi = bi; + Py_INCREF(iter); + bii->iter = iter; + bii->reversed = reversed; + + bii->last_block = -1; + bii->last_column = -1; + bii->next_block = -1; + bii->next_column = -1; + bii->reduce = reduce; + + return (PyObject *)bii; +} + +static void +BIIterContiguous_dealloc(BIIterContiguousObject *self) { + Py_DECREF(self->bi); + Py_DECREF(self->iter); + Py_TYPE(self)->tp_free((PyObject *)self); +} + +static BIIterContiguousObject * +BIIterContiguous_iter(BIIterContiguousObject *self) { + Py_INCREF(self); + return self; +} + +static PyObject * +BIIterContiguous_iternext(BIIterContiguousObject *self) { + Py_ssize_t i = -1; + PyObject* iter = self->iter; + PyTypeObject* type = 
Py_TYPE(iter); + + Py_ssize_t slice_start = -1; + Py_ssize_t block; + Py_ssize_t column; + + while (1) { + if (self->next_block == -2) { + break; // terminate + } + if (self->next_block != -1) { + // discontinuity found on last iteration, set new start + self->last_block = self->next_block; + self->last_column = slice_start = self->next_column; + self->next_block = self->next_column = -1; // clear next state + } + if (type == &BIIterSeqType) { + i = BIIterSeq_iternext_core((BIIterSeqObject*)iter); + } + else if (type == &BIIterSliceType) { + i = BIIterSlice_iternext_core((BIIterSliceObject*)iter); + } + else if (type == &BIIterBoolType) { + i = BIIterBoolean_iternext_core((BIIterBooleanObject*)iter); + } + if (i == -1) { // end of iteration or error + if (PyErr_Occurred()) { + break; + } + // no more pairs, return previous slice_start, flag for end on next call + self->next_block = -2; + return Py_BuildValue("nN", // N steals ref + self->last_block, + AK_build_slice_inclusive(slice_start, + self->last_column, + self->reduce)); + } + // i is gauranteed to be within the range of self->bit_count at this point; the only source of arbitrary indices is in BIIterSeq_iternext_core, and that function validates the range + BlockIndexRecord* biri = &self->bi->bir[i]; + block = biri->block; + column = biri->column; + + // inititialization + if (self->last_block == -1) { + self->last_block = block; + self->last_column = column; + slice_start = column; + continue; + } + if (self->last_block == block && llabs(column - self->last_column) == 1) { + // contiguious region found, can be postive or negative + self->last_column = column; + continue; + } + self->next_block = block; + self->next_column = column; + return Py_BuildValue("nN", // N steals ref + self->last_block, + AK_build_slice_inclusive(slice_start, + self->last_column, + self->reduce)); + } + return NULL; +} + +static PyObject * +BIIterContiguous_reversed(BIIterContiguousObject *self) { + + PyObject* selector = NULL; + 
PyTypeObject* type = Py_TYPE(self->iter); + if (type == &BIIterSeqType) { + selector = ((BIIterSeqObject*)self->iter)->selector; + } + else if (type == &BIIterSliceType) { + selector = ((BIIterSliceObject*)self->iter)->selector; + } + else if (type == &BIIterBoolType) { + selector = ((BIIterBooleanObject*)self->iter)->selector; + } + + if (selector == NULL) { + return NULL; + } + + PyObject* iter = BIIterSelector_new(self->bi, + selector, + !self->reversed, + BIIS_UNKNOWN, // let type be determined by selector + 0); + PyObject* biiter = BIIterContiguous_new(self->bi, + !self->reversed, + self->iter, + self->reduce); + Py_DECREF(iter); + return biiter; +} + + +// not implementing __length_hint__ +static PyMethodDef BIIterContiguous_methods[] = { + {"__reversed__", (PyCFunction)BIIterContiguous_reversed, METH_NOARGS, NULL}, + {NULL}, +}; + + +static PyTypeObject BIIterContiguousType = { + PyVarObject_HEAD_INIT(NULL, 0) + .tp_basicsize = sizeof(BIIterContiguousObject), + .tp_dealloc = (destructor) BIIterContiguous_dealloc, + .tp_iter = (getiterfunc) BIIterContiguous_iter, + .tp_iternext = (iternextfunc) BIIterContiguous_iternext, + .tp_methods = BIIterContiguous_methods, + .tp_name = "arraykit.BlockIndexContiguousIterator", +}; + + //------------------------------------------------------------------------------ // NOTE: this constructor returns one of three different PyObject types. We do this to consolidate error reporting and type checks. 
+// The ascending argument is applied before consideration of a reverse iterator static PyObject * BIIterSelector_new(BlockIndexObject *bi, PyObject* selector, int8_t reversed, - BIIterSelectorKind kind - ) { + BIIterSelectorKind kind, + int8_t ascending) { int8_t is_array = 0; + int8_t incref_selector = 1; // incref borrowed selector; but if a new ref is made, do not + Py_ssize_t len = -1; Py_ssize_t pos = 0; Py_ssize_t stop = 0; @@ -4595,10 +4848,11 @@ BIIterSelector_new(BlockIndexObject *bi, return NULL; } len = PyArray_SIZE(a); + char k = PyArray_DESCR(a)->kind; if (kind == BIIS_UNKNOWN) { if (k == 'i' || k == 'u') { - kind = BIIS_SEQUENCE; + kind = BIIS_SEQ; } else if (k == 'b') { kind = BIIS_BOOLEAN; @@ -4608,7 +4862,7 @@ BIIterSelector_new(BlockIndexObject *bi, return NULL; } } - else if (kind == BIIS_SEQUENCE && k != 'i' && k != 'u') { + else if (kind == BIIS_SEQ && k != 'i' && k != 'u') { PyErr_SetString(PyExc_TypeError, "Arrays must integer kind"); return NULL; } @@ -4620,6 +4874,15 @@ BIIterSelector_new(BlockIndexObject *bi, PyErr_SetString(PyExc_TypeError, "Boolean arrays must match BlockIndex size"); return NULL; } + if (ascending) { + // NOTE: we can overwrite selector here as we have a borrowed refernce; sorting gives us a new reference, so we do not need to incref below + selector = PyArray_NewCopy(a, NPY_CORDER); + // sort in-place; can use a non-stable sort + if (PyArray_Sort((PyArrayObject*)selector, 0, NPY_QUICKSORT)) { + return NULL; // returns -1 on error + }; // new ref + incref_selector = 0; + } } else if (PySlice_Check(selector)) { if (kind == BIIS_UNKNOWN) { @@ -4629,10 +4892,18 @@ BIIterSelector_new(BlockIndexObject *bi, PyErr_SetString(PyExc_TypeError, "Slices cannot be used as selectors for this type of iterator"); return NULL; } + + if (ascending) { + // NOTE: we are abandoning the borrowed reference + selector = AK_slice_to_ascending_slice(selector, bi->bir_count); // new ref + incref_selector = 0; + } + if (PySlice_Unpack(selector, 
&pos, &stop, &step)) { return NULL; } len = PySlice_AdjustIndices(bi->bir_count, &pos, &stop, step); + if (reversed) { pos += (step * (len - 1)); step *= -1; @@ -4640,13 +4911,27 @@ BIIterSelector_new(BlockIndexObject *bi, } else if (PyList_CheckExact(selector)) { if (kind == BIIS_UNKNOWN) { - kind = BIIS_SEQUENCE; + kind = BIIS_SEQ; } - else if (kind != BIIS_SEQUENCE) { + else if (kind != BIIS_SEQ) { PyErr_SetString(PyExc_TypeError, "Lists cannot be used as for non-sequence iterators"); return NULL; } len = PyList_GET_SIZE(selector); + + if (ascending) { + // abandoning borrowed ref + selector = PyObject_CallMethod(selector, "copy", NULL); // new ref + if (selector == NULL) { + return NULL; + } + PyObject* post = PyObject_CallMethod(selector, "sort", NULL); // new ref + if (post == NULL) { + return NULL; + } + Py_DECREF(post); // just a None + incref_selector = 0; + } } else { PyErr_SetString(PyExc_TypeError, "Input type not supported"); @@ -4655,7 +4940,7 @@ BIIterSelector_new(BlockIndexObject *bi, PyObject *bii = NULL; switch (kind) { - case BIIS_SEQUENCE: { + case BIIS_SEQ: { BIIterSeqObject* it = PyObject_New(BIIterSeqObject, &BIIterSeqType); if (it == NULL) {goto error;} it->bi = bi; @@ -4681,7 +4966,7 @@ BIIterSelector_new(BlockIndexObject *bi, break; } case BIIS_BOOLEAN: { - BIIterBooleanObject* it = PyObject_New(BIIterBooleanObject, &BIIterBooleanType); + BIIterBooleanObject* it = PyObject_New(BIIterBooleanObject, &BIIterBoolType); if (it == NULL) {goto error;} it->bi = bi; it->selector = selector; @@ -4695,14 +4980,16 @@ BIIterSelector_new(BlockIndexObject *bi, goto error; // should not get here! 
} Py_INCREF(bi); - Py_INCREF(selector); + if (incref_selector) { + Py_INCREF(selector); + } return bii; -error: - // nothing shold be increfed when we get here +error: // nothing shold be increfed when we get here return NULL; } //------------------------------------------------------------------------------ +// block index new, init, memory // Returns 0 on succes, -1 on error. int @@ -4784,7 +5071,7 @@ BlockIndex_init(PyObject *self, PyObject *args, PyObject *kwargs) { bi->bir_count = bir_count; bi->bir_capacity = bir_capacity; - bi->shape_recache = 1; // always init to true + bi->shape_recache = true; // always init to true bi->shape = NULL; // Load the bi->bir struct array, if defined @@ -4811,7 +5098,6 @@ BlockIndex_init(PyObject *self, PyObject *args, PyObject *kwargs) { return -1; } } - return 0; } @@ -4826,16 +5112,7 @@ BlockIndex_dealloc(BlockIndexObject *self) { Py_TYPE(self)->tp_free((PyObject *)self); } -static PyObject * -BlockIndex_repr(BlockIndexObject *self) { - PyObject* dt = self->dtype == NULL ? Py_None : (PyObject*) self->dtype; - return PyUnicode_FromFormat("<%s(blocks: %i, rows: %i, columns: %i, dtype: %R)>", - Py_TYPE(self)->tp_name, - self->block_count, - self->row_count, - self->bir_count, - dt); -} +//------------------------------------------------------------------------------ // Returns NULL on error, True if the block should be reatained, False if the block has zero columns and should not be retained. This checks and raises on non-array inputs, dimensions other than 1 or 2, and mis-aligned columns. 
static PyObject * @@ -4873,7 +5150,7 @@ BlockIndex_register(BlockIndexObject *self, PyObject *value) { PyArray_Descr* dt = PyArray_DESCR(a); // borrowed ref - self->shape_recache = 1; // adjusting columns, must recache shape + self->shape_recache = true; // adjusting columns, must recache shape if (self->dtype == NULL) { // if not already set Py_INCREF((PyObject*)dt); @@ -4900,6 +5177,8 @@ BlockIndex_register(BlockIndexObject *self, PyObject *value) { Py_RETURN_TRUE; } +//------------------------------------------------------------------------------ +// exporters static PyObject* BlockIndex_to_list(BlockIndexObject *self, PyObject *Py_UNUSED(unused)) { @@ -4921,7 +5200,6 @@ BlockIndex_to_list(BlockIndexObject *self, PyObject *Py_UNUSED(unused)) { return list; } - // Returns NULL on error static PyObject* AK_BI_to_bytes(BlockIndexObject *self) { @@ -4931,13 +5209,14 @@ AK_BI_to_bytes(BlockIndexObject *self) { return bytes; } - // Returns NULL on error static PyObject* BlockIndex_to_bytes(BlockIndexObject *self, PyObject *Py_UNUSED(unused)) { return AK_BI_to_bytes(self); } +//------------------------------------------------------------------------------ +// pickle support // Returns NULL on error, PyObject* otherwise. 
static PyObject* @@ -4974,42 +5253,8 @@ BlockIndex_setstate(BlockIndexObject *self, PyObject *state) Py_RETURN_NONE; } - -static PyObject * -BlockIndex_copy(BlockIndexObject *self, PyObject *Py_UNUSED(unused)) -{ - PyTypeObject* cls = Py_TYPE(self); // borrowed ref - BlockIndexObject *bi = (BlockIndexObject *)cls->tp_alloc(cls, 0); - if (bi == NULL) { - return NULL; - } - bi->block_count = self->block_count; - bi->row_count = self->row_count; - bi->bir_count = self->bir_count; - bi->bir_capacity = self->bir_capacity; - - bi->shape_recache = 1; // could copy, but do not want to copy a pending cache state - bi->shape = NULL; - - bi->bir = NULL; - AK_BI_BIR_new(bi); // do initial alloc to self->bir_capacity - memcpy(bi->bir, - self->bir, - self->bir_count * sizeof(BlockIndexRecord)); - - bi->dtype = NULL; - if (self->dtype != NULL) { - bi->dtype = self->dtype; - Py_INCREF((PyObject*)bi->dtype); - } - return (PyObject *)bi; -} - -static PyObject* -BlockIndex_iter(BlockIndexObject* self) { - return BIIter_new(self, 0); -} - +//------------------------------------------------------------------------------ +// getters static PyObject * BlockIndex_shape_getter(BlockIndexObject *self, void* Py_UNUSED(closure)) @@ -5020,7 +5265,7 @@ BlockIndex_shape_getter(BlockIndexObject *self, void* Py_UNUSED(closure)) } // shape is not null and shape_recache is false Py_INCREF(self->shape); // for caller - self->shape_recache = 0; + self->shape_recache = false; return self->shape; } @@ -5045,7 +5290,6 @@ BlockIndex_dtype_getter(BlockIndexObject *self, void* Py_UNUSED(closure)){ return (PyObject*)PyArray_DescrFromType(NPY_FLOAT64); } - static struct PyGetSetDef BlockIndex_getset[] = { {"shape", (getter)BlockIndex_shape_getter, NULL, NULL, NULL}, {"rows", (getter)BlockIndex_rows_getter, NULL, NULL, NULL}, @@ -5054,6 +5298,49 @@ static struct PyGetSetDef BlockIndex_getset[] = { {NULL}, }; +//------------------------------------------------------------------------------ +// general methods + 
+static PyObject * +BlockIndex_repr(BlockIndexObject *self) { + PyObject* dt = self->dtype == NULL ? Py_None : (PyObject*) self->dtype; + return PyUnicode_FromFormat("<%s(blocks: %i, rows: %i, columns: %i, dtype: %R)>", + Py_TYPE(self)->tp_name, + self->block_count, + self->row_count, + self->bir_count, + dt); +} + +static PyObject * +BlockIndex_copy(BlockIndexObject *self, PyObject *Py_UNUSED(unused)) +{ + PyTypeObject* cls = Py_TYPE(self); // borrowed ref + BlockIndexObject *bi = (BlockIndexObject *)cls->tp_alloc(cls, 0); + if (bi == NULL) { + return NULL; + } + bi->block_count = self->block_count; + bi->row_count = self->row_count; + bi->bir_count = self->bir_count; + bi->bir_capacity = self->bir_capacity; + + bi->shape_recache = true; // could copy, but do not want to copy a pending cache state + bi->shape = NULL; + + bi->bir = NULL; + AK_BI_BIR_new(bi); // do initial alloc to self->bir_capacity + memcpy(bi->bir, + self->bir, + self->bir_count * sizeof(BlockIndexRecord)); + + bi->dtype = NULL; + if (self->dtype != NULL) { + bi->dtype = self->dtype; + Py_INCREF((PyObject*)bi->dtype); + } + return (PyObject *)bi; +} static Py_ssize_t BlockIndex_length(BlockIndexObject *self){ @@ -5100,13 +5387,55 @@ BlockIndex_get_column(BlockIndexObject *self, PyObject *key){ return NULL; } +//------------------------------------------------------------------------------ +// iterators + +static PyObject* +BlockIndex_iter(BlockIndexObject* self) { + return BIIter_new(self, 0); +} // Given key, return an iterator of a selection. static PyObject* BlockIndex_iter_select(BlockIndexObject *self, PyObject *selector){ - return BIIterSelector_new(self, selector, 0, BIIS_UNKNOWN); + return BIIterSelector_new(self, selector, 0, BIIS_UNKNOWN, 0); } +static char *iter_contiguous_kargs_names[] = { + "selector", + "ascending", + "reduce", + NULL +}; + +// Given key, return an iterator of a selection. 
+static PyObject* +BlockIndex_iter_contiguous(BlockIndexObject *self, PyObject *args, PyObject *kwargs) +{ + PyObject* selector; + int ascending = 0; + int reduce = 0; + + if (!PyArg_ParseTupleAndKeywords(args, kwargs, + "O|$pp:iter_contiguous", + iter_contiguous_kargs_names, + &selector, + &ascending, + &reduce + )) { + return NULL; + } + + // might need to store enum type for branching + PyObject* iter = BIIterSelector_new(self, selector, 0, BIIS_UNKNOWN, ascending); + PyObject* biiter = BIIterContiguous_new(self, 0, iter, reduce); // will incref iter + Py_DECREF(iter); + + return biiter; +} + +//------------------------------------------------------------------------------ +// slot / method def static PySequenceMethods BlockIndex_as_sequece = { .sq_length = (lenfunc)BlockIndex_length, @@ -5125,6 +5454,10 @@ static PyMethodDef BlockIndex_methods[] = { {"get_block", (PyCFunction) BlockIndex_get_block, METH_O, NULL}, {"get_column", (PyCFunction) BlockIndex_get_column, METH_O, NULL}, {"iter_select", (PyCFunction) BlockIndex_iter_select, METH_O, NULL}, + {"iter_contiguous", + (PyCFunction) BlockIndex_iter_contiguous, + METH_VARARGS | METH_KEYWORDS, + NULL}, // {"__getnewargs__", (PyCFunction)BlockIndex_getnewargs, METH_NOARGS, NULL}, {NULL}, }; @@ -5501,7 +5834,7 @@ PyInit__arraykit(void) PyType_Ready(&BIIterType) || PyType_Ready(&BIIterSeqType) || PyType_Ready(&BIIterSliceType) || - PyType_Ready(&BIIterBooleanType) || + PyType_Ready(&BIIterBoolType) || PyType_Ready(&ArrayGOType) || PyModule_AddObject(m, "BlockIndex", (PyObject *) &BlockIndexType) || PyModule_AddObject(m, "ArrayGO", (PyObject *) &ArrayGOType) || diff --git a/test/test_block_index.py b/test/test_block_index.py index 11eea8b2..9d52dd37 100644 --- a/test/test_block_index.py +++ b/test/test_block_index.py @@ -531,6 +531,7 @@ def test_block_index_iter_select_boolean_c(self) -> None: ) #--------------------------------------------------------------------------- + def 
test_block_index_iter_select_sequence_a(self) -> None: bi1 = BlockIndex() bi1.register(np.arange(4).reshape(2,2)) @@ -551,4 +552,156 @@ def test_block_index_iter_select_sequence_b(self) -> None: bi1.register(np.arange(10).reshape(2,5)) with self.assertRaises(IndexError): - _ = list(bi1.iter_select([-9])) \ No newline at end of file + _ = list(bi1.iter_select([-9])) + + + + def test_block_index_iter_select_sequence_c(self) -> None: + bi1 = BlockIndex() + bi1.register(np.arange(4).reshape(2,2)) + bi1.register(np.arange(2)) + bi1.register(np.arange(10).reshape(2,5)) + + with self.assertRaises(TypeError): + _ = list(bi1.iter_select(['b', 'c'])) + + #--------------------------------------------------------------------------- + + def test_block_index_iter_contiguous_a(self) -> None: + bi1 = BlockIndex() + bi1.register(np.arange(6).reshape(2,3)) + bi1.register(np.arange(2)) + bi1.register(np.arange(6).reshape(2,3)) + bi1.register(np.arange(2)) + + self.assertEqual( + list(bi1.iter_contiguous([1,2,6,7])), + [(0, slice(1, 3, None)), (2, slice(2, 3, None)), (3, slice(0, 1, None))] + ) + + self.assertEqual( + list(bi1.iter_contiguous([7,6,2,1])), + [(3, slice(0, 1, None)), (2, slice(2, 3, None)), (0, slice(2, 0, -1))] + ) + + self.assertEqual( + list(bi1.iter_contiguous([7, 6, 2, 1], ascending=True)), + [(0, slice(1, 3, None)), (2, slice(2, 3, None)), (3, slice(0, 1, None))] + ) + + + def test_block_index_iter_contiguous_b(self) -> None: + bi1 = BlockIndex() + bi1.register(np.arange(16).reshape(2,8)) + + self.assertEqual( + list(bi1.iter_contiguous([0,1,6,7])), + [(0, slice(0, 2, None)), (0, slice(6, 8, None))] + ) + self.assertEqual( + list(bi1.iter_contiguous(slice(None))), + [(0, slice(0, 8, None))] + ) + self.assertEqual( + list(bi1.iter_contiguous(slice(1, 6))), + [(0, slice(1, 6, None))] + ) + self.assertEqual( + list(bi1.iter_contiguous(slice(0, 8, 3))), + [(0, slice(0, 1, None)), (0, slice(3, 4, None)), (0, slice(6, 7, None))] + ) + self.assertEqual( + 
list(bi1.iter_contiguous(slice(0, 8, 3), reduce=True)), + [(0, 0), (0, 3), (0, 6)] + ) + + def test_block_index_iter_contiguous_c(self) -> None: + bi1 = BlockIndex() + bi1.register(np.arange(16).reshape(2,8)) + + with self.assertRaises(TypeError): + list(bi1.iter_contiguous([0,1,6,7], False)) + + + def test_block_index_iter_contiguous_d(self) -> None: + bi1 = BlockIndex() + bi1.register(np.arange(8).reshape(2,4)) + bi1.register(np.arange(8).reshape(2,4)) + + self.assertEqual( + list(bi1.iter_contiguous(slice(7,1,-1))), + [(1, slice(3, None, -1)), (0, slice(3, 1, -1))] + ) + + self.assertEqual( + list(bi1.iter_contiguous(slice(7,1,-1), ascending=True)), + [(0, slice(2, 4)), (1, slice(0, 4))] + ) + + self.assertEqual( + list(bi1.iter_contiguous(slice(8,1,-1), ascending=True)), + [(0, slice(2, 4)), (1, slice(0, 4))] + ) + + self.assertEqual( + list(bi1.iter_contiguous(slice(8,None,-1), ascending=True)), + [(0, slice(0, 4)), (1, slice(0, 4))] + ) + + def test_block_index_iter_contiguous_e1(self) -> None: + bi1 = BlockIndex() + bi1.register(np.arange(2)) + bi1.register(np.arange(2)) + bi1.register(np.arange(2)) + bi1.register(np.arange(2)) + bi1.register(np.arange(2)) + bi1.register(np.arange(2)) + bi1.register(np.arange(2)) + bi1.register(np.arange(2)) + + self.assertEqual( + list(bi1.iter_contiguous([6, 0, 7])), + [(6, slice(0, 1)), (0, slice(0, 1)), (7, slice(0, 1))] + ) + self.assertEqual( + list(bi1.iter_contiguous([6, 0, 7], ascending=True)), + [(0, slice(0, 1)), (6, slice(0, 1)), (7, slice(0, 1))] + ) + + self.assertEqual( + list(bi1.iter_contiguous(np.array([6, 0, 7]))), + [(6, slice(0, 1)), (0, slice(0, 1)), (7, slice(0, 1))] + ) + self.assertEqual( + list(bi1.iter_contiguous(np.array([6, 0, 7]), ascending=True)), + [(0, slice(0, 1)), (6, slice(0, 1)), (7, slice(0, 1))] + ) + + def test_block_index_iter_contiguous_e2(self) -> None: + bi1 = BlockIndex() + bi1.register(np.arange(2)) + bi1.register(np.arange(2)) + bi1.register(np.arange(2)) + 
bi1.register(np.arange(2)) + bi1.register(np.arange(2)) + bi1.register(np.arange(2)) + bi1.register(np.arange(2)) + bi1.register(np.arange(2)) + + self.assertEqual( + list(bi1.iter_contiguous([6, 0, 7], reduce=True)), + [(6, 0), (0, 0), (7, 0)] + ) + self.assertEqual( + list(bi1.iter_contiguous([6, 0, 7], ascending=True, reduce=True)), + [(0, 0), (6, 0), (7, 0)] + ) + + self.assertEqual( + list(bi1.iter_contiguous(np.array([6, 0, 7]), reduce=True)), + [(6, 0), (0, 0), (7, 0)] + ) + self.assertEqual( + list(bi1.iter_contiguous(np.array([6, 0, 7]), ascending=True, reduce=True)), + [(0, 0), (6, 0), (7, 0)] + ) diff --git a/test/test_util.py b/test/test_util.py index 43456c8b..b222172e 100644 --- a/test/test_util.py +++ b/test/test_util.py @@ -708,7 +708,7 @@ def test_first_true_2d_h(self) -> None: def test_slice_to_ascending_slice_a(self) -> None: self.assertEqual(slice_to_ascending_slice( slice(5, 2, -1), 6), - slice(3, 6, 1), + slice(3, 6, None), ) def test_slice_to_ascending_slice_b(self) -> None: