ENH/FIX: Allow very large values in relabel_sequential (#4612)
* Adding first draft of Cython relabel_sequential
Co-authored-by: Juan Nunez-Iglesias <juan.nunez-iglesias@monash.edu>

* fixed order of args for map_array

* Add ArrayMap str and repr

* Add __call__ method to ArrayMap object

* Add missing colon

* Add array representation for ArrayMap

* Use array representation to preserve relabel_sequential doctests

* Restore ValueError checks for relabel_sequential

* Fix PEP8 issues

* Ensure 0 always maps to 0

* Ensure dtype is correct for output

* Fix incorrect dtype for array being relabeled

The docs specify that only integer arrays are supported. This change
was added in gh-811 without any discussion, so it seems safe to revert.

* Fix dtype of output and map

* Inverse map dtype should actually match the input dtype

* Fix docstring for map_array, check for contiguous out

* Add docstring for ArrayMap

* Rename attributes in ArrayMap

* Add C++0x standard to the compiler options

* added missing paren in docstring

* Mention large size of returned array in the __array__ docstring

* adjusted sample array to fix doctest

Co-authored-by: Juan Nunez-Iglesias <juan.nunez-iglesias@monash.edu>

* added tests for dtype overflow

* test relabel_seq... works for large input

* Simplify checking of contiguous array

Co-Authored-By: Riadh Fezzani <rfezzani@gmail.com>

* Minor code updates from review

Co-Authored-By: Riadh Fezzani <rfezzani@gmail.com>

* Restore reshaping of output array when provided

* Add note about -std=c++0x flag

* Ensure reshape is zero-copy

* Clarify error message when output has incorrect shape

* Remove unused numpy import

* Update relabel_sequential docstring to indicate new output types

* Raise error if the input array has non-integer type

* Ensure arrays being mapped and value arrays have same dtype

* Add doctests for ArrayMap.__str__ and fix

* Rework logic for in-place reshaping

* Add test for error when labels dtype is not int

* Add test for incorrect map_array output shape

* Test discontiguous output array

* Add test for long reprs for ArrayMap

* Add test for calling ArrayMap

* Remove long-deprecated relabel_from_one

* Add len to ArrayMap

* Add support for setitem and scalar mapping

* Add support for slicing and updating map in-place

* Allow bool indices in arraymap getitem

Co-authored-by: Volker Hilsenstein <volker.hilsenstein@monash.edu>
Co-authored-by: Juan Nunez-Iglesias <juan.nunez-iglesias@monash.edu>
Co-authored-by: Riadh Fezzani <rfezzani@gmail.com>
4 people committed May 4, 2020
1 parent 06aa947 commit 4bff390
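
A rough usage sketch of the headline change (illustrative only, not part of the committed test suite): relabel_sequential now returns lightweight ArrayMap objects instead of dense lookup arrays, so a label value far larger than the number of labels no longer forces allocation of a map with max_label + 1 entries.

import numpy as np
from skimage.segmentation import relabel_sequential

# One label value is huge; the previous implementation would have tried
# to allocate a forward map with 10**12 + 1 entries (~8 TB of int64).
labels = np.array([0, 1, 10**12], dtype=np.int64)

relabeled, fw, inv = relabel_sequential(labels, offset=1)
print(relabeled)    # [0 1 2]
print(fw[10**12])   # 2 -- ArrayMap lookup, no dense table is built
print(inv[2])       # 1000000000000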
Showing 5 changed files with 364 additions and 38 deletions.
3 changes: 1 addition & 2 deletions skimage/segmentation/__init__.py
@@ -5,7 +5,7 @@
from ._quickshift import quickshift
from .boundaries import find_boundaries, mark_boundaries
from ._clear_border import clear_border
from ._join import join_segmentations, relabel_from_one, relabel_sequential
from ._join import join_segmentations, relabel_sequential
from ._watershed import watershed
from ._chan_vese import chan_vese
from .morphsnakes import (morphological_geodesic_active_contour,
@@ -24,7 +24,6 @@
'mark_boundaries',
'clear_border',
'join_segmentations',
'relabel_from_one',
'relabel_sequential',
'watershed',
'chan_vese',
265 changes: 232 additions & 33 deletions skimage/segmentation/_join.py
@@ -1,5 +1,6 @@
import numpy as np
from .._shared.utils import deprecated
from ._remap import _map_array


def join_segmentations(s1, s2):
@@ -43,15 +44,6 @@ def join_segmentations(s1, s2):
return j


@deprecated('relabel_sequential')
def relabel_from_one(label_field):
"""Convert labels in an arbitrary label field to {1, ... number_of_labels}.
This function is deprecated, see ``relabel_sequential`` for more.
"""
return relabel_sequential(label_field, offset=1)


def relabel_sequential(label_field, offset=1):
"""Relabel arbitrary labels to {`offset`, ... `offset` + number_of_labels}.
@@ -74,14 +66,14 @@ def relabel_sequential(label_field, offset=1):
{offset, ..., number_of_labels + offset - 1}.
The data type will be the same as `label_field`, except when
offset + number_of_labels causes overflow of the current data type.
forward_map : numpy array of int, shape ``(label_field.max() + 1,)``
forward_map : ArrayMap
The map from the original label space to the returned label
space. Can be used to re-apply the same mapping. See examples
for usage. The data type will be the same as `relabeled`.
inverse_map : 1D numpy array of int, of length offset + number of labels
for usage. The output data type will be the same as `relabeled`.
inverse_map : ArrayMap
The map from the new label space to the original space. This
can be used to reconstruct the original label field from the
relabeled one. The data type will be the same as `relabeled`.
relabeled one. The output data type will be the same as `label_field`.
Notes
-----
@@ -100,13 +92,20 @@
>>> relab, fw, inv = relabel_sequential(label_field)
>>> relab
array([1, 1, 2, 2, 3, 5, 4])
>>> fw
>>> print(fw)
ArrayMap:
1 → 1
5 → 2
8 → 3
42 → 4
99 → 5
>>> np.array(fw)
array([0, 1, 0, 0, 0, 2, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5])
>>> inv
>>> np.array(inv)
array([ 0, 1, 5, 8, 42, 99])
>>> (fw[label_field] == relab).all()
True
@@ -116,26 +115,226 @@
>>> relab
array([5, 5, 6, 6, 7, 9, 8])
"""
offset = int(offset)
if offset <= 0:
raise ValueError("Offset must be strictly positive.")
if np.min(label_field) < 0:
raise ValueError("Cannot relabel array that contains negative values.")
max_label = int(label_field.max()) # Ensure max_label is an integer
if not np.issubdtype(label_field.dtype, np.integer):
new_type = np.min_scalar_type(max_label)
label_field = label_field.astype(new_type)
labels = np.unique(label_field)
labels0 = labels[labels != 0]
new_max_label = offset - 1 + len(labels0)
new_labels0 = np.arange(offset, new_max_label + 1)
output_type = label_field.dtype
required_type = np.min_scalar_type(new_max_label)
if np.dtype(required_type).itemsize > np.dtype(label_field.dtype).itemsize:
offset = int(offset)
in_vals = np.unique(label_field)
if in_vals[0] == 0:
# always map 0 to 0
out_vals = np.concatenate(
[[0], np.arange(offset, offset+len(in_vals)-1)]
)
else:
out_vals = np.arange(offset, offset+len(in_vals))
input_type = label_field.dtype

# Some logic to determine the output type:
# - we don't want to return a smaller output type than the input type,
# ie if we get uint32 as labels input, don't return a uint8 array.
# - but, in some cases, using the input type could result in overflow. The
# input type could be a signed integer (e.g. int32) but
# `np.min_scalar_type` will always return an unsigned type. We check for
# that by casting the largest output value to the input type. If it is
# unchanged, we use the input type, else we use the unsigned minimum
# required type
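# For a concrete (illustrative) example: uint8 labels whose new maximum
# label is 300 get promoted, because np.min_scalar_type(300) is uint16,
# while int32 labels keep their type as long as the new maximum still
# fits in int32.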
required_type = np.min_scalar_type(out_vals[-1])
if input_type.itemsize < required_type.itemsize:
output_type = required_type
forward_map = np.zeros(max_label + 1, dtype=output_type)
forward_map[labels0] = new_labels0
inverse_map = np.zeros(new_max_label + 1, dtype=output_type)
inverse_map[offset:] = labels0
relabeled = forward_map[label_field]
return relabeled, forward_map, inverse_map
else:
if input_type.type(out_vals[-1]) == out_vals[-1]:
output_type = input_type
else:
output_type = required_type
out_array = np.empty(label_field.shape, dtype=output_type)
out_vals = out_vals.astype(output_type)
map_array(label_field, in_vals, out_vals, out=out_array)
fw_map = ArrayMap(in_vals, out_vals)
inv_map = ArrayMap(out_vals, in_vals)
return out_array, fw_map, inv_map


def map_array(input_arr, input_vals, output_vals, out=None):
"""Map values from input array from input_vals to output_vals.
Parameters
----------
input_arr : array of int, shape (M[, N][, P][, ...])
The input label image.
input_vals : array of int, shape (N,)
The values to map from.
output_vals : array, shape (N,)
The values to map to.
out : array, same shape as `input_arr`
The output array. Will be created if not provided. It should
have the same dtype as `output_vals`.
Returns
-------
out : array, same shape as `input_arr`
The array of mapped values.
"""

if not np.issubdtype(input_arr.dtype, np.integer):
raise TypeError(
'The dtype of an array to be remapped should be integer.'
)
# We ravel the input array for simplicity of iteration in Cython:
orig_shape = input_arr.shape
# NumPy docs for `np.ravel()` say:
# "When a view is desired in as many cases as possible,
# arr.reshape(-1) may be preferable."
input_arr = input_arr.reshape(-1)
if out is None:
out = np.empty(orig_shape, dtype=output_vals.dtype)
elif out.shape != orig_shape:
raise ValueError(
'If out array is provided, it should have the same shape as '
f'the input array. Input array has shape {orig_shape}, provided '
f'output array has shape {out.shape}.'
)
try:
out_view = out.view()
out_view.shape = (-1,) # no-copy reshape/ravel
except AttributeError: # if out strides are not compatible with 0-copy
raise ValueError(
'If out array is provided, it should be either contiguous '
f'or 1-dimensional. Got array with shape {out.shape} and '
f'strides {out.strides}.'
)

# ensure all arrays have matching types before sending to Cython
input_vals = input_vals.astype(input_arr.dtype, copy=False)
output_vals = output_vals.astype(out.dtype, copy=False)
_map_array(input_arr, out_view, input_vals, output_vals)
return out
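
As an aside on the contiguity check above (a standalone sketch of the NumPy behaviour it relies on, not code from this commit): assigning to .shape on a view never copies, and raises AttributeError when a zero-copy reshape is impossible, which is exactly what the try/except catches.

import numpy as np

a = np.empty((4, 3))
v = a.view()
v.shape = (-1,)                 # zero-copy flatten: works, a is contiguous

b = np.empty((4, 6))[:, ::2]    # non-contiguous view of every other column
try:
    bv = b.view()
    bv.shape = (-1,)            # incompatible strides -> AttributeError
except AttributeError:
    print('cannot flatten this array without copying')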


class ArrayMap:
"""Class designed to mimic mapping by NumPy array indexing.
This class is designed to replicate the use of NumPy arrays for mapping
values with indexing:
>>> values = np.array([0.25, 0.5, 1.0])
>>> indices = np.array([[0, 0, 1], [2, 2, 1]])
>>> values[indices]
array([[0.25, 0.25, 0.5 ],
[1. , 1. , 0.5 ]])
The issue with this indexing is that you need a very large ``values``
array if the values in the ``indices`` array are large.
>>> values = np.array([0.25, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1.0])
>>> indices = np.array([[0, 0, 10], [0, 10, 10]])
>>> values[indices]
array([[0.25, 0.25, 1. ],
[0.25, 1. , 1. ]])
Using this class, the approach is similar, but there is no need to
create a large values array:
>>> in_indices = np.array([0, 10])
>>> out_values = np.array([0.25, 1.0])
>>> values = ArrayMap(in_indices, out_values)
>>> values
ArrayMap(array([ 0, 10]), array([0.25, 1. ]))
>>> print(values)
ArrayMap:
0 → 0.25
10 → 1.0
>>> indices = np.array([[0, 0, 10], [0, 10, 10]])
>>> values[indices]
array([[0.25, 0.25, 1. ],
[0.25, 1. , 1. ]])
Parameters
----------
in_values : array of int, shape (N,)
The source values from which to map.
out_values : array, shape (N,)
The destination values to which to map.
"""
def __init__(self, in_values, out_values):
self.in_values = in_values
self.out_values = out_values
self._max_str_lines = 4
self._array = None

def __len__(self):
"""Return one more than the maximum label value being remapped."""
return np.max(self.in_values) + 1

def __array__(self, dtype=None):
"""Return an array that behaves like the arraymap when indexed.
This array can be very large: it is the size of the largest value
in the ``in_values`` array, plus one.
"""
if dtype is None:
dtype = self.out_values.dtype
output = np.zeros(np.max(self.in_values) + 1, dtype=dtype)
output[self.in_values] = self.out_values
return output

@property
def dtype(self):
return self.out_values.dtype

def __repr__(self):
return f'ArrayMap({repr(self.in_values)}, {repr(self.out_values)})'

def __str__(self):
if len(self.in_values) <= self._max_str_lines + 1:
rows = range(len(self.in_values))
string = '\n'.join(
['ArrayMap:'] +
[f' {self.in_values[i]}{self.out_values[i]}' for i in rows]
)
else:
rows0 = list(range(0, self._max_str_lines // 2))
rows1 = list(range(-self._max_str_lines // 2, 0))
string = '\n'.join(
['ArrayMap:'] +
[f' {self.in_values[i]}{self.out_values[i]}'
for i in rows0] +
[' ...'] +
[f' {self.in_values[i]}{self.out_values[i]}'
for i in rows1]
)
return string

def __call__(self, arr):
return self.__getitem__(arr)

def __getitem__(self, index):
scalar = np.isscalar(index)
if scalar:
index = np.array([index])
elif isinstance(index, slice):
start = index.start or 0 # treat None or 0 the same way
stop = (index.stop
if index.stop is not None
else len(self))
step = index.step
index = np.arange(start, stop, step)
if index.dtype == bool:
index = np.flatnonzero(index)

out = map_array(
index,
self.in_values.astype(index.dtype, copy=False),
self.out_values,
)

if scalar:
out = out[0]
return out

def __setitem__(self, indices, values):
if self._array is None:
self._array = self.__array__()
self._array[indices] = values
self.in_values = np.flatnonzero(self._array)
self.out_values = self._array[self.in_values]
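
To make the new public surface above concrete, a small usage sketch (illustrative only; note that in this commit map_array and ArrayMap live in skimage.segmentation._join and are not re-exported from skimage.segmentation):

import numpy as np
from skimage.segmentation._join import ArrayMap, map_array

labels = np.array([[0, 5, 5],
                   [9, 9, 0]], dtype=np.int64)

# Remap 5 -> 1 and 9 -> 2 without building a dense lookup table.
out = map_array(labels, np.array([0, 5, 9]), np.array([0, 1, 2]))

fw = ArrayMap(np.array([0, 5, 9]), np.array([0, 1, 2]))
print(fw[labels])      # same mapping applied with indexing syntax
print(fw(9))           # __call__ delegates to __getitem__ -> 2
print(len(fw))         # max input value + 1 -> 10
fw[7] = 3              # __setitem__ updates the mapping in place
print(np.array(fw))    # dense view: [0 0 0 0 0 1 0 3 0 2]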
24 changes: 24 additions & 0 deletions skimage/segmentation/_remap.pyx
@@ -0,0 +1,24 @@
# distutils: language = c++

from libcpp.unordered_map cimport unordered_map
cimport cython
from .._shared.fused_numerics cimport np_numeric, np_anyint

@cython.boundscheck(False) # Deactivate bounds checking
@cython.wraparound(False) # Deactivate negative indexing
def _map_array(np_anyint[:] inarr, np_numeric[:] outarr,
np_anyint[:] inval, np_numeric[:] outval):
# build the map from the input and output vectors
cdef size_t i, n_map, n_array
cdef unordered_map[np_anyint, np_numeric] lut
n_map = inval.shape[0]
for i in range(n_map):
lut[inval[i]] = outval[i]
# apply the map to the array
n_array = inarr.shape[0]
# The prange option gave some compilation warnings
# "Unsigned index type not allowed before OpenMP 3.0"
# and didn't seem to be any faster
# for i in prange(n_array, nogil=True): #
for i in range(n_array):
outarr[i] = lut[inarr[i]]
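
For readers who do not read Cython, a rough pure-Python equivalent of the kernel above (a reference sketch only -- the real version uses typed memoryviews and a C++ std::unordered_map, and maps unseen input values to 0 instead of raising):

import numpy as np

def _map_array_reference(inarr, outarr, inval, outval):
    """Build a lookup table from inval -> outval, then apply it
    elementwise to the flat input, writing into the flat output."""
    lut = {int(k): v for k, v in zip(inval, outval)}
    for i in range(inarr.shape[0]):
        # A dict raises KeyError for unmapped values; the C++
        # unordered_map in the Cython kernel returns 0 instead.
        outarr[i] = lut[int(inarr[i])]

labels = np.array([1, 5, 5, 8])
out = np.empty_like(labels)
_map_array_reference(labels, out, np.array([1, 5, 8]), np.array([10, 20, 30]))
print(out)   # [10 20 20 30]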
9 changes: 8 additions & 1 deletion skimage/segmentation/setup.py
@@ -14,7 +14,8 @@ def configuration(parent_package='', top_path=None):
cython(['_watershed_cy.pyx',
'_felzenszwalb_cy.pyx',
'_quickshift_cy.pyx',
'_slic.pyx'], working_path=base_path)
'_slic.pyx',
'_remap.pyx'], working_path=base_path)
config.add_extension('_watershed_cy', sources=['_watershed_cy.c'],
include_dirs=[get_numpy_include_dirs()])
config.add_extension('_felzenszwalb_cy', sources=['_felzenszwalb_cy.c'],
@@ -23,6 +24,12 @@
include_dirs=[get_numpy_include_dirs()])
config.add_extension('_slic', sources=['_slic.c'],
include_dirs=[get_numpy_include_dirs()])
# note: the extra compiler flag -std=c++0x is needed to access the
# std::unordered_map container on some earlier gcc compilers. See:
# https://stackoverflow.com/a/3973692/224254
config.add_extension('_remap', sources='_remap.cpp',
include_dirs=[get_numpy_include_dirs()],
language='c++', extra_compile_args=['-std=c++0x'])

return config

