ENH/FIX: Allow very large values in relabel_sequential (#4612)
* Adding first draft of Cython relabel_sequential
Co-authored-by: Juan Nunez-Iglesias <juan.nunez-iglesias@monash.edu>

* fixed order of args for map_array

* Add ArrayMap str and repr

* Add __call__ method to ArrayMap object

* Add missing colon

* Add array representation for ArrayMap

* Use array representation to preserve relabel_sequential doctests

* Restore ValueError checks for relabel_sequential

* Fix PEP8 issues

* Ensure 0 always maps to 0

* Ensure dtype is correct for output

* Fix incorrect dtype for array being relabeled

The docs specify that only integer arrays are supported. This change
was added in gh-811 without any discussion, so it seems safe to revert.

* Fix dtype of output and map

* Inverse map dtype should actually match the input dtype

* Fix docstring for map_array, check for contiguous out

* Add docstring for ArrayMap

* Rename attributes in ArrayMap

* Add C++0x standard to the compiler options

* added missing paren in docstring

* Mention large size of returned array in the __array__ docstring

* adjusted sample array to fix doctest

Co-authored-by: Juan Nunez-Iglesias <juan.nunez-iglesias@monash.edu>

* added tests for dtype overflow

* test relabel_seq... works for large input

* Simplify checking of contiguous array

Co-Authored-By: Riadh Fezzani <rfezzani@gmail.com>

* Minor code updates from review

Co-Authored-By: Riadh Fezzani <rfezzani@gmail.com>

* Restore reshaping of output array when provided

* Add note about -std=c++0x flag

* Ensure reshape is zero-copy

* Clarify error message when output has incorrect shape

* Remove unused numpy import

* Update relabel_sequential docstring to indicate new output types

* Raise error if the input array has non-integer type

* Ensure arrays being mapped and value arrays have same dtype

* Add doctests for ArrayMap.__str__ and fix

* Rework logic for in-place reshaping

* Add test for error when labels dtype is not int

* Add test for incorrect map_array output shape

* Test discontiguous output array

* Add test for long reprs for ArrayMap

* Add test for calling ArrayMap

* Remove long-deprecated relabel_from_one

* Add len to ArrayMap

* Add support for setitem and scalar mapping

* Add support for slicing and updating map in-place

* Allow bool indices in arraymap getitem

Co-authored-by: Volker Hilsenstein <volker.hilsenstein@monash.edu>
Co-authored-by: Juan Nunez-Iglesias <juan.nunez-iglesias@monash.edu>
Co-authored-by: Riadh Fezzani <rfezzani@gmail.com>
4 people committed May 4, 2020
1 parent 06aa947 commit 4bff390
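
A rough usage sketch of the headline change (illustrative only, not part of the committed test suite): relabel_sequential now returns lightweight ArrayMap objects instead of dense lookup arrays, so a label value far larger than the number of labels no longer forces allocation of a map with max_label + 1 entries.

import numpy as np
from skimage.segmentation import relabel_sequential

# One label value is huge; the previous implementation would have tried
# to allocate a forward map with 10**12 + 1 entries (~8 TB of int64).
labels = np.array([0, 1, 10**12], dtype=np.int64)

relabeled, fw, inv = relabel_sequential(labels, offset=1)
print(relabeled)    # [0 1 2]
print(fw[10**12])   # 2 -- ArrayMap lookup, no dense table is built
print(inv[2])       # 1000000000000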
Showing 5 changed files with 364 additions and 38 deletions.
3 changes: 1 addition & 2 deletions skimage/segmentation/__init__.py
@@ -5,7 +5,7 @@
from ._quickshift import quickshift
from .boundaries import find_boundaries, mark_boundaries
from ._clear_border import clear_border
from ._join import join_segmentations, relabel_from_one, relabel_sequential
from ._join import join_segmentations, relabel_sequential
from ._watershed import watershed
from ._chan_vese import chan_vese
from .morphsnakes import (morphological_geodesic_active_contour,
@@ -24,7 +24,6 @@
'mark_boundaries',
'clear_border',
'join_segmentations',
'relabel_from_one',
'relabel_sequential',
'watershed',
'chan_vese',
265 changes: 232 additions & 33 deletions skimage/segmentation/_join.py
@@ -1,5 +1,6 @@
import numpy as np
from .._shared.utils import deprecated
from ._remap import _map_array


def join_segmentations(s1, s2):
@@ -43,15 +44,6 @@ def join_segmentations(s1, s2):
return j


@deprecated('relabel_sequential')
def relabel_from_one(label_field):
"""Convert labels in an arbitrary label field to {1, ... number_of_labels}.
This function is deprecated, see ``relabel_sequential`` for more.
"""
return relabel_sequential(label_field, offset=1)


def relabel_sequential(label_field, offset=1):
"""Relabel arbitrary labels to {`offset`, ... `offset` + number_of_labels}.
@@ -74,14 +66,14 @@ def relabel_sequential(label_field, offset=1):
{offset, ..., number_of_labels + offset - 1}.
The data type will be the same as `label_field`, except when
offset + number_of_labels causes overflow of the current data type.
forward_map : numpy array of int, shape ``(label_field.max() + 1,)``
forward_map : ArrayMap
The map from the original label space to the returned label
space. Can be used to re-apply the same mapping. See examples
for usage. The data type will be the same as `relabeled`.
inverse_map : 1D numpy array of int, of length offset + number of labels
for usage. The output data type will be the same as `relabeled`.
inverse_map : ArrayMap
The map from the new label space to the original space. This
can be used to reconstruct the original label field from the
relabeled one. The data type will be the same as `relabeled`.
relabeled one. The output data type will be the same as `label_field`.
Notes
-----
@@ -100,13 +92,20 @@
>>> relab, fw, inv = relabel_sequential(label_field)
>>> relab
array([1, 1, 2, 2, 3, 5, 4])
>>> fw
>>> print(fw)
ArrayMap:
1 → 1
5 → 2
8 → 3
42 → 4
99 → 5
>>> np.array(fw)
array([0, 1, 0, 0, 0, 2, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5])
>>> inv
>>> np.array(inv)
array([ 0, 1, 5, 8, 42, 99])
>>> (fw[label_field] == relab).all()
True
@@ -116,26 +115,226 @@
>>> relab
array([5, 5, 6, 6, 7, 9, 8])
"""
offset = int(offset)
if offset <= 0:
raise ValueError("Offset must be strictly positive.")
if np.min(label_field) < 0:
raise ValueError("Cannot relabel array that contains negative values.")
max_label = int(label_field.max()) # Ensure max_label is an integer
if not np.issubdtype(label_field.dtype, np.integer):
new_type = np.min_scalar_type(max_label)
label_field = label_field.astype(new_type)
labels = np.unique(label_field)
labels0 = labels[labels != 0]
new_max_label = offset - 1 + len(labels0)
new_labels0 = np.arange(offset, new_max_label + 1)
output_type = label_field.dtype
required_type = np.min_scalar_type(new_max_label)
if np.dtype(required_type).itemsize > np.dtype(label_field.dtype).itemsize:
offset = int(offset)
in_vals = np.unique(label_field)
if in_vals[0] == 0:
# always map 0 to 0
out_vals = np.concatenate(
[[0], np.arange(offset, offset+len(in_vals)-1)]
)
else:
out_vals = np.arange(offset, offset+len(in_vals))
input_type = label_field.dtype

# Some logic to determine the output type:
# - we don't want to return a smaller output type than the input type,
# ie if we get uint32 as labels input, don't return a uint8 array.
# - but, in some cases, using the input type could result in overflow. The
# input type could be a signed integer (e.g. int32) but
# `np.min_scalar_type` will always return an unsigned type. We check for
# that by casting the largest output value to the input type. If it is
# unchanged, we use the input type, else we use the unsigned minimum
# required type
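# For a concrete (illustrative) example: uint8 labels whose new maximum
# label is 300 get promoted, because np.min_scalar_type(300) is uint16,
# while int32 labels keep their type as long as the new maximum still
# fits in int32.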
required_type = np.min_scalar_type(out_vals[-1])
if input_type.itemsize < required_type.itemsize:
output_type = required_type
forward_map = np.zeros(max_label + 1, dtype=output_type)
forward_map[labels0] = new_labels0
inverse_map = np.zeros(new_max_label + 1, dtype=output_type)
inverse_map[offset:] = labels0
relabeled = forward_map[label_field]
return relabeled, forward_map, inverse_map
else:
if input_type.type(out_vals[-1]) == out_vals[-1]:
output_type = input_type
else:
output_type = required_type
out_array = np.empty(label_field.shape, dtype=output_type)
out_vals = out_vals.astype(output_type)
map_array(label_field, in_vals, out_vals, out=out_array)
fw_map = ArrayMap(in_vals, out_vals)
inv_map = ArrayMap(out_vals, in_vals)
return out_array, fw_map, inv_map


def map_array(input_arr, input_vals, output_vals, out=None):
"""Map values from input array from input_vals to output_vals.
Parameters
----------
input_arr : array of int, shape (M[, N][, P][, ...])
The input label image.
input_vals : array of int, shape (N,)
The values to map from.
output_vals : array, shape (N,)
The values to map to.
out : array, same shape as `input_arr`
The output array. Will be created if not provided. It should
have the same dtype as `output_vals`.
Returns
-------
out : array, same shape as `input_arr`
The array of mapped values.
"""

if not np.issubdtype(input_arr.dtype, np.integer):
raise TypeError(
'The dtype of an array to be remapped should be integer.'
)
# We ravel the input array for simplicity of iteration in Cython:
orig_shape = input_arr.shape
# NumPy docs for `np.ravel()` say:
# "When a view is desired in as many cases as possible,
# arr.reshape(-1) may be preferable."
input_arr = input_arr.reshape(-1)
if out is None:
out = np.empty(orig_shape, dtype=output_vals.dtype)
elif out.shape != orig_shape:
raise ValueError(
'If out array is provided, it should have the same shape as '
f'the input array. Input array has shape {orig_shape}, provided '
f'output array has shape {out.shape}.'
)
try:
out_view = out.view()
out_view.shape = (-1,) # no-copy reshape/ravel
except AttributeError: # if out strides are not compatible with 0-copy
raise ValueError(
'If out array is provided, it should be either contiguous '
f'or 1-dimensional. Got array with shape {out.shape} and '
f'strides {out.strides}.'
)

# ensure all arrays have matching types before sending to Cython
input_vals = input_vals.astype(input_arr.dtype, copy=False)
output_vals = output_vals.astype(out.dtype, copy=False)
_map_array(input_arr, out_view, input_vals, output_vals)
return out
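
As an aside on the contiguity check above (a standalone sketch of the NumPy behaviour it relies on, not code from this commit): assigning to .shape on a view never copies, and raises AttributeError when a zero-copy reshape is impossible, which is exactly what the try/except catches.

import numpy as np

a = np.empty((4, 3))
v = a.view()
v.shape = (-1,)                 # zero-copy flatten: works, a is contiguous

b = np.empty((4, 6))[:, ::2]    # non-contiguous view of every other column
try:
    bv = b.view()
    bv.shape = (-1,)            # incompatible strides -> AttributeError
except AttributeError:
    print('cannot flatten this array without copying')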


class ArrayMap:
"""Class designed to mimic mapping by NumPy array indexing.
This class is designed to replicate the use of NumPy arrays for mapping
values with indexing:
>>> values = np.array([0.25, 0.5, 1.0])
>>> indices = np.array([[0, 0, 1], [2, 2, 1]])
>>> values[indices]
array([[0.25, 0.25, 0.5 ],
[1. , 1. , 0.5 ]])
The issue with this indexing is that you need a very large ``values``
array if the values in the ``indices`` array are large.
>>> values = np.array([0.25, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1.0])
>>> indices = np.array([[0, 0, 10], [0, 10, 10]])
>>> values[indices]
array([[0.25, 0.25, 1. ],
[0.25, 1. , 1. ]])
Using this class, the approach is similar, but there is no need to
create a large values array:
>>> in_indices = np.array([0, 10])
>>> out_values = np.array([0.25, 1.0])
>>> values = ArrayMap(in_indices, out_values)
>>> values
ArrayMap(array([ 0, 10]), array([0.25, 1. ]))
>>> print(values)
ArrayMap:
0 → 0.25
10 → 1.0
>>> indices = np.array([[0, 0, 10], [0, 10, 10]])
>>> values[indices]
array([[0.25, 0.25, 1. ],
[0.25, 1. , 1. ]])
Parameters
----------
in_values : array of int, shape (N,)
The source values from which to map.
out_values : array, shape (N,)
The destination values to which to map.
"""
def __init__(self, in_values, out_values):
self.in_values = in_values
self.out_values = out_values
self._max_str_lines = 4
self._array = None

def __len__(self):
"""Return one more than the maximum label value being remapped."""
return np.max(self.in_values) + 1

def __array__(self, dtype=None):
"""Return an array that behaves like the arraymap when indexed.
This array can be very large: it is the size of the largest value
in the ``in_values`` array, plus one.
"""
if dtype is None:
dtype = self.out_values.dtype
output = np.zeros(np.max(self.in_values) + 1, dtype=dtype)
output[self.in_values] = self.out_values
return output

@property
def dtype(self):
return self.out_values.dtype

def __repr__(self):
return f'ArrayMap({repr(self.in_values)}, {repr(self.out_values)})'

def __str__(self):
if len(self.in_values) <= self._max_str_lines + 1:
rows = range(len(self.in_values))
string = '\n'.join(
['ArrayMap:'] +
[f' {self.in_values[i]}{self.out_values[i]}' for i in rows]
)
else:
rows0 = list(range(0, self._max_str_lines // 2))
rows1 = list(range(-self._max_str_lines // 2, 0))
string = '\n'.join(
['ArrayMap:'] +
[f' {self.in_values[i]}{self.out_values[i]}'
for i in rows0] +
[' ...'] +
[f' {self.in_values[i]}{self.out_values[i]}'
for i in rows1]
)
return string

def __call__(self, arr):
return self.__getitem__(arr)

def __getitem__(self, index):
scalar = np.isscalar(index)
if scalar:
index = np.array([index])
elif isinstance(index, slice):
start = index.start or 0 # treat None or 0 the same way
stop = (index.stop
if index.stop is not None
else len(self))
step = index.step
index = np.arange(start, stop, step)
if index.dtype == bool:
index = np.flatnonzero(index)

out = map_array(
index,
self.in_values.astype(index.dtype, copy=False),
self.out_values,
)

if scalar:
out = out[0]
return out

def __setitem__(self, indices, values):
if self._array is None:
self._array = self.__array__()
self._array[indices] = values
self.in_values = np.flatnonzero(self._array)
self.out_values = self._array[self.in_values]
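
To make the new public surface above concrete, a small usage sketch (illustrative only; note that in this commit map_array and ArrayMap live in skimage.segmentation._join and are not re-exported from skimage.segmentation):

import numpy as np
from skimage.segmentation._join import ArrayMap, map_array

labels = np.array([[0, 5, 5],
                   [9, 9, 0]], dtype=np.int64)

# Remap 5 -> 1 and 9 -> 2 without building a dense lookup table.
out = map_array(labels, np.array([0, 5, 9]), np.array([0, 1, 2]))

fw = ArrayMap(np.array([0, 5, 9]), np.array([0, 1, 2]))
print(fw[labels])      # same mapping applied with indexing syntax
print(fw(9))           # __call__ delegates to __getitem__ -> 2
print(len(fw))         # max input value + 1 -> 10
fw[7] = 3              # __setitem__ updates the mapping in place
print(np.array(fw))    # dense view: [0 0 0 0 0 1 0 3 0 2]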
24 changes: 24 additions & 0 deletions skimage/segmentation/_remap.pyx
@@ -0,0 +1,24 @@
# distutils: language = c++

from libcpp.unordered_map cimport unordered_map
cimport cython
from .._shared.fused_numerics cimport np_numeric, np_anyint

@cython.boundscheck(False) # Deactivate bounds checking
@cython.wraparound(False) # Deactivate negative indexing
def _map_array(np_anyint[:] inarr, np_numeric[:] outarr,
np_anyint[:] inval, np_numeric[:] outval):
# build the map from the input and output vectors
cdef size_t i, n_map, n_array
cdef unordered_map[np_anyint, np_numeric] lut
n_map = inval.shape[0]
for i in range(n_map):
lut[inval[i]] = outval[i]
# apply the map to the array
n_array = inarr.shape[0]
# The prange option gave some compilation warnings
# "Unsigned index type not allowed before OpenMP 3.0"
# and didn't seem to be any faster
# for i in prange(n_array, nogil=True): #
for i in range(n_array):
outarr[i] = lut[inarr[i]]
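
For readers who do not read Cython, a rough pure-Python equivalent of the kernel above (a reference sketch only -- the real version uses typed memoryviews and a C++ std::unordered_map, and maps unseen input values to 0 instead of raising):

import numpy as np

def _map_array_reference(inarr, outarr, inval, outval):
    """Build a lookup table from inval -> outval, then apply it
    elementwise to the flat input, writing into the flat output."""
    lut = {int(k): v for k, v in zip(inval, outval)}
    for i in range(inarr.shape[0]):
        # A dict raises KeyError for unmapped values; the C++
        # unordered_map in the Cython kernel returns 0 instead.
        outarr[i] = lut[int(inarr[i])]

labels = np.array([1, 5, 5, 8])
out = np.empty_like(labels)
_map_array_reference(labels, out, np.array([1, 5, 8]), np.array([10, 20, 30]))
print(out)   # [10 20 20 30]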
9 changes: 8 additions & 1 deletion skimage/segmentation/setup.py
@@ -14,7 +14,8 @@ def configuration(parent_package='', top_path=None):
cython(['_watershed_cy.pyx',
'_felzenszwalb_cy.pyx',
'_quickshift_cy.pyx',
'_slic.pyx'], working_path=base_path)
'_slic.pyx',
'_remap.pyx'], working_path=base_path)
config.add_extension('_watershed_cy', sources=['_watershed_cy.c'],
include_dirs=[get_numpy_include_dirs()])
config.add_extension('_felzenszwalb_cy', sources=['_felzenszwalb_cy.c'],
@@ -23,6 +24,12 @@
include_dirs=[get_numpy_include_dirs()])
config.add_extension('_slic', sources=['_slic.c'],
include_dirs=[get_numpy_include_dirs()])
# note: the extra compiler flag -std=c++0x is needed to access the
# std::unordered_map container on some earlier gcc compilers. See:
# https://stackoverflow.com/a/3973692/224254
config.add_extension('_remap', sources='_remap.cpp',
include_dirs=[get_numpy_include_dirs()],
language='c++', extra_compile_args=['-std=c++0x'])

return config

