diff --git a/src/ess/reduce/__init__.py b/src/ess/reduce/__init__.py index 28b55d7c..79986007 100644 --- a/src/ess/reduce/__init__.py +++ b/src/ess/reduce/__init__.py @@ -3,7 +3,7 @@ import importlib.metadata -from . import nexus, uncertainty +from . import masking, nexus, uncertainty try: __version__ = importlib.metadata.version(__package__ or __name__) @@ -12,4 +12,4 @@ del importlib -__all__ = ['nexus', 'uncertainty'] +__all__ = ['masking', 'nexus', 'uncertainty'] diff --git a/src/ess/reduce/masking.py b/src/ess/reduce/masking.py new file mode 100644 index 00000000..ba8232ea --- /dev/null +++ b/src/ess/reduce/masking.py @@ -0,0 +1,86 @@ +# SPDX-License-Identifier: BSD-3-Clause +# Copyright (c) 2024 Scipp contributors (https://github.com/scipp) + +import uuid +from typing import Optional + +import numpy as np +import scipp as sc + + +def mask_range( + da: sc.DataArray, mask: Optional[sc.DataArray], name: Optional[str] = None +) -> sc.DataArray: + """ + Mask a range on a data array. + The provided edges are used to define the ranges to be masked. + + Parameters + ---------- + da: + The data array to be masked. + mask: + A data array defining the mask to be applied. Only one-dimensional masks are + supported. The data array should contain a bin-edge coordinate which represents + the edges of the ranges to be masked. The values of the data array represent the + mask values (``True`` or ``False``) inside each range defined by the coordinate. + name: + The name of the mask to be applied. If not provided, a random name will be used. + + Returns + ------- + : + A copy of the input data array with the mask applied. + """ + if mask is None: + return da + if name is None: + name = uuid.uuid4().hex + if name in da.masks: + raise ValueError( + f'Mask {name} already exists in data array and would be overwritten.' + ) + dim = mask.dim + edges = mask.coords[dim] + if not mask.coords.is_edges(dim): + raise sc.DimensionError( + f'Coordinate {dim} must be bin-edges to mask a range, found midpoints.' + ) + if (dim in da.coords) and (da.coords[dim].ndim > 1): + raise sc.DimensionError( + 'Cannot mask range on data with multi-dimensional coordinate. ' + f'Found dimensions {da.coords[dim].dims} for coordinate {dim}.' + ) + + coord = ( + da.bins.constituents['data'].coords[dim] + if da.bins is not None + else da.coords[dim] + ) + edges = edges.to(unit=coord.unit) + lu = sc.DataArray(data=mask.data, coords={dim: edges}) + if da.bins is not None: + if dim not in da.coords: + underlying = da.bins.coords[dim] + new_bins = np.union1d( + edges.values, + np.array( + [ + underlying.min().value, + np.nextafter(underlying.max().value, np.inf), + ] + ), + ) + else: + new_bins = np.union1d(edges.values, da.coords[dim].values) + new_bins = sc.array(dims=[dim], values=new_bins, unit=edges.unit) + out = da.bin({dim: new_bins}) + out.masks[name] = sc.lookup(lu, dim)[sc.midpoints(new_bins, dim=dim)] + else: + out = da.copy(deep=False) + mask_values = sc.lookup(lu, dim)[da.coords[dim]] + if da.coords.is_edges(dim): + out.masks[name] = mask_values[dim, 1:] | mask_values[dim, :-1] + else: + out.masks[name] = mask_values + return out diff --git a/tests/masking_test.py b/tests/masking_test.py new file mode 100644 index 00000000..187cef6f --- /dev/null +++ b/tests/masking_test.py @@ -0,0 +1,194 @@ +# SPDX-License-Identifier: BSD-3-Clause +# Copyright (c) 2023 Scipp contributors (https://github.com/scipp) + +import numpy as np +import pytest +import scipp as sc + +from ess.reduce.masking import mask_range + + +def test_mask_range_dense_data(): + x = sc.arange('x', 5.0, unit='m') + da = sc.DataArray(data=x, coords={'x': x}) + mask = sc.DataArray( + data=sc.array(dims=['x'], values=[True]), + coords={ + 'x': sc.array(dims=['x'], values=[1.9, 3.001], unit='m'), + }, + ) + masked = mask_range(da, mask=mask, name='mymask') + assert masked.masks['mymask'].values.tolist() == [False, False, True, True, False] + + +def test_mask_range_dense_data_bin_edges(): + x = sc.arange('x', 6.0, unit='m') + da = sc.DataArray(data=x[:-1], coords={'x': x}) + mask = sc.DataArray( + data=sc.array(dims=['x'], values=[True]), + coords={ + 'x': sc.array(dims=['x'], values=[1.9, 3.001], unit='m'), + }, + ) + masked = mask_range(da, mask=mask, name='mymask') + assert masked.masks['mymask'].values.tolist() == [False, True, True, True, False] + + +def test_mask_range_dense_data_two_ranges(): + x = sc.arange('x', 11.0, unit='m') + da = sc.DataArray(data=x[:-1], coords={'x': x}) + mask = sc.DataArray( + data=sc.array(dims=['x'], values=[True, False, True]), + coords={ + 'x': sc.array(dims=['x'], values=[1.9, 3.001, 6.5, 8.8], unit='m'), + }, + ) + masked = mask_range(da, mask=mask, name='mymask') + assert masked.masks['mymask'].values.tolist() == [ + False, + True, + True, + True, + False, + False, + True, + True, + True, + False, + ] + + +def test_mask_range_binned_data_no_prior_binning(): + x = sc.arange('x', 5.0, unit='m') + da = sc.DataArray(data=x, coords={'x': x, 'y': x + x.max()}) + binned = da.bin(y=2) + mask = sc.DataArray( + data=sc.array(dims=['x'], values=[True]), + coords={ + 'x': sc.array(dims=['x'], values=[1.9, 3.001], unit='m'), + }, + ) + masked = mask_range(binned, mask=mask, name='mymask') + # This should make 3 bins that span the entire data range, and where the middle + # one is masked. + assert masked.masks['mymask'].values.tolist() == [False, True, False] + binned_full_range = binned.bin(x=1) + assert sc.identical(masked.coords['x'][0], binned_full_range.coords['x'][0]) + assert sc.identical(masked.coords['x'][1:3], mask.coords['x']) + assert sc.identical(masked.coords['x'][3], binned_full_range.coords['x'][1]) + + +def test_mask_range_binned_data_has_prior_single_bin(): + x = sc.arange('x', 5.0, unit='m') + da = sc.DataArray(data=x, coords={'x': x, 'y': x + x.max()}) + binned = da.bin(x=1) + mask = sc.DataArray( + data=sc.array(dims=['x'], values=[True]), + coords={ + 'x': sc.array(dims=['x'], values=[1.5, 3.001], unit='m'), + }, + ) + masked = mask_range(binned, mask=mask, name='mymask') + assert masked.masks['mymask'].values.tolist() == [False, True, False] + assert np.allclose(masked.coords['x'].values, [0.0, 1.5, 3.001, 4.0]) + + +def test_mask_range_binned_data_has_prior_multiple_bins(): + x = sc.arange('x', 5.0, unit='m') + da = sc.DataArray(data=x, coords={'x': x, 'y': x + x.max()}) + binned = da.bin(x=2) + mask = sc.DataArray( + data=sc.array(dims=['x'], values=[True]), + coords={ + 'x': sc.array(dims=['x'], values=[1.5, 3.001], unit='m'), + }, + ) + masked = mask_range(binned, mask=mask, name='mymask') + assert masked.masks['mymask'].values.tolist() == [False, True, True, False] + assert np.allclose(masked.coords['x'].values, [0.0, 1.5, 2.0, 3.001, 4.0]) + + +def test_mask_range_binned_data_has_already_same_edge_as_mask(): + x = sc.arange('x', 5.0, unit='m') + da = sc.DataArray(data=x, coords={'x': x, 'y': x + x.max()}) + binned = da.bin(x=2) + mask = sc.DataArray( + data=sc.array(dims=['x'], values=[True]), + coords={ + 'x': sc.array( + dims=['x'], values=[binned.coords['x'][1].value, 3.001], unit='m' + ), + }, + ) + masked = mask_range(binned, mask=mask, name='mymask') + assert masked.masks['mymask'].values.tolist() == [False, True, False] + assert np.allclose(masked.coords['x'].values, [0.0, 2.0, 3.001, 4.0]) + + +def test_mask_range_with_midpoints_coord_raises(): + x = sc.arange('x', 5.0, unit='m') + da = sc.DataArray(data=x, coords={'x': x}) + mask = sc.DataArray( + data=sc.array(dims=['x'], values=[True, False]), + coords={ + 'x': sc.array(dims=['x'], values=[1.9, 3.001], unit='m'), + }, + ) + with pytest.raises( + sc.DimensionError, match='Coordinate x must be bin-edges to mask a range' + ): + _ = mask_range(da, mask=mask, name='mymask') + + +def test_mask_range_on_dense_data_with_two_dimensional_coord_raises(): + xy = sc.arange('x', 20.0, unit='m').fold(dim='x', sizes={'y': 4, 'x': 5}) + da = sc.DataArray(data=xy, coords={'x': xy}) + mask = sc.DataArray( + data=sc.array(dims=['x'], values=[True]), + coords={ + 'x': sc.array(dims=['x'], values=[1.9, 3.001], unit='m'), + }, + ) + with pytest.raises( + sc.DimensionError, + match='Cannot mask range on data with multi-dimensional coordinate', + ): + _ = mask_range(da, mask=mask, name='mymask') + + +def test_mask_range_on_binned_data_with_two_dimensional_coord_raises(): + x = sc.arange('x', 10.0, unit='m') + da = sc.DataArray(data=x, coords={'x': x, 'y': x * 1.5}) + binned = da.bin(x=2, y=2) + binned.coords['x'] = sc.broadcast(binned.coords['x'], sizes={'x': 3, 'y': 2}) + mask = sc.DataArray( + data=sc.array(dims=['x'], values=[True]), + coords={ + 'x': sc.array(dims=['x'], values=[1.9, 3.001], unit='m'), + }, + ) + with pytest.raises( + sc.DimensionError, + match='Cannot mask range on data with multi-dimensional coordinate', + ): + _ = mask_range(binned, mask=mask, name='mymask') + + +def test_mask_range_on_data_with_existing_mask_of_same_name_raises(): + x = sc.arange('x', 10.0, unit='m') + da = sc.DataArray( + data=x, + coords={'x': x, 'y': x * 1.5}, + masks={'mymask': x > sc.scalar(5.0, unit='m')}, + ) + mask = sc.DataArray( + data=sc.array(dims=['x'], values=[True]), + coords={ + 'x': sc.array(dims=['x'], values=[1.9, 3.001], unit='m'), + }, + ) + with pytest.raises( + ValueError, + match='Mask mymask already exists in data array and would be overwritten', + ): + _ = mask_range(da, mask=mask, name='mymask')