Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions src/ess/reduce/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@

import importlib.metadata

from . import nexus, uncertainty
from . import masking, nexus, uncertainty

try:
__version__ = importlib.metadata.version(__package__ or __name__)
Expand All @@ -12,4 +12,4 @@

del importlib

__all__ = ['nexus', 'uncertainty']
__all__ = ['masking', 'nexus', 'uncertainty']
86 changes: 86 additions & 0 deletions src/ess/reduce/masking.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
# SPDX-License-Identifier: BSD-3-Clause
# Copyright (c) 2024 Scipp contributors (https://github.com/scipp)

import uuid
from typing import Optional

import numpy as np
import scipp as sc


def mask_range(
da: sc.DataArray, mask: Optional[sc.DataArray], name: Optional[str] = None
) -> sc.DataArray:
"""
Mask a range on a data array.
The provided edges are used to define the ranges to be masked.

Parameters
----------
da:
The data array to be masked.
mask:
A data array defining the mask to be applied. Only one-dimensional masks are
supported. The data array should contain a bin-edge coordinate which represents
the edges of the ranges to be masked. The values of the data array represent the
mask values (``True`` or ``False``) inside each range defined by the coordinate.
name:
The name of the mask to be applied. If not provided, a random name will be used.

Returns
-------
:
A copy of the input data array with the mask applied.
"""
if mask is None:
return da
if name is None:
name = uuid.uuid4().hex
if name in da.masks:
raise ValueError(
f'Mask {name} already exists in data array and would be overwritten.'
)
dim = mask.dim
edges = mask.coords[dim]
if not mask.coords.is_edges(dim):
raise sc.DimensionError(
f'Coordinate {dim} must be bin-edges to mask a range, found midpoints.'
)
if (dim in da.coords) and (da.coords[dim].ndim > 1):
raise sc.DimensionError(
'Cannot mask range on data with multi-dimensional coordinate. '
f'Found dimensions {da.coords[dim].dims} for coordinate {dim}.'
)

coord = (
da.bins.constituents['data'].coords[dim]
if da.bins is not None
else da.coords[dim]
)
edges = edges.to(unit=coord.unit)
lu = sc.DataArray(data=mask.data, coords={dim: edges})
if da.bins is not None:
if dim not in da.coords:
underlying = da.bins.coords[dim]
new_bins = np.union1d(
edges.values,
np.array(
[
underlying.min().value,
np.nextafter(underlying.max().value, np.inf),
]
),
)
else:
new_bins = np.union1d(edges.values, da.coords[dim].values)
new_bins = sc.array(dims=[dim], values=new_bins, unit=edges.unit)
out = da.bin({dim: new_bins})
out.masks[name] = sc.lookup(lu, dim)[sc.midpoints(new_bins, dim=dim)]
else:
out = da.copy(deep=False)
mask_values = sc.lookup(lu, dim)[da.coords[dim]]
if da.coords.is_edges(dim):
out.masks[name] = mask_values[dim, 1:] | mask_values[dim, :-1]
else:
out.masks[name] = mask_values
return out
194 changes: 194 additions & 0 deletions tests/masking_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,194 @@
# SPDX-License-Identifier: BSD-3-Clause
# Copyright (c) 2023 Scipp contributors (https://github.com/scipp)

import numpy as np
import pytest
import scipp as sc

from ess.reduce.masking import mask_range


def test_mask_range_dense_data():
x = sc.arange('x', 5.0, unit='m')
da = sc.DataArray(data=x, coords={'x': x})
mask = sc.DataArray(
data=sc.array(dims=['x'], values=[True]),
coords={
'x': sc.array(dims=['x'], values=[1.9, 3.001], unit='m'),
},
)
masked = mask_range(da, mask=mask, name='mymask')
assert masked.masks['mymask'].values.tolist() == [False, False, True, True, False]


def test_mask_range_dense_data_bin_edges():
x = sc.arange('x', 6.0, unit='m')
da = sc.DataArray(data=x[:-1], coords={'x': x})
mask = sc.DataArray(
data=sc.array(dims=['x'], values=[True]),
coords={
'x': sc.array(dims=['x'], values=[1.9, 3.001], unit='m'),
},
)
masked = mask_range(da, mask=mask, name='mymask')
assert masked.masks['mymask'].values.tolist() == [False, True, True, True, False]


def test_mask_range_dense_data_two_ranges():
x = sc.arange('x', 11.0, unit='m')
da = sc.DataArray(data=x[:-1], coords={'x': x})
mask = sc.DataArray(
data=sc.array(dims=['x'], values=[True, False, True]),
coords={
'x': sc.array(dims=['x'], values=[1.9, 3.001, 6.5, 8.8], unit='m'),
},
)
masked = mask_range(da, mask=mask, name='mymask')
assert masked.masks['mymask'].values.tolist() == [
False,
True,
True,
True,
False,
False,
True,
True,
True,
False,
]


def test_mask_range_binned_data_no_prior_binning():
x = sc.arange('x', 5.0, unit='m')
da = sc.DataArray(data=x, coords={'x': x, 'y': x + x.max()})
binned = da.bin(y=2)
mask = sc.DataArray(
data=sc.array(dims=['x'], values=[True]),
coords={
'x': sc.array(dims=['x'], values=[1.9, 3.001], unit='m'),
},
)
masked = mask_range(binned, mask=mask, name='mymask')
# This should make 3 bins that span the entire data range, and where the middle
# one is masked.
assert masked.masks['mymask'].values.tolist() == [False, True, False]
binned_full_range = binned.bin(x=1)
assert sc.identical(masked.coords['x'][0], binned_full_range.coords['x'][0])
assert sc.identical(masked.coords['x'][1:3], mask.coords['x'])
assert sc.identical(masked.coords['x'][3], binned_full_range.coords['x'][1])


def test_mask_range_binned_data_has_prior_single_bin():
x = sc.arange('x', 5.0, unit='m')
da = sc.DataArray(data=x, coords={'x': x, 'y': x + x.max()})
binned = da.bin(x=1)
mask = sc.DataArray(
data=sc.array(dims=['x'], values=[True]),
coords={
'x': sc.array(dims=['x'], values=[1.5, 3.001], unit='m'),
},
)
masked = mask_range(binned, mask=mask, name='mymask')
assert masked.masks['mymask'].values.tolist() == [False, True, False]
assert np.allclose(masked.coords['x'].values, [0.0, 1.5, 3.001, 4.0])


def test_mask_range_binned_data_has_prior_multiple_bins():
x = sc.arange('x', 5.0, unit='m')
da = sc.DataArray(data=x, coords={'x': x, 'y': x + x.max()})
binned = da.bin(x=2)
mask = sc.DataArray(
data=sc.array(dims=['x'], values=[True]),
coords={
'x': sc.array(dims=['x'], values=[1.5, 3.001], unit='m'),
},
)
masked = mask_range(binned, mask=mask, name='mymask')
assert masked.masks['mymask'].values.tolist() == [False, True, True, False]
assert np.allclose(masked.coords['x'].values, [0.0, 1.5, 2.0, 3.001, 4.0])


def test_mask_range_binned_data_has_already_same_edge_as_mask():
x = sc.arange('x', 5.0, unit='m')
da = sc.DataArray(data=x, coords={'x': x, 'y': x + x.max()})
binned = da.bin(x=2)
mask = sc.DataArray(
data=sc.array(dims=['x'], values=[True]),
coords={
'x': sc.array(
dims=['x'], values=[binned.coords['x'][1].value, 3.001], unit='m'
),
},
)
masked = mask_range(binned, mask=mask, name='mymask')
assert masked.masks['mymask'].values.tolist() == [False, True, False]
assert np.allclose(masked.coords['x'].values, [0.0, 2.0, 3.001, 4.0])


def test_mask_range_with_midpoints_coord_raises():
x = sc.arange('x', 5.0, unit='m')
da = sc.DataArray(data=x, coords={'x': x})
mask = sc.DataArray(
data=sc.array(dims=['x'], values=[True, False]),
coords={
'x': sc.array(dims=['x'], values=[1.9, 3.001], unit='m'),
},
)
with pytest.raises(
sc.DimensionError, match='Coordinate x must be bin-edges to mask a range'
):
_ = mask_range(da, mask=mask, name='mymask')


def test_mask_range_on_dense_data_with_two_dimensional_coord_raises():
xy = sc.arange('x', 20.0, unit='m').fold(dim='x', sizes={'y': 4, 'x': 5})
da = sc.DataArray(data=xy, coords={'x': xy})
mask = sc.DataArray(
data=sc.array(dims=['x'], values=[True]),
coords={
'x': sc.array(dims=['x'], values=[1.9, 3.001], unit='m'),
},
)
with pytest.raises(
sc.DimensionError,
match='Cannot mask range on data with multi-dimensional coordinate',
):
_ = mask_range(da, mask=mask, name='mymask')


def test_mask_range_on_binned_data_with_two_dimensional_coord_raises():
x = sc.arange('x', 10.0, unit='m')
da = sc.DataArray(data=x, coords={'x': x, 'y': x * 1.5})
binned = da.bin(x=2, y=2)
binned.coords['x'] = sc.broadcast(binned.coords['x'], sizes={'x': 3, 'y': 2})
mask = sc.DataArray(
data=sc.array(dims=['x'], values=[True]),
coords={
'x': sc.array(dims=['x'], values=[1.9, 3.001], unit='m'),
},
)
with pytest.raises(
sc.DimensionError,
match='Cannot mask range on data with multi-dimensional coordinate',
):
_ = mask_range(binned, mask=mask, name='mymask')


def test_mask_range_on_data_with_existing_mask_of_same_name_raises():
x = sc.arange('x', 10.0, unit='m')
da = sc.DataArray(
data=x,
coords={'x': x, 'y': x * 1.5},
masks={'mymask': x > sc.scalar(5.0, unit='m')},
)
mask = sc.DataArray(
data=sc.array(dims=['x'], values=[True]),
coords={
'x': sc.array(dims=['x'], values=[1.9, 3.001], unit='m'),
},
)
with pytest.raises(
ValueError,
match='Mask mymask already exists in data array and would be overwritten',
):
_ = mask_range(da, mask=mask, name='mymask')