Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix bug in linspace optimization of bin and hist #2923

Merged
merged 6 commits into from Jan 5, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
8 changes: 8 additions & 0 deletions docs/about/release-notes.rst
Expand Up @@ -35,6 +35,14 @@ Release Notes
and Jan-Lukas Wynen :sup:`a`
v22.11.1 (January 2023)
-----------------------

Bugfixes
~~~~~~~~

* Fix a bug in :py:func:`scipp.hist` and :py:func:`scipp.bin`, leading to assignment of records with very large coord values outside the bin boundaries to a bin `#2923 <https://github.com/scipp/scipp/pull/2923>`_.

v22.11.0 (November 2022)
------------------------

Expand Down
51 changes: 24 additions & 27 deletions lib/core/include/scipp/core/element/bin.h
Expand Up @@ -40,6 +40,8 @@ static constexpr auto update_indices_by_binning = overloaded{
update_indices_by_binning_arg<int32_t, int32_t, double>,
update_indices_by_binning_arg<int64_t, float, double>,
update_indices_by_binning_arg<int32_t, float, double>,
update_indices_by_binning_arg<int64_t, double, float>,
update_indices_by_binning_arg<int32_t, double, float>,
update_indices_by_binning_arg<int64_t, int32_t, int64_t>,
update_indices_by_binning_arg<int32_t, int32_t, int64_t>>,
[](units::Unit &indices, const units::Unit &coord,
Expand All @@ -51,31 +53,20 @@ static constexpr auto update_indices_by_binning = overloaded{
transform_flags::expect_no_variance_arg<2>};

// Special faster implementation for linear bins.
static constexpr auto update_indices_by_binning_linspace =
overloaded{update_indices_by_binning,
[](auto &index, const auto &x, const auto &edges) {
if (index == -1)
return;
const auto [offset, nbin, scale] =
core::linear_edge_params(edges);
using Index = std::decay_t<decltype(index)>;
Index bin = (x - offset) * scale;
bin = std::clamp(bin, Index(0), Index(nbin - 1));
index *= nbin;
if (x < edges[bin]) {
if (bin != 0 && x >= edges[bin - 1])
index += bin - 1;
else
index = -1;
} else if (x >= edges[bin + 1]) {
if (bin != nbin - 1)
index += bin + 1;
else
index = -1;
} else {
index += bin;
}
}};
static constexpr auto update_indices_by_binning_linspace = overloaded{
update_indices_by_binning,
[](auto &index, const auto &x, const auto &edges) {
if (index == -1)
return;
using Index = std::decay_t<decltype(index)>;
const auto params = core::linear_edge_params(edges);
if (const auto bin = get_bin<Index>(x, edges, params); bin < 0) {
index = -1;
} else {
index *= std::get<1>(params); // nbin
index += bin;
}
}};

static constexpr auto update_indices_by_binning_sorted_edges =
overloaded{update_indices_by_binning,
Expand Down Expand Up @@ -111,9 +102,9 @@ static constexpr auto groups_to_map = overloaded{
return index;
}};

template <class Index, class T>
template <class Index, class Coord, class Edges = Coord>
using update_indices_by_grouping_arg =
std::tuple<Index, T, std::unordered_map<T, Index>>;
std::tuple<Index, Coord, std::unordered_map<Edges, Index>>;

static constexpr auto update_indices_by_grouping = overloaded{
element::arg_list<update_indices_by_grouping_arg<int64_t, double>,
Expand All @@ -122,6 +113,12 @@ static constexpr auto update_indices_by_grouping = overloaded{
update_indices_by_grouping_arg<int32_t, float>,
update_indices_by_grouping_arg<int64_t, int64_t>,
update_indices_by_grouping_arg<int32_t, int64_t>,
// Given int32 target groups, select from int64. Note that
// we do not support the reverse for now, since the
// `groups.find(x)` below would then have to cast to a
// lower precision, i.e., we would need special handling.
update_indices_by_grouping_arg<int64_t, int64_t, int32_t>,
update_indices_by_grouping_arg<int32_t, int64_t, int32_t>,
update_indices_by_grouping_arg<int64_t, int32_t>,
update_indices_by_grouping_arg<int32_t, int32_t>,
update_indices_by_grouping_arg<int64_t, bool>,
Expand Down
15 changes: 4 additions & 11 deletions lib/core/include/scipp/core/element/histogram.h
Expand Up @@ -46,6 +46,7 @@ static constexpr auto histogram = overloaded{
histogram_detail::args<float, int32_t, float, int32_t>,
histogram_detail::args<double, double, double, double>,
histogram_detail::args<double, float, double, double>,
histogram_detail::args<double, double, double, float>,
histogram_detail::args<double, float, double, float>,
histogram_detail::args<double, double, float, double>,
histogram_detail::args<double, int64_t, double, int64_t>,
Expand All @@ -62,20 +63,12 @@ static constexpr auto histogram = overloaded{
// Special implementation for linear bins. Gives a 1x to 20x speedup
// for few and many events per histogram, respectively.
if (scipp::numeric::islinspace(edges)) {
const auto [offset, nbin, scale] = core::linear_edge_params(edges);
const auto params = core::linear_edge_params(edges);
for (scipp::index i = 0; i < scipp::size(events); ++i) {
const auto x = events[i];
scipp::index bin = (x - offset) * scale;
bin = std::clamp(bin, scipp::index(0), scipp::index(nbin - 1));
if (x < edges[bin]) {
if (bin != 0 && x >= edges[bin - 1])
iadd(data, bin - 1, weights, i);
} else if (x >= edges[bin + 1]) {
if (bin != nbin - 1)
iadd(data, bin + 1, weights, i);
} else {
if (const auto bin = get_bin<scipp::index>(x, edges, params);
bin >= 0)
iadd(data, bin, weights, i);
}
}
} else {
core::expect::histogram::sorted_edges(edges);
Expand Down
18 changes: 18 additions & 0 deletions lib/core/include/scipp/core/histogram.h
Expand Up @@ -27,4 +27,22 @@ template <class T> void sorted_edges(const T &edges) {
}
} // namespace expect::histogram

template <class Index, class T, class Edges, class Params>
Index get_bin(const T &x, const Edges &edges, const Params &params) {
// Explicitly check for x outside edges here as otherwise we may run into an
// integer overflow when converting the "bin" computation result to `Index`.
if (x < edges.front() || x >= edges.back())
return -1;
const auto [offset, nbin, scale] = params;
Index bin = (x - offset) * scale;
bin = std::clamp(bin, Index(0), Index(nbin - 1));
if (x < edges[bin]) {
return bin - 1;
} else if (x >= edges[bin + 1]) {
return bin + 1;
} else {
return bin;
}
}

} // namespace scipp::core
65 changes: 65 additions & 0 deletions tests/binning_test.py
Expand Up @@ -760,3 +760,68 @@ def test_make_binned_via_bin_optimized_path_yields_equivalent_results(params):
expected = expected.bin(sizes)
expected = expected.bin(binning).hist()
assert sc.identical(result.hist(), expected)


def test_bin_linspace_handles_large_positive_values_correctly():
table = sc.data.table_xyz(10)
table.coords['x'].values[0] = 1e20
x = sc.linspace('x', 0.0, 1.0, 3, unit='m', dtype='float64')
da = table.bin(x=x)
assert sc.identical(da, table[1:].bin(x=x))
assert da.bins.size().sum().value == 9


def test_bin_linspace_handles_large_negative_values_correctly():
table = sc.data.table_xyz(10)
table.coords['x'].values[0] = -1e20
x = sc.linspace('x', 0.0, 1.0, 3, unit='m', dtype='float64')
da = table.bin(x=x)
assert sc.identical(da, table[1:].bin(x=x))
assert da.bins.size().sum().value == 9


def test_hist_linspace_handles_large_positive_values_correctly():
table = sc.data.table_xyz(10)
table.values[...] = 1.0
table.coords['x'].values[0] = 1e20
x = sc.linspace('x', 0.0, 1.0, 3, unit='m', dtype='float64')
da = table.hist(x=x)
assert sc.identical(da, table[1:].hist(x=x))
assert da.sum().value == 9


def test_hist_linspace_handles_large_negative_values_correctly():
table = sc.data.table_xyz(10)
table.values[...] = 1.0
table.coords['x'].values[0] = -1e20
x = sc.linspace('x', 0.0, 1.0, 3, unit='m', dtype='float64')
da = table.hist(x=x)
assert sc.identical(da, table[1:].hist(x=x))
assert da.sum().value == 9


def test_group_with_explicit_lower_precision_drops_rows_outside_domain():
table = sc.data.table_xyz(100)
table.coords['label'] = (table.coords['x'] * 10).to(dtype='int64')
table.coords['label'].values[0] = 0
da = table.group(sc.arange('label', 5, unit='m', dtype='int32'))
size0 = da.bins.size()['label', 0].value
size = da.bins.size().sum().value
table.coords['label'].values[0] = np.iinfo(np.int32).max + 100
da = table.group(sc.arange('label', 5, unit='m', dtype='int32'))
reference = table[1:].group(sc.arange('label', 5, unit='m', dtype='int32'))
assert sc.identical(da, reference)
assert da.bins.size()['label', 0].value == size0 - 1
assert da.bins.size().sum().value == size - 1


def test_bin_with_explicit_lower_precision_drops_rows_outside_domain():
table = sc.data.table_xyz(100)
x = sc.linspace('x', 0.0, 1.0, 3, unit='m', dtype='float32')
da = table.bin(x=x)
size = da.bins.size().sum().value
table.coords['x'].values[0] = 2.0 * np.finfo(np.float32).max
da = table.bin(x=x)
reference = table[1:].bin(x=x)
assert sc.identical(da, reference)
assert da.bins.size().sum().value == size - 1
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

None of these tests check that the correct element was dropped, only that the size was reduced by 1. Instead, you could slice out the bad element and group/bin/hist the slice and use that as the expected result.