scipp · SimonHeybrock · Jan 5, 2023 · Jan 4, 2023 · Jan 4, 2023 · Jan 4, 2023
diff --git a/docs/about/release-notes.rst b/docs/about/release-notes.rst
@@ -35,6 +35,14 @@ Release Notes
    and Jan-Lukas Wynen :sup:`a`
 
 
+v22.11.1 (January 2023)
+-----------------------
+
+Bugfixes
+~~~~~~~~
+
+* Fix a bug in :py:func:`scipp.hist` and :py:func:`scipp.bin`, leading to assignment of records with very large coord values outside the bin boundaries to a bin `#2923 <https://github.com/scipp/scipp/pull/2923>`_.
+
 v22.11.0 (November 2022)
 ------------------------
 

diff --git a/lib/core/include/scipp/core/element/bin.h b/lib/core/include/scipp/core/element/bin.h
@@ -40,6 +40,8 @@ static constexpr auto update_indices_by_binning = overloaded{
                       update_indices_by_binning_arg<int32_t, int32_t, double>,
                       update_indices_by_binning_arg<int64_t, float, double>,
                       update_indices_by_binning_arg<int32_t, float, double>,
+                      update_indices_by_binning_arg<int64_t, double, float>,
+                      update_indices_by_binning_arg<int32_t, double, float>,
                       update_indices_by_binning_arg<int64_t, int32_t, int64_t>,
                       update_indices_by_binning_arg<int32_t, int32_t, int64_t>>,
     [](units::Unit &indices, const units::Unit &coord,
@@ -51,31 +53,20 @@ static constexpr auto update_indices_by_binning = overloaded{
     transform_flags::expect_no_variance_arg<2>};
 
 // Special faster implementation for linear bins.
-static constexpr auto update_indices_by_binning_linspace =
-    overloaded{update_indices_by_binning,
-               [](auto &index, const auto &x, const auto &edges) {
-                 if (index == -1)
-                   return;
-                 const auto [offset, nbin, scale] =
-                     core::linear_edge_params(edges);
-                 using Index = std::decay_t<decltype(index)>;
-                 Index bin = (x - offset) * scale;
-                 bin = std::clamp(bin, Index(0), Index(nbin - 1));
-                 index *= nbin;
-                 if (x < edges[bin]) {
-                   if (bin != 0 && x >= edges[bin - 1])
-                     index += bin - 1;
-                   else
-                     index = -1;
-                 } else if (x >= edges[bin + 1]) {
-                   if (bin != nbin - 1)
-                     index += bin + 1;
-                   else
-                     index = -1;
-                 } else {
-                   index += bin;
-                 }
-               }};
+static constexpr auto update_indices_by_binning_linspace = overloaded{
+    update_indices_by_binning,
+    [](auto &index, const auto &x, const auto &edges) {
+      if (index == -1)
+        return;
+      using Index = std::decay_t<decltype(index)>;
+      const auto params = core::linear_edge_params(edges);
+      if (const auto bin = get_bin<Index>(x, edges, params); bin < 0) {
+        index = -1;
+      } else {
+        index *= std::get<1>(params); // nbin
+        index += bin;
+      }
+    }};
 
 static constexpr auto update_indices_by_binning_sorted_edges =
     overloaded{update_indices_by_binning,
@@ -111,9 +102,9 @@ static constexpr auto groups_to_map = overloaded{
       return index;
     }};
 
-template <class Index, class T>
+template <class Index, class Coord, class Edges = Coord>
 using update_indices_by_grouping_arg =
-    std::tuple<Index, T, std::unordered_map<T, Index>>;
+    std::tuple<Index, Coord, std::unordered_map<Edges, Index>>;
 
 static constexpr auto update_indices_by_grouping = overloaded{
     element::arg_list<update_indices_by_grouping_arg<int64_t, double>,
@@ -122,6 +113,12 @@ static constexpr auto update_indices_by_grouping = overloaded{
                       update_indices_by_grouping_arg<int32_t, float>,
                       update_indices_by_grouping_arg<int64_t, int64_t>,
                       update_indices_by_grouping_arg<int32_t, int64_t>,
+                      // Given int32 target groups, select from int64. Note that
+                      // we do not support the reverse for now, since the
+                      // `groups.find(x)` below would then have to cast to a
+                      // lower precision, i.e., we would need special handling.
+                      update_indices_by_grouping_arg<int64_t, int64_t, int32_t>,
+                      update_indices_by_grouping_arg<int32_t, int64_t, int32_t>,
                       update_indices_by_grouping_arg<int64_t, int32_t>,
                       update_indices_by_grouping_arg<int32_t, int32_t>,
                       update_indices_by_grouping_arg<int64_t, bool>,

diff --git a/lib/core/include/scipp/core/element/histogram.h b/lib/core/include/scipp/core/element/histogram.h
@@ -46,6 +46,7 @@ static constexpr auto histogram = overloaded{
         histogram_detail::args<float, int32_t, float, int32_t>,
         histogram_detail::args<double, double, double, double>,
         histogram_detail::args<double, float, double, double>,
+        histogram_detail::args<double, double, double, float>,
         histogram_detail::args<double, float, double, float>,
         histogram_detail::args<double, double, float, double>,
         histogram_detail::args<double, int64_t, double, int64_t>,
@@ -62,20 +63,12 @@ static constexpr auto histogram = overloaded{
       // Special implementation for linear bins. Gives a 1x to 20x speedup
       // for few and many events per histogram, respectively.
       if (scipp::numeric::islinspace(edges)) {
-        const auto [offset, nbin, scale] = core::linear_edge_params(edges);
+        const auto params = core::linear_edge_params(edges);
         for (scipp::index i = 0; i < scipp::size(events); ++i) {
           const auto x = events[i];
-          scipp::index bin = (x - offset) * scale;
-          bin = std::clamp(bin, scipp::index(0), scipp::index(nbin - 1));
-          if (x < edges[bin]) {
-            if (bin != 0 && x >= edges[bin - 1])
-              iadd(data, bin - 1, weights, i);
-          } else if (x >= edges[bin + 1]) {
-            if (bin != nbin - 1)
-              iadd(data, bin + 1, weights, i);
-          } else {
+          if (const auto bin = get_bin<scipp::index>(x, edges, params);
+              bin >= 0)
             iadd(data, bin, weights, i);
-          }
         }
       } else {
         core::expect::histogram::sorted_edges(edges);

diff --git a/lib/core/include/scipp/core/histogram.h b/lib/core/include/scipp/core/histogram.h
@@ -27,4 +27,22 @@ template <class T> void sorted_edges(const T &edges) {
 }
 } // namespace expect::histogram
 
+template <class Index, class T, class Edges, class Params>
+Index get_bin(const T &x, const Edges &edges, const Params &params) {
+  // Explicitly check for x outside edges here as otherwise we may run into an
+  // integer overflow when converting the "bin" computation result to `Index`.
+  if (x < edges.front() || x >= edges.back())
+    return -1;
+  const auto [offset, nbin, scale] = params;
+  Index bin = (x - offset) * scale;
+  bin = std::clamp(bin, Index(0), Index(nbin - 1));
+  if (x < edges[bin]) {
+    return bin - 1;
+  } else if (x >= edges[bin + 1]) {
+    return bin + 1;
+  } else {
+    return bin;
+  }
+}
+
 } // namespace scipp::core
diff --git a/tests/binning_test.py b/tests/binning_test.py
@@ -760,3 +760,68 @@ def test_make_binned_via_bin_optimized_path_yields_equivalent_results(params):
         expected = expected.bin(sizes)
     expected = expected.bin(binning).hist()
     assert sc.identical(result.hist(), expected)
+
+
+def test_bin_linspace_handles_large_positive_values_correctly():
+    table = sc.data.table_xyz(10)
+    table.coords['x'].values[0] = 1e20
+    x = sc.linspace('x', 0.0, 1.0, 3, unit='m', dtype='float64')
+    da = table.bin(x=x)
+    assert sc.identical(da, table[1:].bin(x=x))
+    assert da.bins.size().sum().value == 9
+
+
+def test_bin_linspace_handles_large_negative_values_correctly():
+    table = sc.data.table_xyz(10)
+    table.coords['x'].values[0] = -1e20
+    x = sc.linspace('x', 0.0, 1.0, 3, unit='m', dtype='float64')
+    da = table.bin(x=x)
+    assert sc.identical(da, table[1:].bin(x=x))
+    assert da.bins.size().sum().value == 9
+
+
+def test_hist_linspace_handles_large_positive_values_correctly():
+    table = sc.data.table_xyz(10)
+    table.values[...] = 1.0
+    table.coords['x'].values[0] = 1e20
+    x = sc.linspace('x', 0.0, 1.0, 3, unit='m', dtype='float64')
+    da = table.hist(x=x)
+    assert sc.identical(da, table[1:].hist(x=x))
+    assert da.sum().value == 9
+
+
+def test_hist_linspace_handles_large_negative_values_correctly():
+    table = sc.data.table_xyz(10)
+    table.values[...] = 1.0
+    table.coords['x'].values[0] = -1e20
+    x = sc.linspace('x', 0.0, 1.0, 3, unit='m', dtype='float64')
+    da = table.hist(x=x)
+    assert sc.identical(da, table[1:].hist(x=x))
+    assert da.sum().value == 9
+
+
+def test_group_with_explicit_lower_precision_drops_rows_outside_domain():
+    table = sc.data.table_xyz(100)
+    table.coords['label'] = (table.coords['x'] * 10).to(dtype='int64')
+    table.coords['label'].values[0] = 0
+    da = table.group(sc.arange('label', 5, unit='m', dtype='int32'))
+    size0 = da.bins.size()['label', 0].value
+    size = da.bins.size().sum().value
+    table.coords['label'].values[0] = np.iinfo(np.int32).max + 100
+    da = table.group(sc.arange('label', 5, unit='m', dtype='int32'))
+    reference = table[1:].group(sc.arange('label', 5, unit='m', dtype='int32'))
+    assert sc.identical(da, reference)
+    assert da.bins.size()['label', 0].value == size0 - 1
+    assert da.bins.size().sum().value == size - 1
+
+
+def test_bin_with_explicit_lower_precision_drops_rows_outside_domain():
+    table = sc.data.table_xyz(100)
+    x = sc.linspace('x', 0.0, 1.0, 3, unit='m', dtype='float32')
+    da = table.bin(x=x)
+    size = da.bins.size().sum().value
+    table.coords['x'].values[0] = 2.0 * np.finfo(np.float32).max
+    da = table.bin(x=x)
+    reference = table[1:].bin(x=x)
+    assert sc.identical(da, reference)
+    assert da.bins.size().sum().value == size - 1