scipp · SimonHeybrock · Jan 5, 2023 · Jan 4, 2023 · Jan 4, 2023 · Jan 4, 2023
diff --git a/docs/about/release-notes.rst b/docs/about/release-notes.rst
@@ -35,6 +35,14 @@ Release Notes
    and Jan-Lukas Wynen :sup:`a`
 
 
+v22.11.1 (January 2023)
+-----------------------
+
+Bugfixes
+~~~~~~~~
+
+* Fix a bug in :py:func:`scipp.hist` and :py:func:`scipp.bin`, leading to assignment of records with very large coord values outside the bin boundaries to a bin `#2923 <https://github.com/scipp/scipp/pull/2923>`_.
+
 v22.11.0 (November 2022)
 ------------------------
 

diff --git a/lib/core/include/scipp/core/element/bin.h b/lib/core/include/scipp/core/element/bin.h
@@ -40,6 +40,8 @@ static constexpr auto update_indices_by_binning = overloaded{
                       update_indices_by_binning_arg<int32_t, int32_t, double>,
                       update_indices_by_binning_arg<int64_t, float, double>,
                       update_indices_by_binning_arg<int32_t, float, double>,
+                      update_indices_by_binning_arg<int64_t, double, float>,
+                      update_indices_by_binning_arg<int32_t, double, float>,
                       update_indices_by_binning_arg<int64_t, int32_t, int64_t>,
                       update_indices_by_binning_arg<int32_t, int32_t, int64_t>>,
     [](units::Unit &indices, const units::Unit &coord,
@@ -56,22 +58,23 @@ static constexpr auto update_indices_by_binning_linspace =
                [](auto &index, const auto &x, const auto &edges) {
                  if (index == -1)
                    return;
+                 // Explicitly check for x outside edges here as otherwise we
+                 // may run into an integer overflow when converting the "bin"
+                 // computation result to `Index`.
+                 if (x < edges.front() || x >= edges.back()) {
+                   index = -1;
+                   return;
+                 }
                  const auto [offset, nbin, scale] =
                      core::linear_edge_params(edges);
                  using Index = std::decay_t<decltype(index)>;
                  Index bin = (x - offset) * scale;
                  bin = std::clamp(bin, Index(0), Index(nbin - 1));
                  index *= nbin;
                  if (x < edges[bin]) {
-                   if (bin != 0 && x >= edges[bin - 1])
-                     index += bin - 1;
-                   else
-                     index = -1;
+                   index += bin - 1;
                  } else if (x >= edges[bin + 1]) {
-                   if (bin != nbin - 1)
-                     index += bin + 1;
-                   else
-                     index = -1;
+                   index += bin + 1;
                  } else {
                    index += bin;
                  }
@@ -111,9 +114,9 @@ static constexpr auto groups_to_map = overloaded{
       return index;
     }};
 
-template <class Index, class T>
+template <class Index, class Coord, class Edges = Coord>
 using update_indices_by_grouping_arg =
-    std::tuple<Index, T, std::unordered_map<T, Index>>;
+    std::tuple<Index, Coord, std::unordered_map<Edges, Index>>;
 
 static constexpr auto update_indices_by_grouping = overloaded{
     element::arg_list<update_indices_by_grouping_arg<int64_t, double>,
@@ -122,6 +125,12 @@ static constexpr auto update_indices_by_grouping = overloaded{
                       update_indices_by_grouping_arg<int32_t, float>,
                       update_indices_by_grouping_arg<int64_t, int64_t>,
                       update_indices_by_grouping_arg<int32_t, int64_t>,
+                      // Given int32 target groups, select from int64. Note that
+                      // we do not support the reverse for now, since the
+                      // `groups.find(x)` below would then have to cast to a
+                      // lower precision, i.e., we would need special handling.
+                      update_indices_by_grouping_arg<int64_t, int64_t, int32_t>,
+                      update_indices_by_grouping_arg<int32_t, int64_t, int32_t>,
                       update_indices_by_grouping_arg<int64_t, int32_t>,
                       update_indices_by_grouping_arg<int32_t, int32_t>,
                       update_indices_by_grouping_arg<int64_t, bool>,

diff --git a/lib/core/include/scipp/core/element/histogram.h b/lib/core/include/scipp/core/element/histogram.h
@@ -46,6 +46,7 @@ static constexpr auto histogram = overloaded{
         histogram_detail::args<float, int32_t, float, int32_t>,
         histogram_detail::args<double, double, double, double>,
         histogram_detail::args<double, float, double, double>,
+        histogram_detail::args<double, double, double, float>,
         histogram_detail::args<double, float, double, float>,
         histogram_detail::args<double, double, float, double>,
         histogram_detail::args<double, int64_t, double, int64_t>,
@@ -65,14 +66,17 @@ static constexpr auto histogram = overloaded{
         const auto [offset, nbin, scale] = core::linear_edge_params(edges);
         for (scipp::index i = 0; i < scipp::size(events); ++i) {
           const auto x = events[i];
+          // Explicitly check for x outside edges here as otherwise we
+          // may run into an integer overflow when converting the "bin"
+          // computation result to `Index`.
+          if (x < edges.front() || x >= edges.back())
+            continue;
           scipp::index bin = (x - offset) * scale;
           bin = std::clamp(bin, scipp::index(0), scipp::index(nbin - 1));
           if (x < edges[bin]) {
-            if (bin != 0 && x >= edges[bin - 1])
-              iadd(data, bin - 1, weights, i);
+            iadd(data, bin - 1, weights, i);
           } else if (x >= edges[bin + 1]) {
-            if (bin != nbin - 1)
-              iadd(data, bin + 1, weights, i);
+            iadd(data, bin + 1, weights, i);
           } else {
             iadd(data, bin, weights, i);
           }

diff --git a/tests/binning_test.py b/tests/binning_test.py
@@ -760,3 +760,56 @@ def test_make_binned_via_bin_optimized_path_yields_equivalent_results(params):
         expected = expected.bin(sizes)
     expected = expected.bin(binning).hist()
     assert sc.identical(result.hist(), expected)
+
+
+def test_bin_linspace_handles_large_positive_values_correctly():
+    table = sc.data.table_xyz(10)
+    table.coords['x'].values[0] = 1e16
+    da = table.bin(x=sc.linspace('x', 0.0, 1.0, 3, unit='m', dtype='float64'))
+    assert da.bins.size().sum().value == 9
+
+
+def test_bin_linspace_handles_large_negative_values_correctly():
+    table = sc.data.table_xyz(10)
+    table.coords['x'].values[0] = -1e16
+    da = table.bin(x=sc.linspace('x', 0.0, 1.0, 3, unit='m', dtype='float64'))
+    assert da.bins.size().sum().value == 9
+
+
+def test_hist_linspace_handles_large_positive_values_correctly():
+    table = sc.data.table_xyz(10)
+    table.values[...] = 1.0
+    table.coords['x'].values[0] = 1e20
+    da = table.hist(x=sc.linspace('x', 0.0, 1.0, 3, unit='m', dtype='float64'))
+    assert da.sum().value == 9
+
+
+def test_hist_linspace_handles_large_negative_values_correctly():
+    table = sc.data.table_xyz(10)
+    table.values[...] = 1.0
+    table.coords['x'].values[0] = -1e20
+    da = table.hist(x=sc.linspace('x', 0.0, 1.0, 3, unit='m', dtype='float64'))
+    assert da.sum().value == 9
+
+
+def test_group_with_explicit_lower_precision_drops_rows_outside_domain():
+    table = sc.data.table_xyz(100)
+    table.coords['label'] = (table.coords['x'] * 10).to(dtype='int64')
+    table.coords['label'].values[0] = 0
+    da = table.group(sc.arange('label', 5, unit='m', dtype='int32'))
+    size0 = da.bins.size()['label', 0].value
+    size = da.bins.size().sum().value
+    table.coords['label'].values[0] = np.iinfo(np.int32).max + 100
+    da = table.group(sc.arange('label', 5, unit='m', dtype='int32'))
+    assert da.bins.size()['label', 0].value == size0 - 1
+    assert da.bins.size().sum().value == size - 1
+
+
+def test_bin_with_explicit_lower_precision_drops_rows_outside_domain():
+    table = sc.data.table_xyz(100)
+    x = sc.linspace('x', 0.0, 1.0, 3, unit='m', dtype='float32')
+    da = table.bin(x=x)
+    size = da.bins.size().sum().value
+    table.coords['x'].values[0] = 2.0 * np.finfo(np.float32).max
+    da = table.bin(x=x)
+    assert da.bins.size().sum().value == size - 1