From 9d8feb8cc9d5e9458a97532585d35fd1c78370fa Mon Sep 17 00:00:00 2001 From: Jonas Rembser Date: Sun, 31 May 2026 15:30:23 +0200 Subject: [PATCH] [ml] Fix out-of-bounds read in filtered cluster split In RClusterLoader::LoadTrainingClusterInto the boundary entry was read from rdfEntries unconditionally before checking valCount, so when one side of the split is empty the access was past end-of-vector. This was silent in non-hardened builds but trips libstdc++ assertions (e.g. with libcxxhardeningfast), aborting test09_filtered_last_chunk. Compute the boundary only when the corresponding rdfEntries index is in-bounds, falling back to the cluster endpoint otherwise. (cherry picked from commit 6183bad2b3acd9c9a34f0af921ad8374af4337e1) --- tree/ml/inc/ROOT/ML/RClusterLoader.hxx | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/tree/ml/inc/ROOT/ML/RClusterLoader.hxx b/tree/ml/inc/ROOT/ML/RClusterLoader.hxx index 090b097923c1a..b2abf47a183c2 100644 --- a/tree/ml/inc/ROOT/ML/RClusterLoader.hxx +++ b/tree/ml/inc/ROOT/ML/RClusterLoader.hxx @@ -410,10 +410,20 @@ public: trainIsPrefix = coin(g); } - // The boundary is the raw entry index of the first entry assigned to validation. - // Stable across epochs since the same filter always produces the same ordered entries. - const std::uint64_t trainBoundaryEntry = trainIsPrefix ? rdfEntries[trainCount] : rdfEntries[valCount]; - const std::uint64_t boundary = (valCount > 0) ? trainBoundaryEntry : endRow; + // The boundary is the raw entry index that splits train and val sub-ranges within the + // cluster. Stable across epochs since the same filter always produces the same ordered + // entries. When one side has no filtered entries we fall back to the cluster endpoint that + // collapses that side to an empty range, avoiding an out-of-bounds access into rdfEntries + // (whose size is totalFiltered, so rdfEntries[totalFiltered] is OOB and trips libstdc++ + // hardened-mode assertions). + std::uint64_t boundary; + if (trainIsPrefix) { + // train = [startRow, boundary), val = [boundary, endRow) + boundary = (trainCount < totalFiltered) ? rdfEntries[trainCount] : endRow; + } else { + // train = [boundary, endRow), val = [startRow, boundary) + boundary = (valCount < totalFiltered) ? rdfEntries[valCount] : endRow; + } const std::uint64_t trainStart = trainIsPrefix ? startRow : boundary; const std::uint64_t trainEnd = trainIsPrefix ? boundary : endRow;