From fedfa4bb8ef787074a64fb8f0554e2235260162a Mon Sep 17 00:00:00 2001 From: Adam Reichold Date: Sat, 27 Apr 2024 13:22:27 +0200 Subject: [PATCH] Add k_smallest_relaxed and variants This implements the algorithm described in [1] which consumes twice the amount of memory as the existing `k_smallest` algorithm but achieves linear time in the number of elements in the input. [1] https://quickwit.io/blog/top-k-complexity --- src/k_smallest.rs | 39 ++++++++++ src/lib.rs | 187 ++++++++++++++++++++++++++++++++++++++++++++++ tests/test_std.rs | 46 +++++++++++- 3 files changed, 270 insertions(+), 2 deletions(-) diff --git a/src/k_smallest.rs b/src/k_smallest.rs index 7b2f62ea1..083c1b00d 100644 --- a/src/k_smallest.rs +++ b/src/k_smallest.rs @@ -88,6 +88,45 @@ where storage } +pub(crate) fn k_smallest_relaxed_general(iter: I, k: usize, mut comparator: F) -> Vec +where + I: Iterator, + F: FnMut(&I::Item, &I::Item) -> Ordering, +{ + if k == 0 { + iter.last(); + return Vec::new(); + } + + let mut iter = iter.fuse(); + let mut buf = iter.by_ref().take(2 * k).collect::>(); + + if buf.len() < k { + buf.sort_unstable_by(&mut comparator); + return buf; + } + + buf.select_nth_unstable_by(k - 1, &mut comparator); + buf.truncate(k); + + iter.for_each(|val| { + if comparator(&val, &buf[k - 1]) != Ordering::Less { + return; + } + + buf.push(val); + + if buf.len() == 2 * k { + buf.select_nth_unstable_by(k - 1, &mut comparator); + buf.truncate(k); + } + }); + + buf.sort_unstable_by(&mut comparator); + buf.truncate(k); + buf +} + #[inline] pub(crate) fn key_to_cmp(mut key: F) -> impl FnMut(&T, &T) -> Ordering where diff --git a/src/lib.rs b/src/lib.rs index ff14cf94e..cfbcd9f79 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -3060,6 +3060,105 @@ pub trait Itertools: Iterator { self.k_smallest_by(k, k_smallest::key_to_cmp(key)) } + /// Sort the k smallest elements into a new iterator, in ascending order, relaxing the amount of memory required. + /// + /// **Note:** This consumes the entire iterator, and returns the result + /// as a new iterator that owns its elements. If the input contains + /// less than k elements, the result is equivalent to `self.sorted()`. + /// + /// This is guaranteed to use `2 * k * sizeof(Self::Item) + O(1)` memory + /// and `O(n + k log k)` time, with `n` the number of elements in the input, + /// meaning it uses more memory than the minimum obtained by [`k_smallest`](Itertools::k_smallest) + /// but achieves linear time in the number of elements. + /// + /// The sorted iterator, if directly collected to a `Vec`, is converted + /// without any extra copying or allocation cost. + /// + /// **Note:** This is functionally-equivalent to `self.sorted().take(k)` + /// but much more efficient. + /// + /// ``` + /// use itertools::Itertools; + /// + /// // A random permutation of 0..15 + /// let numbers = vec![6, 9, 1, 14, 0, 4, 8, 7, 11, 2, 10, 3, 13, 12, 5]; + /// + /// let five_smallest = numbers + /// .into_iter() + /// .k_smallest_relaxed(5); + /// + /// itertools::assert_equal(five_smallest, 0..5); + /// ``` + #[cfg(feature = "use_alloc")] + fn k_smallest_relaxed(self, k: usize) -> VecIntoIter + where + Self: Sized, + Self::Item: Ord, + { + self.k_smallest_relaxed_by(k, Ord::cmp) + } + + /// Sort the k smallest elements into a new iterator using the provided comparison, relaxing the amount of memory required. + /// + /// The sorted iterator, if directly collected to a `Vec`, is converted + /// without any extra copying or allocation cost. + /// + /// This corresponds to `self.sorted_by(cmp).take(k)` in the same way that + /// [`k_smallest_relaxed`](Itertools::k_smallest_relaxed) corresponds to `self.sorted().take(k)`, + /// in both semantics and complexity. + /// + /// ``` + /// use itertools::Itertools; + /// + /// // A random permutation of 0..15 + /// let numbers = vec![6, 9, 1, 14, 0, 4, 8, 7, 11, 2, 10, 3, 13, 12, 5]; + /// + /// let five_smallest = numbers + /// .into_iter() + /// .k_smallest_relaxed_by(5, |a, b| (a % 7).cmp(&(b % 7)).then(a.cmp(b))); + /// + /// itertools::assert_equal(five_smallest, vec![0, 7, 14, 1, 8]); + /// ``` + #[cfg(feature = "use_alloc")] + fn k_smallest_relaxed_by(self, k: usize, cmp: F) -> VecIntoIter + where + Self: Sized, + F: FnMut(&Self::Item, &Self::Item) -> Ordering, + { + k_smallest::k_smallest_relaxed_general(self, k, cmp).into_iter() + } + + /// Return the elements producing the k smallest outputs of the provided function, relaxing the amount of memory required. + /// + /// The sorted iterator, if directly collected to a `Vec`, is converted + /// without any extra copying or allocation cost. + /// + /// This corresponds to `self.sorted_by_key(key).take(k)` in the same way that + /// [`k_smallest_relaxed`](Itertools::k_smallest_relaxed) corresponds to `self.sorted().take(k)`, + /// in both semantics and complexity. + /// + /// ``` + /// use itertools::Itertools; + /// + /// // A random permutation of 0..15 + /// let numbers = vec![6, 9, 1, 14, 0, 4, 8, 7, 11, 2, 10, 3, 13, 12, 5]; + /// + /// let five_smallest = numbers + /// .into_iter() + /// .k_smallest_by_key(5, |n| (n % 7, *n)); + /// + /// itertools::assert_equal(five_smallest, vec![0, 7, 14, 1, 8]); + /// ``` + #[cfg(feature = "use_alloc")] + fn k_smallest_relaxed_by_key(self, k: usize, key: F) -> VecIntoIter + where + Self: Sized, + F: FnMut(&Self::Item) -> K, + K: Ord, + { + self.k_smallest_relaxed_by(k, k_smallest::key_to_cmp(key)) + } + /// Sort the k largest elements into a new iterator, in descending order. /// /// The sorted iterator, if directly collected to a `Vec`, is converted @@ -3150,6 +3249,94 @@ pub trait Itertools: Iterator { self.k_largest_by(k, k_smallest::key_to_cmp(key)) } + /// Sort the k largest elements into a new iterator, in descending order, relaxing the amount of memory required. + /// + /// The sorted iterator, if directly collected to a `Vec`, is converted + /// without any extra copying or allocation cost. + /// + /// It is semantically equivalent to [`k_smallest_relaxed`](Itertools::k_smallest_relaxed) + /// with a reversed `Ord`. + /// + /// ``` + /// use itertools::Itertools; + /// + /// // A random permutation of 0..15 + /// let numbers = vec![6, 9, 1, 14, 0, 4, 8, 7, 11, 2, 10, 3, 13, 12, 5]; + /// + /// let five_largest = numbers + /// .into_iter() + /// .k_largest_relaxed(5); + /// + /// itertools::assert_equal(five_largest, vec![14, 13, 12, 11, 10]); + /// ``` + #[cfg(feature = "use_alloc")] + fn k_largest_relaxed(self, k: usize) -> VecIntoIter + where + Self: Sized, + Self::Item: Ord, + { + self.k_largest_relaxed_by(k, Self::Item::cmp) + } + + /// Sort the k largest elements into a new iterator using the provided comparison, relaxing the amount of memory required. + /// + /// The sorted iterator, if directly collected to a `Vec`, is converted + /// without any extra copying or allocation cost. + /// + /// Functionally equivalent to [`k_smallest_relaxed_by`](Itertools::k_smallest_relaxed_by) + /// with a reversed `Ord`. + /// + /// ``` + /// use itertools::Itertools; + /// + /// // A random permutation of 0..15 + /// let numbers = vec![6, 9, 1, 14, 0, 4, 8, 7, 11, 2, 10, 3, 13, 12, 5]; + /// + /// let five_largest = numbers + /// .into_iter() + /// .k_largest_relaxed_by(5, |a, b| (a % 7).cmp(&(b % 7)).then(a.cmp(b))); + /// + /// itertools::assert_equal(five_largest, vec![13, 6, 12, 5, 11]); + /// ``` + #[cfg(feature = "use_alloc")] + fn k_largest_relaxed_by(self, k: usize, mut cmp: F) -> VecIntoIter + where + Self: Sized, + F: FnMut(&Self::Item, &Self::Item) -> Ordering, + { + self.k_smallest_relaxed_by(k, move |a, b| cmp(b, a)) + } + + /// Return the elements producing the k largest outputs of the provided function, relaxing the amount of memory required. + /// + /// The sorted iterator, if directly collected to a `Vec`, is converted + /// without any extra copying or allocation cost. + /// + /// Functionally equivalent to [`k_smallest_relaxed_by_key`](Itertools::k_smallest_relaxed_by_key) + /// with a reversed `Ord`. + /// + /// ``` + /// use itertools::Itertools; + /// + /// // A random permutation of 0..15 + /// let numbers = vec![6, 9, 1, 14, 0, 4, 8, 7, 11, 2, 10, 3, 13, 12, 5]; + /// + /// let five_largest = numbers + /// .into_iter() + /// .k_largest_relaxed_by_key(5, |n| (n % 7, *n)); + /// + /// itertools::assert_equal(five_largest, vec![13, 6, 12, 5, 11]); + /// ``` + #[cfg(feature = "use_alloc")] + fn k_largest_relaxed_by_key(self, k: usize, key: F) -> VecIntoIter + where + Self: Sized, + F: FnMut(&Self::Item) -> K, + K: Ord, + { + self.k_largest_relaxed_by(k, k_smallest::key_to_cmp(key)) + } + /// Consumes the iterator and return an iterator of the last `n` elements. /// /// The iterator, if directly collected to a `VecDeque`, is converted diff --git a/tests/test_std.rs b/tests/test_std.rs index 00246d506..e759c0f92 100644 --- a/tests/test_std.rs +++ b/tests/test_std.rs @@ -527,6 +527,42 @@ qc::quickcheck! { it::assert_equal(largest_by, sorted_largest.clone()); it::assert_equal(largest_by_key, sorted_largest); } + + fn k_smallest_relaxed_range(n: i64, m: u16, k: u16) -> () { + // u16 is used to constrain k and m to 0..2¹⁶, + // otherwise the test could use too much memory. + let (k, m) = (k as usize, m as u64); + + let mut v: Vec<_> = (n..n.saturating_add(m as _)).collect(); + // Generate a random permutation of n..n+m + v.shuffle(&mut thread_rng()); + + // Construct the right answers for the top and bottom elements + let mut sorted = v.clone(); + sorted.sort(); + // how many elements are we checking + let num_elements = min(k, m as _); + + // Compute the top and bottom k in various combinations + let sorted_smallest = sorted[..num_elements].iter().cloned(); + let smallest = v.iter().cloned().k_smallest_relaxed(k); + let smallest_by = v.iter().cloned().k_smallest_relaxed_by(k, Ord::cmp); + let smallest_by_key = v.iter().cloned().k_smallest_relaxed_by_key(k, |&x| x); + + let sorted_largest = sorted[sorted.len() - num_elements..].iter().rev().cloned(); + let largest = v.iter().cloned().k_largest_relaxed(k); + let largest_by = v.iter().cloned().k_largest_relaxed_by(k, Ord::cmp); + let largest_by_key = v.iter().cloned().k_largest_relaxed_by_key(k, |&x| x); + + // Check the variations produce the same answers and that they're right + it::assert_equal(smallest, sorted_smallest.clone()); + it::assert_equal(smallest_by, sorted_smallest.clone()); + it::assert_equal(smallest_by_key, sorted_smallest); + + it::assert_equal(largest, sorted_largest.clone()); + it::assert_equal(largest_by, sorted_largest.clone()); + it::assert_equal(largest_by_key, sorted_largest); + } } #[derive(Clone, Debug)] @@ -571,8 +607,11 @@ where I::Item: Ord + Debug, { let j = i.clone(); + let i1 = i.clone(); + let j1 = i.clone(); let k = k as usize; - it::assert_equal(i.k_smallest(k), j.sorted().take(k)) + it::assert_equal(i.k_smallest(k), j.sorted().take(k)); + it::assert_equal(i1.k_smallest_relaxed(k), j1.sorted().take(k)); } // Similar to `k_smallest_sort` but for our custom heap implementation. @@ -582,8 +621,11 @@ where I::Item: Ord + Debug, { let j = i.clone(); + let i1 = i.clone(); + let j1 = i.clone(); let k = k as usize; - it::assert_equal(i.k_smallest_by(k, Ord::cmp), j.sorted().take(k)) + it::assert_equal(i.k_smallest_by(k, Ord::cmp), j.sorted().take(k)); + it::assert_equal(i1.k_smallest_relaxed_by(k, Ord::cmp), j1.sorted().take(k)); } macro_rules! generic_test {