Skip to content

Commit

Permalink
handle user input on get_docid_for_value_range
Browse files Browse the repository at this point in the history
  • Loading branch information
PSeitz committed Jan 13, 2023
1 parent 509adab commit 079f542
Show file tree
Hide file tree
Showing 8 changed files with 240 additions and 33 deletions.
32 changes: 18 additions & 14 deletions fastfield_codecs/src/column.rs
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
use std::fmt::{self, Debug};
use std::marker::PhantomData;
use std::ops::{Range, RangeInclusive};

Expand All @@ -6,7 +7,7 @@ use tantivy_bitpacker::minmax;
use crate::monotonic_mapping::StrictlyMonotonicFn;

/// `Column` provides columnar access on a field.
pub trait Column<T: PartialOrd = u64>: Send + Sync {
pub trait Column<T: PartialOrd + Debug = u64>: Send + Sync {
/// Return the value associated with the given idx.
///
/// This accessor should return as fast as possible.
Expand Down Expand Up @@ -83,7 +84,7 @@ pub struct VecColumn<'a, T = u64> {
max_value: T,
}

impl<'a, C: Column<T>, T: Copy + PartialOrd> Column<T> for &'a C {
impl<'a, C: Column<T>, T: Copy + PartialOrd + fmt::Debug> Column<T> for &'a C {
fn get_val(&self, idx: u32) -> T {
(*self).get_val(idx)
}
Expand All @@ -109,7 +110,7 @@ impl<'a, C: Column<T>, T: Copy + PartialOrd> Column<T> for &'a C {
}
}

impl<'a, T: Copy + PartialOrd + Send + Sync> Column<T> for VecColumn<'a, T> {
impl<'a, T: Copy + PartialOrd + Send + Sync + Debug> Column<T> for VecColumn<'a, T> {
fn get_val(&self, position: u32) -> T {
self.values[position as usize]
}
Expand Down Expand Up @@ -177,8 +178,8 @@ pub fn monotonic_map_column<C, T, Input, Output>(
where
C: Column<Input>,
T: StrictlyMonotonicFn<Input, Output> + Send + Sync,
Input: PartialOrd + Send + Sync + Clone,
Output: PartialOrd + Send + Sync + Clone,
Input: PartialOrd + Send + Sync + Copy + Debug,
Output: PartialOrd + Send + Sync + Copy + Debug,
{
MonotonicMappingColumn {
from_column,
Expand All @@ -191,8 +192,8 @@ impl<C, T, Input, Output> Column<Output> for MonotonicMappingColumn<C, T, Input>
where
C: Column<Input>,
T: StrictlyMonotonicFn<Input, Output> + Send + Sync,
Input: PartialOrd + Send + Sync + Clone,
Output: PartialOrd + Send + Sync + Clone,
Input: PartialOrd + Send + Sync + Copy + Debug,
Output: PartialOrd + Send + Sync + Copy + Debug,
{
#[inline]
fn get_val(&self, idx: u32) -> Output {
Expand Down Expand Up @@ -228,12 +229,15 @@ where
doc_id_range: Range<u32>,
positions: &mut Vec<u32>,
) {
self.from_column.get_docids_for_value_range(
self.monotonic_mapping.inverse(range.start().clone())
..=self.monotonic_mapping.inverse(range.end().clone()),
doc_id_range,
positions,
)
if range.start() > &self.max_value() || range.end() < &self.min_value() {
return;
}
let range = self.monotonic_mapping.inverse_coerce(range);
if range.start() > range.end() {
return;
}
self.from_column
.get_docids_for_value_range(range, doc_id_range, positions)
}

// We voluntarily do not implement get_range as it yields a regression,
Expand All @@ -254,7 +258,7 @@ where T: Iterator + Clone + ExactSizeIterator
impl<T> Column<T::Item> for IterColumn<T>
where
T: Iterator + Clone + ExactSizeIterator + Send + Sync,
T::Item: PartialOrd,
T::Item: PartialOrd + fmt::Debug,
{
fn get_val(&self, idx: u32) -> T::Item {
self.0.clone().nth(idx as usize).unwrap()
Expand Down
4 changes: 3 additions & 1 deletion fastfield_codecs/src/compact_space/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -455,6 +455,8 @@ impl CompactSpaceDecompressor {
#[cfg(test)]
mod tests {

use std::fmt;

use super::*;
use crate::format_version::read_format_version;
use crate::null_index_footer::read_null_index_footer;
Expand Down Expand Up @@ -708,7 +710,7 @@ mod tests {
);
}

fn get_positions_for_value_range_helper<C: Column<T> + ?Sized, T: PartialOrd>(
fn get_positions_for_value_range_helper<C: Column<T> + ?Sized, T: PartialOrd + fmt::Debug>(
column: &C,
value_range: RangeInclusive<T>,
doc_id_range: Range<u32>,
Expand Down
16 changes: 12 additions & 4 deletions fastfield_codecs/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -14,9 +14,9 @@ extern crate more_asserts;
#[cfg(all(test, feature = "unstable"))]
extern crate test;

use std::io;
use std::io::Write;
use std::sync::Arc;
use std::{fmt, io};

use common::BinarySerializable;
use compact_space::CompactSpaceDecompressor;
Expand Down Expand Up @@ -132,7 +132,7 @@ impl U128FastFieldCodecType {
}

/// Returns the correct codec reader wrapped in the `Arc` for the data.
pub fn open_u128<Item: MonotonicallyMappableToU128>(
pub fn open_u128<Item: MonotonicallyMappableToU128 + fmt::Debug>(
bytes: OwnedBytes,
) -> io::Result<Arc<dyn Column<Item>>> {
let (bytes, _format_version) = read_format_version(bytes)?;
Expand All @@ -146,7 +146,9 @@ pub fn open_u128<Item: MonotonicallyMappableToU128>(
}

/// Returns the correct codec reader wrapped in the `Arc` for the data.
pub fn open<T: MonotonicallyMappableToU64>(bytes: OwnedBytes) -> io::Result<Arc<dyn Column<T>>> {
pub fn open<T: MonotonicallyMappableToU64 + fmt::Debug>(
bytes: OwnedBytes,
) -> io::Result<Arc<dyn Column<T>>> {
let (bytes, _format_version) = read_format_version(bytes)?;
let (mut bytes, _null_index_footer) = read_null_index_footer(bytes)?;
let header = Header::deserialize(&mut bytes)?;
Expand All @@ -159,7 +161,7 @@ pub fn open<T: MonotonicallyMappableToU64>(bytes: OwnedBytes) -> io::Result<Arc<
}
}

fn open_specific_codec<C: FastFieldCodec, Item: MonotonicallyMappableToU64>(
fn open_specific_codec<C: FastFieldCodec, Item: MonotonicallyMappableToU64 + fmt::Debug>(
bytes: OwnedBytes,
header: &Header,
) -> io::Result<Arc<dyn Column<Item>>> {
Expand Down Expand Up @@ -320,13 +322,19 @@ mod tests {
pub fn get_codec_test_datasets() -> Vec<(Vec<u64>, &'static str)> {
let mut data_and_names = vec![];

let data = vec![10];
data_and_names.push((data, "minimal test"));

let data = (10..=10_000_u64).collect::<Vec<_>>();
data_and_names.push((data, "simple monotonically increasing"));

data_and_names.push((
vec![5, 6, 7, 8, 9, 10, 99, 100],
"offset in linear interpol",
));

data_and_names.push((vec![3, 18446744073709551613, 5], "docid range regression"));

data_and_names.push((vec![5, 50, 3, 13, 1, 1000, 35], "rand small"));
data_and_names.push((vec![10], "single value"));

Expand Down
78 changes: 74 additions & 4 deletions fastfield_codecs/src/monotonic_mapping.rs
Original file line number Diff line number Diff line change
@@ -1,12 +1,16 @@
use std::fmt;
use std::marker::PhantomData;
use std::ops::RangeInclusive;

use fastdivide::DividerU64;

use crate::MonotonicallyMappableToU128;

/// Monotonic maps a value to u64 value space.
/// Monotonic mapping enables `PartialOrd` on u64 space without conversion to original space.
pub trait MonotonicallyMappableToU64: 'static + PartialOrd + Copy + Send + Sync {
pub trait MonotonicallyMappableToU64:
'static + PartialOrd + Copy + Send + Sync + fmt::Debug
{
/// Converts a value to u64.
///
/// Internally all fast field values are encoded as u64.
Expand All @@ -29,11 +33,29 @@ pub trait MonotonicallyMappableToU64: 'static + PartialOrd + Copy + Send + Sync
/// mapping from their range to their domain. The `inverse` method is required when opening a codec,
/// so a value can be converted back to its original domain (e.g. ip address or f64) from its
/// internal representation.
pub trait StrictlyMonotonicFn<External, Internal> {
pub trait StrictlyMonotonicFn<External: Copy, Internal: Copy> {
/// Strictly monotonically maps the value from External to Internal.
fn mapping(&self, inp: External) -> Internal;
/// Inverse of `mapping`. Maps the value from Internal to External.
fn inverse(&self, out: Internal) -> External;

/// Maps a user provded value from External to Internal.
/// It may be necessary to coerce the value if it is outside the value space.
/// In that case it tries to find the next greater value in the value space.
///
/// Returns a bool to mark if a value was outside the value space and had to be coerced _up_.
/// With that information we can detect if two values in a range both map outside the same value
/// space.
///
/// coerce_up means the next valid upper value in the value space will be chosen if the value
/// has to be coerced.
fn mapping_coerce(&self, inp: RangeInclusive<External>) -> RangeInclusive<Internal> {
self.mapping(*inp.start())..=self.mapping(*inp.end())
}
/// Inverse of `mapping_coerce`.
fn inverse_coerce(&self, out: RangeInclusive<Internal>) -> RangeInclusive<External> {
self.inverse(*out.start())..=self.inverse(*out.end())
}
}

/// Inverts a strictly monotonic mapping from `StrictlyMonotonicFn<A, B>` to
Expand All @@ -54,7 +76,10 @@ impl<T> From<T> for StrictlyMonotonicMappingInverter<T> {
}

impl<From, To, T> StrictlyMonotonicFn<To, From> for StrictlyMonotonicMappingInverter<T>
where T: StrictlyMonotonicFn<From, To>
where
T: StrictlyMonotonicFn<From, To>,
From: Copy,
To: Copy,
{
fn mapping(&self, val: To) -> From {
self.orig_mapping.inverse(val)
Expand All @@ -63,6 +88,15 @@ where T: StrictlyMonotonicFn<From, To>
fn inverse(&self, val: From) -> To {
self.orig_mapping.mapping(val)
}

#[inline]
fn mapping_coerce(&self, inp: RangeInclusive<To>) -> RangeInclusive<From> {
self.orig_mapping.inverse_coerce(inp)
}
#[inline]
fn inverse_coerce(&self, out: RangeInclusive<From>) -> RangeInclusive<To> {
self.orig_mapping.mapping_coerce(out)
}
}

/// Applies the strictly monotonic mapping from `T` without any additional changes.
Expand Down Expand Up @@ -134,6 +168,31 @@ impl<External: MonotonicallyMappableToU64> StrictlyMonotonicFn<External, u64>
fn inverse(&self, out: u64) -> External {
External::from_u64(self.min_value + out * self.gcd)
}

#[inline]
#[allow(clippy::reversed_empty_ranges)]
fn mapping_coerce(&self, inp: RangeInclusive<External>) -> RangeInclusive<u64> {
let end = External::to_u64(*inp.end());
if end < self.min_value || inp.end() < inp.start() {
return 1..=0;
}
let map_coerce = |mut inp, coerce_up| {
let inp_lower_bound = self.inverse(0);
if inp < inp_lower_bound {
inp = inp_lower_bound;
}
let val = External::to_u64(inp);
let need_coercion = coerce_up && (val - self.min_value) % self.gcd != 0;
let mut mapped_val = self.mapping(inp);
if need_coercion {
mapped_val += 1;
}
mapped_val
};
let start = map_coerce(*inp.start(), true);
let end = map_coerce(*inp.end(), false);
start..=end
}
}

/// Strictly monotonic mapping with a base value.
Expand All @@ -149,6 +208,17 @@ impl StrictlyMonotonicMappingToInternalBaseval {
impl<External: MonotonicallyMappableToU64> StrictlyMonotonicFn<External, u64>
for StrictlyMonotonicMappingToInternalBaseval
{
#[inline]
#[allow(clippy::reversed_empty_ranges)]
fn mapping_coerce(&self, inp: RangeInclusive<External>) -> RangeInclusive<u64> {
if External::to_u64(*inp.end()) < self.min_value {
return 1..=0;
}
let start = self.mapping(External::to_u64(*inp.start()).max(self.min_value));
let end = self.mapping(External::to_u64(*inp.end()));
start..=end
}

fn mapping(&self, val: External) -> u64 {
External::to_u64(val) - self.min_value
}
Expand Down Expand Up @@ -224,7 +294,7 @@ mod tests {
test_round_trip::<_, _, u64>(&mapping, 100u64);
}

fn test_round_trip<T: StrictlyMonotonicFn<K, L>, K: std::fmt::Debug + Eq + Copy, L>(
fn test_round_trip<T: StrictlyMonotonicFn<K, L>, K: std::fmt::Debug + Eq + Copy, L: Copy>(
mapping: &T,
test_val: K,
) {
Expand Down
5 changes: 4 additions & 1 deletion fastfield_codecs/src/monotonic_mapping_u128.rs
Original file line number Diff line number Diff line change
@@ -1,8 +1,11 @@
use std::fmt;
use std::net::Ipv6Addr;

/// Montonic maps a value to u128 value space
/// Monotonic mapping enables `PartialOrd` on u128 space without conversion to original space.
pub trait MonotonicallyMappableToU128: 'static + PartialOrd + Copy + Send + Sync {
pub trait MonotonicallyMappableToU128:
'static + PartialOrd + Copy + Send + Sync + fmt::Debug
{
/// Converts a value to u128.
///
/// Internally all fast field values are encoded as u64.
Expand Down
8 changes: 4 additions & 4 deletions fastfield_codecs/src/serialize.rs
Original file line number Diff line number Diff line change
Expand Up @@ -17,9 +17,9 @@
// You should have received a copy of the GNU Affero General Public License
// along with this program. If not, see <http://www.gnu.org/licenses/>.

use std::io;
use std::num::NonZeroU64;
use std::sync::Arc;
use std::{fmt, io};

use common::{BinarySerializable, VInt};
use log::warn;
Expand Down Expand Up @@ -168,7 +168,7 @@ impl BinarySerializable for Header {

/// Return estimated compression for given codec in the value range [0.0..1.0], where 1.0 means no
/// compression.
pub fn estimate<T: MonotonicallyMappableToU64>(
pub fn estimate<T: MonotonicallyMappableToU64 + fmt::Debug>(
typed_column: impl Column<T>,
codec_type: FastFieldCodecType,
) -> Option<f32> {
Expand Down Expand Up @@ -214,7 +214,7 @@ pub fn serialize_u128<F: Fn() -> I, I: Iterator<Item = u128>>(
}

/// Serializes the column with the codec with the best estimate on the data.
pub fn serialize<T: MonotonicallyMappableToU64>(
pub fn serialize<T: MonotonicallyMappableToU64 + fmt::Debug>(
typed_column: impl Column<T>,
output: &mut impl io::Write,
codecs: &[FastFieldCodecType],
Expand Down Expand Up @@ -294,7 +294,7 @@ fn serialize_given_codec(
}

/// Helper function to serialize a column (autodetect from all codecs) and then open it
pub fn serialize_and_load<T: MonotonicallyMappableToU64 + Ord + Default>(
pub fn serialize_and_load<T: MonotonicallyMappableToU64 + Ord + Default + fmt::Debug>(
column: &[T],
) -> Arc<dyn Column<T>> {
let mut buffer = Vec::new();
Expand Down
Loading

0 comments on commit 079f542

Please sign in to comment.