Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

use optional index in multivalued index #2439

Merged
merged 3 commits into from
Jun 19, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
73 changes: 6 additions & 67 deletions columnar/benches/bench_access.rs
Original file line number Diff line number Diff line change
@@ -1,73 +1,13 @@
use core::fmt;
use std::fmt::{Display, Formatter};

use binggan::{black_box, InputGroup};
use tantivy_columnar::*;
use common::*;
use tantivy_columnar::Column;

pub enum Card {
MultiSparse,
Multi,
Sparse,
Dense,
Full,
}
impl Display for Card {
fn fmt(&self, f: &mut Formatter) -> fmt::Result {
match self {
Card::MultiSparse => write!(f, "multi sparse 1/13"),
Card::Multi => write!(f, "multi 2x"),
Card::Sparse => write!(f, "sparse 1/13"),
Card::Dense => write!(f, "dense 1/12"),
Card::Full => write!(f, "full"),
}
}
}
pub mod common;

const NUM_DOCS: u32 = 2_000_000;

pub fn generate_columnar(card: Card, num_docs: u32) -> ColumnarReader {
use tantivy_columnar::ColumnarWriter;

let mut columnar_writer = ColumnarWriter::default();

match card {
Card::MultiSparse => {
columnar_writer.record_numerical(0, "price", 10u64);
columnar_writer.record_numerical(0, "price", 10u64);
}
_ => {}
}

for i in 0..num_docs {
match card {
Card::MultiSparse | Card::Sparse => {
if i % 13 == 0 {
columnar_writer.record_numerical(i, "price", i as u64);
}
}
Card::Dense => {
if i % 12 == 0 {
columnar_writer.record_numerical(i, "price", i as u64);
}
}
Card::Full => {
columnar_writer.record_numerical(i, "price", i as u64);
}
Card::Multi => {
columnar_writer.record_numerical(i, "price", i as u64);
columnar_writer.record_numerical(i, "price", i as u64);
}
}
}

let mut wrt: Vec<u8> = Vec::new();
columnar_writer.serialize(num_docs, &mut wrt).unwrap();
let reader = ColumnarReader::open(wrt).unwrap();
reader
}

pub fn generate_columnar_and_open(card: Card, num_docs: u32) -> Column {
let reader = generate_columnar(card, num_docs);
let reader = generate_columnar_with_name(card, num_docs, "price");
reader.read_columns("price").unwrap()[0]
.open_u64_lenient()
.unwrap()
Expand Down Expand Up @@ -116,9 +56,8 @@ fn bench_group(mut runner: InputGroup<Column>) {

column.first_vals(&docs, &mut buffer);
for val in buffer.iter() {
if let Some(val) = val {
sum += *val;
}
let Some(val) = val else { continue };
sum += *val;
}
}

Expand Down
8 changes: 4 additions & 4 deletions columnar/benches/bench_merge.rs
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
mod bench_access;
pub mod common;

use bench_access::{generate_columnar, Card};
use binggan::{black_box, BenchRunner};
use common::{generate_columnar_with_name, Card};
use tantivy_columnar::*;

const NUM_DOCS: u32 = 100_000;
Expand All @@ -13,8 +13,8 @@ fn main() {
inputs.push((
format!("merge_{card1}_and_{card2}"),
vec![
generate_columnar(card1, NUM_DOCS),
generate_columnar(card2, NUM_DOCS),
generate_columnar_with_name(card1, NUM_DOCS, "price"),
generate_columnar_with_name(card2, NUM_DOCS, "price"),
],
));
};
Expand Down
59 changes: 59 additions & 0 deletions columnar/benches/common.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
extern crate tantivy_columnar;

use core::fmt;
use std::fmt::{Display, Formatter};

use tantivy_columnar::{ColumnarReader, ColumnarWriter};

pub enum Card {
MultiSparse,
Multi,
Sparse,
Dense,
Full,
}
impl Display for Card {
fn fmt(&self, f: &mut Formatter) -> fmt::Result {
match self {
Card::MultiSparse => write!(f, "multi sparse 1/13"),
Card::Multi => write!(f, "multi 2x"),
Card::Sparse => write!(f, "sparse 1/13"),
Card::Dense => write!(f, "dense 1/12"),
Card::Full => write!(f, "full"),
}
}
}
pub fn generate_columnar_with_name(card: Card, num_docs: u32, column_name: &str) -> ColumnarReader {
let mut columnar_writer = ColumnarWriter::default();

if let Card::MultiSparse = card {
columnar_writer.record_numerical(0, column_name, 10u64);
columnar_writer.record_numerical(0, column_name, 10u64);
}

for i in 0..num_docs {
match card {
Card::MultiSparse | Card::Sparse => {
if i % 13 == 0 {
columnar_writer.record_numerical(i, column_name, i as u64);
}
}
Card::Dense => {
if i % 12 == 0 {
columnar_writer.record_numerical(i, column_name, i as u64);
}
}
Card::Full => {
columnar_writer.record_numerical(i, column_name, i as u64);
}
Card::Multi => {
columnar_writer.record_numerical(i, column_name, i as u64);
columnar_writer.record_numerical(i, column_name, i as u64);
}
}
}

let mut wrt: Vec<u8> = Vec::new();
columnar_writer.serialize(num_docs, &mut wrt).unwrap();
ColumnarReader::open(wrt).unwrap()
}
Binary file modified columnar/compat_tests_data/v1.columnar
Binary file not shown.
Binary file added columnar/compat_tests_data/v2.columnar
Binary file not shown.
2 changes: 1 addition & 1 deletion columnar/src/column/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -136,7 +136,7 @@ impl<T: PartialOrd + Copy + Debug + Send + Sync + 'static> Column<T> {
.map(|value_row_id: RowId| self.values.get_val(value_row_id))
}

/// Get the docids of values which are in the provided value range.
/// Get the docids of values which are in the provided value and docid range.
#[inline]
pub fn get_docids_for_value_range(
&self,
Expand Down
27 changes: 17 additions & 10 deletions columnar/src/column/serialize.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ use crate::column_values::{
CodecType, MonotonicallyMappableToU128, MonotonicallyMappableToU64,
};
use crate::iterable::Iterable;
use crate::StrColumn;
use crate::{StrColumn, Version};

pub fn serialize_column_mappable_to_u128<T: MonotonicallyMappableToU128>(
column_index: SerializableColumnIndex<'_>,
Expand Down Expand Up @@ -40,7 +40,10 @@ pub fn serialize_column_mappable_to_u64<T: MonotonicallyMappableToU64>(
Ok(())
}

pub fn open_column_u64<T: MonotonicallyMappableToU64>(bytes: OwnedBytes) -> io::Result<Column<T>> {
pub fn open_column_u64<T: MonotonicallyMappableToU64>(
bytes: OwnedBytes,
format_version: Version,
) -> io::Result<Column<T>> {
let (body, column_index_num_bytes_payload) = bytes.rsplit(4);
let column_index_num_bytes = u32::from_le_bytes(
column_index_num_bytes_payload
Expand All @@ -49,7 +52,7 @@ pub fn open_column_u64<T: MonotonicallyMappableToU64>(bytes: OwnedBytes) -> io::
.unwrap(),
);
let (column_index_data, column_values_data) = body.split(column_index_num_bytes as usize);
let column_index = crate::column_index::open_column_index(column_index_data)?;
let column_index = crate::column_index::open_column_index(column_index_data, format_version)?;
let column_values = load_u64_based_column_values(column_values_data)?;
Ok(Column {
index: column_index,
Expand All @@ -59,6 +62,7 @@ pub fn open_column_u64<T: MonotonicallyMappableToU64>(bytes: OwnedBytes) -> io::

pub fn open_column_u128<T: MonotonicallyMappableToU128>(
bytes: OwnedBytes,
format_version: Version,
) -> io::Result<Column<T>> {
let (body, column_index_num_bytes_payload) = bytes.rsplit(4);
let column_index_num_bytes = u32::from_le_bytes(
Expand All @@ -68,7 +72,7 @@ pub fn open_column_u128<T: MonotonicallyMappableToU128>(
.unwrap(),
);
let (column_index_data, column_values_data) = body.split(column_index_num_bytes as usize);
let column_index = crate::column_index::open_column_index(column_index_data)?;
let column_index = crate::column_index::open_column_index(column_index_data, format_version)?;
let column_values = crate::column_values::open_u128_mapped(column_values_data)?;
Ok(Column {
index: column_index,
Expand All @@ -79,7 +83,10 @@ pub fn open_column_u128<T: MonotonicallyMappableToU128>(
/// Open the column as u64.
///
/// See [`open_u128_as_compact_u64`] for more details.
pub fn open_column_u128_as_compact_u64(bytes: OwnedBytes) -> io::Result<Column<u64>> {
pub fn open_column_u128_as_compact_u64(
bytes: OwnedBytes,
format_version: Version,
) -> io::Result<Column<u64>> {
let (body, column_index_num_bytes_payload) = bytes.rsplit(4);
let column_index_num_bytes = u32::from_le_bytes(
column_index_num_bytes_payload
Expand All @@ -88,27 +95,27 @@ pub fn open_column_u128_as_compact_u64(bytes: OwnedBytes) -> io::Result<Column<u
.unwrap(),
);
let (column_index_data, column_values_data) = body.split(column_index_num_bytes as usize);
let column_index = crate::column_index::open_column_index(column_index_data)?;
let column_index = crate::column_index::open_column_index(column_index_data, format_version)?;
let column_values = crate::column_values::open_u128_as_compact_u64(column_values_data)?;
Ok(Column {
index: column_index,
values: column_values,
})
}

pub fn open_column_bytes(data: OwnedBytes) -> io::Result<BytesColumn> {
pub fn open_column_bytes(data: OwnedBytes, format_version: Version) -> io::Result<BytesColumn> {
let (body, dictionary_len_bytes) = data.rsplit(4);
let dictionary_len = u32::from_le_bytes(dictionary_len_bytes.as_slice().try_into().unwrap());
let (dictionary_bytes, column_bytes) = body.split(dictionary_len as usize);
let dictionary = Arc::new(Dictionary::from_bytes(dictionary_bytes)?);
let term_ord_column = crate::column::open_column_u64::<u64>(column_bytes)?;
let term_ord_column = crate::column::open_column_u64::<u64>(column_bytes, format_version)?;
Ok(BytesColumn {
dictionary,
term_ord_column,
})
}

pub fn open_column_str(data: OwnedBytes) -> io::Result<StrColumn> {
let bytes_column = open_column_bytes(data)?;
pub fn open_column_str(data: OwnedBytes, format_version: Version) -> io::Result<StrColumn> {
let bytes_column = open_column_bytes(data, format_version)?;
Ok(StrColumn::wrap(bytes_column))
}
19 changes: 16 additions & 3 deletions columnar/src/column_index/merge/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -95,8 +95,12 @@ pub fn merge_column_index<'a>(

#[cfg(test)]
mod tests {
use common::OwnedBytes;

use crate::column_index::merge::detect_cardinality;
use crate::column_index::multivalued_index::MultiValueIndex;
use crate::column_index::multivalued_index::{
open_multivalued_index, serialize_multivalued_index, MultiValueIndex,
};
use crate::column_index::{merge_column_index, OptionalIndex, SerializableColumnIndex};
use crate::{
Cardinality, ColumnIndex, MergeRowOrder, RowAddr, RowId, ShuffleMergeOrder, StackMergeOrder,
Expand Down Expand Up @@ -171,7 +175,11 @@ mod tests {
let SerializableColumnIndex::Multivalued(start_index_iterable) = merged_column_index else {
panic!("Excpected a multivalued index")
};
let start_indexes: Vec<RowId> = start_index_iterable.boxed_iter().collect();
let mut output = Vec::new();
serialize_multivalued_index(&start_index_iterable, &mut output).unwrap();
let multivalue =
open_multivalued_index(OwnedBytes::new(output), crate::Version::V2).unwrap();
let start_indexes: Vec<RowId> = multivalue.get_start_index_column().iter().collect();
assert_eq!(&start_indexes, &[0, 3, 5]);
}

Expand Down Expand Up @@ -200,11 +208,16 @@ mod tests {
],
)
.into();

let merged_column_index = merge_column_index(&column_indexes[..], &merge_row_order);
let SerializableColumnIndex::Multivalued(start_index_iterable) = merged_column_index else {
panic!("Excpected a multivalued index")
};
let start_indexes: Vec<RowId> = start_index_iterable.boxed_iter().collect();
let mut output = Vec::new();
serialize_multivalued_index(&start_index_iterable, &mut output).unwrap();
let multivalue =
open_multivalued_index(OwnedBytes::new(output), crate::Version::V2).unwrap();
let start_indexes: Vec<RowId> = multivalue.get_start_index_column().iter().collect();
assert_eq!(&start_indexes, &[0, 3, 5, 6]);
}
}
Loading
Loading