From 72f61ff89c3655e255a0f6b7e3bba1ce9db6baa8 Mon Sep 17 00:00:00 2001 From: PSeitz Date: Thu, 13 Jun 2024 16:51:53 +0900 Subject: [PATCH] remove index sorting (#2434) closes https://github.com/quickwit-oss/tantivy/issues/2352 --- columnar/benches/bench_access.rs | 2 +- columnar/benches/bench_first_vals.rs | 2 +- columnar/benches/bench_merge.rs | 2 +- .../src/column_index/optional_index/tests.rs | 4 +- columnar/src/columnar/merge/merge_mapping.rs | 1 + columnar/src/columnar/merge/tests.rs | 14 +- columnar/src/columnar/reader/mod.rs | 4 +- .../src/columnar/writer/column_writers.rs | 29 +- columnar/src/columnar/writer/mod.rs | 105 +--- columnar/src/tests.rs | 126 +--- doc/src/index_sorting.md | 5 + src/fastfield/mod.rs | 24 +- src/fastfield/writer.rs | 24 +- src/fieldnorm/mod.rs | 2 +- src/fieldnorm/writer.rs | 14 +- src/functional_test.rs | 67 +- src/index/index.rs | 28 +- src/index/index_meta.rs | 52 +- src/index/mod.rs | 2 +- src/indexer/doc_id_mapping.rs | 538 +--------------- src/indexer/index_writer.rs | 375 +----------- src/indexer/merge_index_test.rs | 147 +++++ src/indexer/merger.rs | 326 +--------- src/indexer/merger_sorted_index_test.rs | 579 ------------------ src/indexer/mod.rs | 2 +- src/indexer/segment_serializer.rs | 26 +- src/indexer/segment_updater.rs | 17 +- src/indexer/segment_writer.rs | 65 +- src/lib.rs | 4 +- src/postings/json_postings_writer.rs | 4 - src/postings/postings_writer.rs | 10 +- src/postings/recorder.rs | 66 +- 32 files changed, 293 insertions(+), 2373 deletions(-) create mode 100644 src/indexer/merge_index_test.rs delete mode 100644 src/indexer/merger_sorted_index_test.rs diff --git a/columnar/benches/bench_access.rs b/columnar/benches/bench_access.rs index b4ce5fe0cc..debeb5f7b4 100644 --- a/columnar/benches/bench_access.rs +++ b/columnar/benches/bench_access.rs @@ -61,7 +61,7 @@ fn generate_columnar(card: Card, num_docs: u32) -> Column<u64> { } let mut wrt: Vec<u8> = Vec::new(); - columnar_writer.serialize(num_docs, None, &mut wrt).unwrap(); + columnar_writer.serialize(num_docs, &mut wrt).unwrap(); let reader = ColumnarReader::open(wrt).unwrap(); reader.read_columns("price").unwrap()[0] diff --git a/columnar/benches/bench_first_vals.rs b/columnar/benches/bench_first_vals.rs index b7bc49dc7e..bfc9431762 100644 --- a/columnar/benches/bench_first_vals.rs +++ b/columnar/benches/bench_first_vals.rs @@ -31,7 +31,7 @@ fn get_test_columns() -> Columns { } let mut buffer: Vec<u8> = Vec::new(); dataframe_writer - .serialize(data.len() as u32, None, &mut buffer) + .serialize(data.len() as u32, &mut buffer) .unwrap(); let columnar = ColumnarReader::open(buffer).unwrap(); diff --git a/columnar/benches/bench_merge.rs b/columnar/benches/bench_merge.rs index 89586e504f..7f67bcba2b 100644 --- a/columnar/benches/bench_merge.rs +++ b/columnar/benches/bench_merge.rs @@ -50,7 +50,7 @@ fn generate_columnar(card: Card, num_docs: u32) -> ColumnarReader { } let mut wrt: Vec<u8> = Vec::new(); - columnar_writer.serialize(num_docs, None, &mut wrt).unwrap(); + columnar_writer.serialize(num_docs, &mut wrt).unwrap(); ColumnarReader::open(wrt).unwrap() } diff --git a/columnar/src/column_index/optional_index/tests.rs b/columnar/src/column_index/optional_index/tests.rs index d25f267c2e..e24b42194f 100644 --- a/columnar/src/column_index/optional_index/tests.rs +++ b/columnar/src/column_index/optional_index/tests.rs @@ -15,9 +15,7 @@ fn test_optional_index_with_num_docs(num_docs: u32) { let mut dataframe_writer = ColumnarWriter::default(); dataframe_writer.record_numerical(100, "score", 80i64); let mut
buffer: Vec<u8> = Vec::new(); - dataframe_writer - .serialize(num_docs, None, &mut buffer) - .unwrap(); + dataframe_writer.serialize(num_docs, &mut buffer).unwrap(); let columnar = ColumnarReader::open(buffer).unwrap(); assert_eq!(columnar.num_columns(), 1); let cols: Vec<DynamicColumn> = columnar.read_columns("score").unwrap(); diff --git a/columnar/src/columnar/merge/merge_mapping.rs b/columnar/src/columnar/merge/merge_mapping.rs index 8428861821..078ed44bb9 100644 --- a/columnar/src/columnar/merge/merge_mapping.rs +++ b/columnar/src/columnar/merge/merge_mapping.rs @@ -59,6 +59,7 @@ pub enum MergeRowOrder { Stack(StackMergeOrder), /// Some more complex mapping, that may interleaves rows from the different readers and /// drop rows, or do both. + /// TODO: remove ordering part here Shuffled(ShuffleMergeOrder), } diff --git a/columnar/src/columnar/merge/tests.rs b/columnar/src/columnar/merge/tests.rs index 32f29bccd7..697fe3d246 100644 --- a/columnar/src/columnar/merge/tests.rs +++ b/columnar/src/columnar/merge/tests.rs @@ -12,7 +12,7 @@ fn make_columnar<T: Into<NumericalValue> + HasAssociatedColumnType + Copy>( } let mut buffer: Vec<u8> = Vec::new(); dataframe_writer - .serialize(vals.len() as RowId, None, &mut buffer) + .serialize(vals.len() as RowId, &mut buffer) .unwrap(); ColumnarReader::open(buffer).unwrap() } @@ -157,9 +157,7 @@ fn make_numerical_columnar_multiple_columns( .max() .unwrap_or(0u32); let mut buffer: Vec<u8> = Vec::new(); - dataframe_writer - .serialize(num_rows, None, &mut buffer) - .unwrap(); + dataframe_writer.serialize(num_rows, &mut buffer).unwrap(); ColumnarReader::open(buffer).unwrap() } @@ -182,9 +180,7 @@ fn make_byte_columnar_multiple_columns( } } let mut buffer: Vec<u8> = Vec::new(); - dataframe_writer - .serialize(num_rows, None, &mut buffer) - .unwrap(); + dataframe_writer.serialize(num_rows, &mut buffer).unwrap(); ColumnarReader::open(buffer).unwrap() } @@ -203,9 +199,7 @@ fn make_text_columnar_multiple_columns(columns: &[(&str, &[&[&str]])]) -> Column .max() .unwrap_or(0u32); let mut buffer: Vec<u8> = Vec::new(); - dataframe_writer - .serialize(num_rows, None, &mut buffer) - .unwrap(); + dataframe_writer.serialize(num_rows, &mut buffer).unwrap(); ColumnarReader::open(buffer).unwrap() } diff --git a/columnar/src/columnar/reader/mod.rs b/columnar/src/columnar/reader/mod.rs index 174cd36eec..23af3f0eea 100644 --- a/columnar/src/columnar/reader/mod.rs +++ b/columnar/src/columnar/reader/mod.rs @@ -195,7 +195,7 @@ mod tests { columnar_writer.record_column_type("col1", ColumnType::Str, false); columnar_writer.record_column_type("col2", ColumnType::U64, false); let mut buffer = Vec::new(); - columnar_writer.serialize(1, None, &mut buffer).unwrap(); + columnar_writer.serialize(1, &mut buffer).unwrap(); let columnar = ColumnarReader::open(buffer).unwrap(); let columns = columnar.list_columns().unwrap(); assert_eq!(columns.len(), 2); @@ -211,7 +211,7 @@ mod tests { columnar_writer.record_column_type("count", ColumnType::U64, false); columnar_writer.record_numerical(1, "count", 1u64); let mut buffer = Vec::new(); - columnar_writer.serialize(2, None, &mut buffer).unwrap(); + columnar_writer.serialize(2, &mut buffer).unwrap(); let columnar = ColumnarReader::open(buffer).unwrap(); let columns = columnar.list_columns().unwrap(); assert_eq!(columns.len(), 1); diff --git a/columnar/src/columnar/writer/column_writers.rs b/columnar/src/columnar/writer/column_writers.rs index e56cf2e9f2..e26e1ee2d8 100644 --- a/columnar/src/columnar/writer/column_writers.rs +++ b/columnar/src/columnar/writer/column_writers.rs @@ -41,31 +41,10 @@ impl
ColumnWriter { pub(super) fn operation_iterator<'a, V: SymbolValue>( &self, arena: &MemoryArena, - old_to_new_ids_opt: Option<&[RowId]>, buffer: &'a mut Vec<u8>, ) -> impl Iterator<Item = ColumnOperation<V>> + 'a { buffer.clear(); self.values.read_to_end(arena, buffer); - if let Some(old_to_new_ids) = old_to_new_ids_opt { - // TODO avoid the extra deserialization / serialization. - let mut sorted_ops: Vec<(RowId, ColumnOperation<V>)> = Vec::new(); - let mut new_doc = 0u32; - let mut cursor = &buffer[..]; - for op in std::iter::from_fn(|| ColumnOperation::<V>::deserialize(&mut cursor)) { - if let ColumnOperation::NewDoc(doc) = &op { - new_doc = old_to_new_ids[*doc as usize]; - sorted_ops.push((new_doc, ColumnOperation::NewDoc(new_doc))); - } else { - sorted_ops.push((new_doc, op)); - } - } - // stable sort is crucial here. - sorted_ops.sort_by_key(|(new_doc_id, _)| *new_doc_id); - buffer.clear(); - for (_, op) in sorted_ops { - buffer.extend_from_slice(op.serialize().as_ref()); - } - } let mut cursor: &[u8] = &buffer[..]; std::iter::from_fn(move || ColumnOperation::deserialize(&mut cursor)) } @@ -231,11 +210,9 @@ impl NumericalColumnWriter { pub(super) fn operation_iterator<'a>( self, arena: &MemoryArena, - old_to_new_ids: Option<&[RowId]>, buffer: &'a mut Vec<u8>, ) -> impl Iterator<Item = ColumnOperation<NumericalValue>> + 'a { - self.column_writer - .operation_iterator(arena, old_to_new_ids, buffer) + self.column_writer.operation_iterator(arena, buffer) } } @@ -277,11 +254,9 @@ impl StrOrBytesColumnWriter { pub(super) fn operation_iterator<'a>( &self, arena: &MemoryArena, - old_to_new_ids: Option<&[RowId]>, byte_buffer: &'a mut Vec<u8>, ) -> impl Iterator<Item = ColumnOperation<UnorderedId>> + 'a { - self.column_writer - .operation_iterator(arena, old_to_new_ids, byte_buffer) + self.column_writer.operation_iterator(arena, byte_buffer) } } diff --git a/columnar/src/columnar/writer/mod.rs b/columnar/src/columnar/writer/mod.rs index 1fbc9d85de..d5fda430c7 100644 --- a/columnar/src/columnar/writer/mod.rs +++ b/columnar/src/columnar/writer/mod.rs @@ -43,7 +43,7 @@ struct SpareBuffers { /// columnar_writer.record_str(1u32 /* doc id */, "product_name", "Apple"); /// columnar_writer.record_numerical(0u32 /* doc id */, "price", 10.5f64); //< uh oh we ended up mixing integer and floats. /// let mut wrt: Vec<u8> = Vec::new(); -/// columnar_writer.serialize(2u32, None, &mut wrt).unwrap(); +/// columnar_writer.serialize(2u32, &mut wrt).unwrap(); /// ``` #[derive(Default)] pub struct ColumnarWriter { @@ -75,63 +75,6 @@ impl ColumnarWriter { .sum::<usize>() } - /// Returns the list of doc ids from 0..num_docs sorted by the `sort_field` - /// column. - /// - /// If the column is multivalued, use the first value for scoring. - /// If no value is associated to a specific row, the document is assigned - /// the lowest possible score. - /// - /// The sort applied is stable.
- pub fn sort_order(&self, sort_field: &str, num_docs: RowId, reversed: bool) -> Vec<RowId> { - let Some(numerical_col_writer) = self - .numerical_field_hash_map - .get::<NumericalColumnWriter>(sort_field.as_bytes()) - .or_else(|| { - self.datetime_field_hash_map - .get::<NumericalColumnWriter>(sort_field.as_bytes()) - }) - else { - return Vec::new(); - }; - let mut symbols_buffer = Vec::new(); - let mut values = Vec::new(); - let mut start_doc_check_fill = 0; - let mut current_doc_opt: Option<RowId> = None; - // Assumption: NewDoc will never call the same doc twice and is strictly increasing between - // calls - for op in numerical_col_writer.operation_iterator(&self.arena, None, &mut symbols_buffer) { - match op { - ColumnOperation::NewDoc(doc) => { - current_doc_opt = Some(doc); - } - ColumnOperation::Value(numerical_value) => { - if let Some(current_doc) = current_doc_opt { - // Fill up with 0.0 since last doc - values.extend((start_doc_check_fill..current_doc).map(|doc| (0.0, doc))); - start_doc_check_fill = current_doc + 1; - // handle multi values - current_doc_opt = None; - - let score: f32 = f64::coerce(numerical_value) as f32; - values.push((score, current_doc)); - } - } - } - } - for doc in values.len() as u32..num_docs { - values.push((0.0f32, doc)); - } - values.sort_by(|(left_score, _), (right_score, _)| { - if reversed { - right_score.total_cmp(left_score) - } else { - left_score.total_cmp(right_score) - } - }); - values.into_iter().map(|(_score, doc)| doc).collect() - } - /// Records a column type. This is useful to bypass the coercion process, /// makes sure the empty is present in the resulting columnar, or set /// the `sort_values_within_row`. @@ -302,12 +245,7 @@ impl ColumnarWriter { }, ); } - pub fn serialize( - &mut self, - num_docs: RowId, - old_to_new_row_ids: Option<&[RowId]>, - wrt: &mut dyn io::Write, - ) -> io::Result<()> { + pub fn serialize(&mut self, num_docs: RowId, wrt: &mut dyn io::Write) -> io::Result<()> { let mut serializer = ColumnarSerializer::new(wrt); let mut columns: Vec<(&[u8], ColumnType, Addr)> = self .numerical_field_hash_map @@ -358,11 +296,7 @@ impl ColumnarWriter { serialize_bool_column( cardinality, num_docs, - column_writer.operation_iterator( - arena, - old_to_new_row_ids, - &mut symbol_byte_buffer, - ), + column_writer.operation_iterator(arena, &mut symbol_byte_buffer), buffers, &mut column_serializer, )?; @@ -376,11 +310,7 @@ impl ColumnarWriter { serialize_ip_addr_column( cardinality, num_docs, - column_writer.operation_iterator( - arena, - old_to_new_row_ids, - &mut symbol_byte_buffer, - ), + column_writer.operation_iterator(arena, &mut symbol_byte_buffer), buffers, &mut column_serializer, )?; @@ -405,11 +335,8 @@ impl ColumnarWriter { num_docs, str_or_bytes_column_writer.sort_values_within_row, dictionary_builder, - str_or_bytes_column_writer.operation_iterator( - arena, - old_to_new_row_ids, - &mut symbol_byte_buffer, - ), + str_or_bytes_column_writer + .operation_iterator(arena, &mut symbol_byte_buffer), buffers, &self.arena, &mut column_serializer, @@ -427,11 +354,7 @@ impl ColumnarWriter { cardinality, num_docs, numerical_type, - numerical_column_writer.operation_iterator( - arena, - old_to_new_row_ids, - &mut symbol_byte_buffer, - ), + numerical_column_writer.operation_iterator(arena, &mut symbol_byte_buffer), buffers, &mut column_serializer, )?; @@ -446,11 +369,7 @@ impl ColumnarWriter { cardinality, num_docs, NumericalType::I64, - column_writer.operation_iterator( - arena, - old_to_new_row_ids, - &mut symbol_byte_buffer, - ), + column_writer.operation_iterator(arena, &mut
symbol_byte_buffer), buffers, &mut column_serializer, )?; @@ -757,7 +676,7 @@ mod tests { assert_eq!(column_writer.get_cardinality(3), Cardinality::Full); let mut buffer = Vec::new(); let symbols: Vec<ColumnOperation<NumericalValue>> = column_writer - .operation_iterator(&arena, None, &mut buffer) + .operation_iterator(&arena, &mut buffer) .collect(); assert_eq!(symbols.len(), 6); assert!(matches!(symbols[0], ColumnOperation::NewDoc(0u32))); @@ -786,7 +705,7 @@ mod tests { assert_eq!(column_writer.get_cardinality(3), Cardinality::Optional); let mut buffer = Vec::new(); let symbols: Vec<ColumnOperation<NumericalValue>> = column_writer - .operation_iterator(&arena, None, &mut buffer) + .operation_iterator(&arena, &mut buffer) .collect(); assert_eq!(symbols.len(), 4); assert!(matches!(symbols[0], ColumnOperation::NewDoc(1u32))); @@ -809,7 +728,7 @@ mod tests { assert_eq!(column_writer.get_cardinality(2), Cardinality::Optional); let mut buffer = Vec::new(); let symbols: Vec<ColumnOperation<NumericalValue>> = column_writer - .operation_iterator(&arena, None, &mut buffer) + .operation_iterator(&arena, &mut buffer) .collect(); assert_eq!(symbols.len(), 2); assert!(matches!(symbols[0], ColumnOperation::NewDoc(0u32))); @@ -828,7 +747,7 @@ mod tests { assert_eq!(column_writer.get_cardinality(1), Cardinality::Multivalued); let mut buffer = Vec::new(); let symbols: Vec<ColumnOperation<NumericalValue>> = column_writer - .operation_iterator(&arena, None, &mut buffer) + .operation_iterator(&arena, &mut buffer) .collect(); assert_eq!(symbols.len(), 3); assert!(matches!(symbols[0], ColumnOperation::NewDoc(0u32))); diff --git a/columnar/src/tests.rs b/columnar/src/tests.rs index 5e5c50f556..efdb9d050a 100644 --- a/columnar/src/tests.rs +++ b/columnar/src/tests.rs @@ -21,7 +21,7 @@ fn test_dataframe_writer_str() { dataframe_writer.record_str(1u32, "my_string", "hello"); dataframe_writer.record_str(3u32, "my_string", "helloeee"); let mut buffer: Vec<u8> = Vec::new(); - dataframe_writer.serialize(5, None, &mut buffer).unwrap(); + dataframe_writer.serialize(5, &mut buffer).unwrap(); let columnar = ColumnarReader::open(buffer).unwrap(); assert_eq!(columnar.num_columns(), 1); let cols: Vec<DynamicColumn> = columnar.read_columns("my_string").unwrap(); @@ -35,7 +35,7 @@ fn test_dataframe_writer_bytes() { dataframe_writer.record_bytes(1u32, "my_string", b"hello"); dataframe_writer.record_bytes(3u32, "my_string", b"helloeee"); let mut buffer: Vec<u8> = Vec::new(); - dataframe_writer.serialize(5, None, &mut buffer).unwrap(); + dataframe_writer.serialize(5, &mut buffer).unwrap(); let columnar = ColumnarReader::open(buffer).unwrap(); assert_eq!(columnar.num_columns(), 1); let cols: Vec<DynamicColumn> = columnar.read_columns("my_string").unwrap(); @@ -49,7 +49,7 @@ fn test_dataframe_writer_bool() { dataframe_writer.record_bool(1u32, "bool.value", false); dataframe_writer.record_bool(3u32, "bool.value", true); let mut buffer: Vec<u8> = Vec::new(); - dataframe_writer.serialize(5, None, &mut buffer).unwrap(); + dataframe_writer.serialize(5, &mut buffer).unwrap(); let columnar = ColumnarReader::open(buffer).unwrap(); assert_eq!(columnar.num_columns(), 1); let cols: Vec<DynamicColumn> = columnar.read_columns("bool.value").unwrap(); @@ -74,7 +74,7 @@ fn test_dataframe_writer_u64_multivalued() { dataframe_writer.record_numerical(6u32, "divisor", 2u64); dataframe_writer.record_numerical(6u32, "divisor", 3u64); let mut buffer: Vec<u8> = Vec::new(); - dataframe_writer.serialize(7, None, &mut buffer).unwrap(); + dataframe_writer.serialize(7, &mut buffer).unwrap(); let columnar = ColumnarReader::open(buffer).unwrap(); assert_eq!(columnar.num_columns(), 1); let cols: Vec<DynamicColumn> = columnar.read_columns("divisor").unwrap(); @@
-97,7 +97,7 @@ fn test_dataframe_writer_ip_addr() { dataframe_writer.record_ip_addr(1, "ip_addr", Ipv6Addr::from_u128(1001)); dataframe_writer.record_ip_addr(3, "ip_addr", Ipv6Addr::from_u128(1050)); let mut buffer: Vec = Vec::new(); - dataframe_writer.serialize(5, None, &mut buffer).unwrap(); + dataframe_writer.serialize(5, &mut buffer).unwrap(); let columnar = ColumnarReader::open(buffer).unwrap(); assert_eq!(columnar.num_columns(), 1); let cols: Vec = columnar.read_columns("ip_addr").unwrap(); @@ -128,7 +128,7 @@ fn test_dataframe_writer_numerical() { dataframe_writer.record_numerical(2u32, "srical.value", NumericalValue::U64(13u64)); dataframe_writer.record_numerical(4u32, "srical.value", NumericalValue::U64(15u64)); let mut buffer: Vec = Vec::new(); - dataframe_writer.serialize(6, None, &mut buffer).unwrap(); + dataframe_writer.serialize(6, &mut buffer).unwrap(); let columnar = ColumnarReader::open(buffer).unwrap(); assert_eq!(columnar.num_columns(), 1); let cols: Vec = columnar.read_columns("srical.value").unwrap(); @@ -153,46 +153,6 @@ fn test_dataframe_writer_numerical() { assert_eq!(column_i64.first(6), None); //< we can change the spec for that one. } -#[test] -fn test_dataframe_sort_by_full() { - let mut dataframe_writer = ColumnarWriter::default(); - dataframe_writer.record_numerical(0u32, "value", NumericalValue::U64(1)); - dataframe_writer.record_numerical(1u32, "value", NumericalValue::U64(2)); - let data = dataframe_writer.sort_order("value", 2, false); - assert_eq!(data, vec![0, 1]); -} - -#[test] -fn test_dataframe_sort_by_opt() { - let mut dataframe_writer = ColumnarWriter::default(); - dataframe_writer.record_numerical(1u32, "value", NumericalValue::U64(3)); - dataframe_writer.record_numerical(3u32, "value", NumericalValue::U64(2)); - let data = dataframe_writer.sort_order("value", 5, false); - // 0, 2, 4 is 0.0 - assert_eq!(data, vec![0, 2, 4, 3, 1]); - let data = dataframe_writer.sort_order("value", 5, true); - assert_eq!( - data, - vec![4, 2, 0, 3, 1].into_iter().rev().collect::>() - ); -} - -#[test] -fn test_dataframe_sort_by_multi() { - let mut dataframe_writer = ColumnarWriter::default(); - // valid for sort - dataframe_writer.record_numerical(1u32, "value", NumericalValue::U64(2)); - // those are ignored for sort - dataframe_writer.record_numerical(1u32, "value", NumericalValue::U64(4)); - dataframe_writer.record_numerical(1u32, "value", NumericalValue::U64(4)); - // valid for sort - dataframe_writer.record_numerical(3u32, "value", NumericalValue::U64(3)); - // ignored, would change sort order - dataframe_writer.record_numerical(3u32, "value", NumericalValue::U64(1)); - let data = dataframe_writer.sort_order("value", 4, false); - assert_eq!(data, vec![0, 2, 1, 3]); -} - #[test] fn test_dictionary_encoded_str() { let mut buffer = Vec::new(); @@ -201,7 +161,7 @@ fn test_dictionary_encoded_str() { columnar_writer.record_str(3, "my.column", "c"); columnar_writer.record_str(3, "my.column2", "different_column!"); columnar_writer.record_str(4, "my.column", "b"); - columnar_writer.serialize(5, None, &mut buffer).unwrap(); + columnar_writer.serialize(5, &mut buffer).unwrap(); let columnar_reader = ColumnarReader::open(buffer).unwrap(); assert_eq!(columnar_reader.num_columns(), 2); let col_handles = columnar_reader.read_columns("my.column").unwrap(); @@ -235,7 +195,7 @@ fn test_dictionary_encoded_bytes() { columnar_writer.record_bytes(3, "my.column", b"c"); columnar_writer.record_bytes(3, "my.column2", b"different_column!"); columnar_writer.record_bytes(4, "my.column", 
b"b"); - columnar_writer.serialize(5, None, &mut buffer).unwrap(); + columnar_writer.serialize(5, &mut buffer).unwrap(); let columnar_reader = ColumnarReader::open(buffer).unwrap(); assert_eq!(columnar_reader.num_columns(), 2); let col_handles = columnar_reader.read_columns("my.column").unwrap(); @@ -369,26 +329,12 @@ fn columnar_docs_strategy() -> impl Strategy impl Strategy>, Vec)> { - columnar_docs_strategy().prop_flat_map(|docs| { - permutation_strategy(docs.len()).prop_map(move |permutation| (docs.clone(), permutation)) - }) -} - -fn permutation_strategy(n: usize) -> impl Strategy> { - Just((0u32..n as RowId).collect()).prop_shuffle() -} - fn permutation_and_subset_strategy(n: usize) -> impl Strategy> { let vals: Vec = (0..n).collect(); subsequence(vals, 0..=n).prop_shuffle() } -fn build_columnar_with_mapping( - docs: &[Vec<(&'static str, ColumnValue)>], - old_to_new_row_ids_opt: Option<&[RowId]>, -) -> ColumnarReader { +fn build_columnar_with_mapping(docs: &[Vec<(&'static str, ColumnValue)>]) -> ColumnarReader { let num_docs = docs.len() as u32; let mut buffer = Vec::new(); let mut columnar_writer = ColumnarWriter::default(); @@ -416,15 +362,13 @@ fn build_columnar_with_mapping( } } } - columnar_writer - .serialize(num_docs, old_to_new_row_ids_opt, &mut buffer) - .unwrap(); + columnar_writer.serialize(num_docs, &mut buffer).unwrap(); ColumnarReader::open(buffer).unwrap() } fn build_columnar(docs: &[Vec<(&'static str, ColumnValue)>]) -> ColumnarReader { - build_columnar_with_mapping(docs, None) + build_columnar_with_mapping(docs) } fn assert_columnar_eq_strict(left: &ColumnarReader, right: &ColumnarReader) { @@ -683,54 +627,6 @@ proptest! { } } -// Same as `test_single_columnar_builder_proptest` but with a shuffling mapping. -proptest! { - #![proptest_config(ProptestConfig::with_cases(500))] - #[test] - fn test_single_columnar_builder_with_shuffle_proptest((docs, mapping) in columnar_docs_and_mapping_strategy()) { - let columnar = build_columnar_with_mapping(&docs[..], Some(&mapping)); - assert_eq!(columnar.num_rows() as usize, docs.len()); - let mut expected_columns: HashMap<(&str, ColumnTypeCategory), HashMap> > = Default::default(); - for (doc_id, doc_vals) in docs.iter().enumerate() { - for (col_name, col_val) in doc_vals { - expected_columns - .entry((col_name, col_val.column_type_category())) - .or_default() - .entry(mapping[doc_id]) - .or_default() - .push(col_val); - } - } - let column_list = columnar.list_columns().unwrap(); - assert_eq!(expected_columns.len(), column_list.len()); - for (column_name, column) in column_list { - let dynamic_column = column.open().unwrap(); - let col_category: ColumnTypeCategory = dynamic_column.column_type().into(); - let expected_col_values: &HashMap> = expected_columns.get(&(column_name.as_str(), col_category)).unwrap(); - for _doc_id in 0..columnar.num_rows() { - match &dynamic_column { - DynamicColumn::Bool(col) => - assert_column_values(col, expected_col_values), - DynamicColumn::I64(col) => - assert_column_values(col, expected_col_values), - DynamicColumn::U64(col) => - assert_column_values(col, expected_col_values), - DynamicColumn::F64(col) => - assert_column_values(col, expected_col_values), - DynamicColumn::IpAddr(col) => - assert_column_values(col, expected_col_values), - DynamicColumn::DateTime(col) => - assert_column_values(col, expected_col_values), - DynamicColumn::Bytes(col) => - assert_bytes_column_values(col, expected_col_values, false), - DynamicColumn::Str(col) => - assert_bytes_column_values(col, expected_col_values, true), 
- } - } - } - } -} - // This tests create 2 or 3 random small columnar and attempts to merge them. // It compares the resulting merged dataframe with what would have been obtained by building the // dataframe from the concatenated rows to begin with. diff --git a/doc/src/index_sorting.md b/doc/src/index_sorting.md index c2c430f89e..e2795af9e1 100644 --- a/doc/src/index_sorting.md +++ b/doc/src/index_sorting.md @@ -7,6 +7,11 @@ - [Other](#other) - [Usage](#usage) +# Index Sorting has been removed! +More infos here: + +https://github.com/quickwit-oss/tantivy/issues/2352 + # Index Sorting Tantivy allows you to sort the index according to a property. diff --git a/src/fastfield/mod.rs b/src/fastfield/mod.rs index e7ad2e413e..b4549cf08a 100644 --- a/src/fastfield/mod.rs +++ b/src/fastfield/mod.rs @@ -127,7 +127,7 @@ mod tests { fast_field_writers .add_document(&doc!(*FIELD=>2u64)) .unwrap(); - fast_field_writers.serialize(&mut write, None).unwrap(); + fast_field_writers.serialize(&mut write).unwrap(); write.terminate().unwrap(); } let file = directory.open_read(path).unwrap(); @@ -178,7 +178,7 @@ mod tests { fast_field_writers .add_document(&doc!(*FIELD=>215u64)) .unwrap(); - fast_field_writers.serialize(&mut write, None).unwrap(); + fast_field_writers.serialize(&mut write).unwrap(); write.terminate().unwrap(); } let file = directory.open_read(path).unwrap(); @@ -211,7 +211,7 @@ mod tests { .add_document(&doc!(*FIELD=>100_000u64)) .unwrap(); } - fast_field_writers.serialize(&mut write, None).unwrap(); + fast_field_writers.serialize(&mut write).unwrap(); write.terminate().unwrap(); } let file = directory.open_read(path).unwrap(); @@ -243,7 +243,7 @@ mod tests { .add_document(&doc!(*FIELD=>5_000_000_000_000_000_000u64 + doc_id)) .unwrap(); } - fast_field_writers.serialize(&mut write, None).unwrap(); + fast_field_writers.serialize(&mut write).unwrap(); write.terminate().unwrap(); } let file = directory.open_read(path).unwrap(); @@ -276,7 +276,7 @@ mod tests { doc.add_i64(i64_field, i); fast_field_writers.add_document(&doc).unwrap(); } - fast_field_writers.serialize(&mut write, None).unwrap(); + fast_field_writers.serialize(&mut write).unwrap(); write.terminate().unwrap(); } let file = directory.open_read(path).unwrap(); @@ -315,7 +315,7 @@ mod tests { let mut fast_field_writers = FastFieldsWriter::from_schema(&schema).unwrap(); let doc = TantivyDocument::default(); fast_field_writers.add_document(&doc).unwrap(); - fast_field_writers.serialize(&mut write, None).unwrap(); + fast_field_writers.serialize(&mut write).unwrap(); write.terminate().unwrap(); } @@ -348,7 +348,7 @@ mod tests { let mut fast_field_writers = FastFieldsWriter::from_schema(&schema).unwrap(); let doc = TantivyDocument::default(); fast_field_writers.add_document(&doc).unwrap(); - fast_field_writers.serialize(&mut write, None).unwrap(); + fast_field_writers.serialize(&mut write).unwrap(); write.terminate().unwrap(); } @@ -385,7 +385,7 @@ mod tests { for &x in &permutation { fast_field_writers.add_document(&doc!(*FIELD=>x)).unwrap(); } - fast_field_writers.serialize(&mut write, None).unwrap(); + fast_field_writers.serialize(&mut write).unwrap(); write.terminate().unwrap(); } let file = directory.open_read(path).unwrap(); @@ -770,7 +770,7 @@ mod tests { fast_field_writers .add_document(&doc!(field=>false)) .unwrap(); - fast_field_writers.serialize(&mut write, None).unwrap(); + fast_field_writers.serialize(&mut write).unwrap(); write.terminate().unwrap(); } let file = directory.open_read(path).unwrap(); @@ -802,7 +802,7 @@ mod tests { 
.add_document(&doc!(field=>false)) .unwrap(); } - fast_field_writers.serialize(&mut write, None).unwrap(); + fast_field_writers.serialize(&mut write).unwrap(); write.terminate().unwrap(); } let file = directory.open_read(path).unwrap(); @@ -827,7 +827,7 @@ mod tests { let mut fast_field_writers = FastFieldsWriter::from_schema(&schema).unwrap(); let doc = TantivyDocument::default(); fast_field_writers.add_document(&doc).unwrap(); - fast_field_writers.serialize(&mut write, None).unwrap(); + fast_field_writers.serialize(&mut write).unwrap(); write.terminate().unwrap(); } let file = directory.open_read(path).unwrap(); @@ -855,7 +855,7 @@ mod tests { for doc in docs { fast_field_writers.add_document(doc).unwrap(); } - fast_field_writers.serialize(&mut write, None).unwrap(); + fast_field_writers.serialize(&mut write).unwrap(); write.terminate().unwrap(); } Ok(directory) diff --git a/src/fastfield/writer.rs b/src/fastfield/writer.rs index 2f4196078e..a1288e0ad1 100644 --- a/src/fastfield/writer.rs +++ b/src/fastfield/writer.rs @@ -4,7 +4,6 @@ use columnar::{ColumnarWriter, NumericalValue}; use common::{DateTimePrecision, JsonPathWriter}; use tokenizer_api::Token; -use crate::indexer::doc_id_mapping::DocIdMapping; use crate::schema::document::{Document, ReferenceValue, ReferenceValueLeaf, Value}; use crate::schema::{value_type_to_column_type, Field, FieldType, Schema, Type}; use crate::tokenizer::{TextAnalyzer, TokenizerManager}; @@ -106,16 +105,6 @@ impl FastFieldsWriter { self.columnar_writer.mem_usage() } - pub(crate) fn sort_order( - &self, - sort_field: &str, - num_docs: DocId, - reversed: bool, - ) -> Vec { - self.columnar_writer - .sort_order(sort_field, num_docs, reversed) - } - /// Indexes all of the fastfields of a new document. pub fn add_document(&mut self, doc: &D) -> crate::Result<()> { let doc_id = self.num_docs; @@ -233,16 +222,9 @@ impl FastFieldsWriter { /// Serializes all of the `FastFieldWriter`s by pushing them in /// order to the fast field serializer. 
- pub fn serialize( - mut self, - wrt: &mut dyn io::Write, - doc_id_map_opt: Option<&DocIdMapping>, - ) -> io::Result<()> { + pub fn serialize(mut self, wrt: &mut dyn io::Write) -> io::Result<()> { let num_docs = self.num_docs; - let old_to_new_row_ids = - doc_id_map_opt.map(|doc_id_mapping| doc_id_mapping.old_to_new_ids()); - self.columnar_writer - .serialize(num_docs, old_to_new_row_ids, wrt)?; + self.columnar_writer.serialize(num_docs, wrt)?; Ok(()) } } @@ -392,7 +374,7 @@ mod tests { } let mut buffer = Vec::new(); columnar_writer - .serialize(json_docs.len() as DocId, None, &mut buffer) + .serialize(json_docs.len() as DocId, &mut buffer) .unwrap(); ColumnarReader::open(buffer).unwrap() } diff --git a/src/fieldnorm/mod.rs b/src/fieldnorm/mod.rs index 75fcff1e37..32c957d1ed 100644 --- a/src/fieldnorm/mod.rs +++ b/src/fieldnorm/mod.rs @@ -77,7 +77,7 @@ mod tests { let mut fieldnorm_writers = FieldNormsWriter::for_schema(&SCHEMA); fieldnorm_writers.record(2u32, *TXT_FIELD, 5); fieldnorm_writers.record(3u32, *TXT_FIELD, 3); - fieldnorm_writers.serialize(serializer, None)?; + fieldnorm_writers.serialize(serializer)?; } let file = directory.open_read(path)?; { diff --git a/src/fieldnorm/writer.rs b/src/fieldnorm/writer.rs index 2437319583..39bd50fee8 100644 --- a/src/fieldnorm/writer.rs +++ b/src/fieldnorm/writer.rs @@ -2,7 +2,6 @@ use std::cmp::Ordering; use std::{io, iter}; use super::{fieldnorm_to_id, FieldNormsSerializer}; -use crate::indexer::doc_id_mapping::DocIdMapping; use crate::schema::{Field, Schema}; use crate::DocId; @@ -92,11 +91,7 @@ impl FieldNormsWriter { } /// Serialize the seen fieldnorm values to the serializer for all fields. - pub fn serialize( - &self, - mut fieldnorms_serializer: FieldNormsSerializer, - doc_id_map: Option<&DocIdMapping>, - ) -> io::Result<()> { + pub fn serialize(&self, mut fieldnorms_serializer: FieldNormsSerializer) -> io::Result<()> { for (field, fieldnorms_buffer) in self.fieldnorms_buffers.iter().enumerate().filter_map( |(field_id, fieldnorms_buffer_opt)| { fieldnorms_buffer_opt.as_ref().map(|fieldnorms_buffer| { @@ -104,12 +99,7 @@ impl FieldNormsWriter { }) }, ) { - if let Some(doc_id_map) = doc_id_map { - let remapped_fieldnorm_buffer = doc_id_map.remap(fieldnorms_buffer); - fieldnorms_serializer.serialize_field(field, &remapped_fieldnorm_buffer)?; - } else { - fieldnorms_serializer.serialize_field(field, fieldnorms_buffer)?; - } + fieldnorms_serializer.serialize_field(field, fieldnorms_buffer)?; } fieldnorms_serializer.close()?; Ok(()) diff --git a/src/functional_test.rs b/src/functional_test.rs index 9182adf799..2814d8e302 100644 --- a/src/functional_test.rs +++ b/src/functional_test.rs @@ -7,7 +7,7 @@ use rand::{thread_rng, Rng}; use crate::indexer::index_writer::MEMORY_BUDGET_NUM_BYTES_MIN; use crate::schema::*; #[allow(deprecated)] -use crate::{doc, schema, Index, IndexSettings, IndexSortByField, IndexWriter, Order, Searcher}; +use crate::{doc, schema, Index, IndexWriter, Searcher}; fn check_index_content(searcher: &Searcher, vals: &[u64]) -> crate::Result<()> { assert!(searcher.segment_readers().len() < 20); @@ -65,71 +65,6 @@ fn get_num_iterations() -> usize { .map(|str| str.parse().unwrap()) .unwrap_or(2000) } -#[test] -#[ignore] -fn test_functional_indexing_sorted() -> crate::Result<()> { - let mut schema_builder = Schema::builder(); - - let id_field = schema_builder.add_u64_field("id", INDEXED | FAST); - let multiples_field = schema_builder.add_u64_field("multiples", INDEXED); - let text_field_options = TextOptions::default() - 
.set_indexing_options( - TextFieldIndexing::default() - .set_index_option(schema::IndexRecordOption::WithFreqsAndPositions), - ) - .set_stored(); - let text_field = schema_builder.add_text_field("text_field", text_field_options); - let schema = schema_builder.build(); - - let mut index_builder = Index::builder().schema(schema); - index_builder = index_builder.settings(IndexSettings { - sort_by_field: Some(IndexSortByField { - field: "id".to_string(), - order: Order::Desc, - }), - ..Default::default() - }); - let index = index_builder.create_from_tempdir().unwrap(); - - let reader = index.reader()?; - - let mut rng = thread_rng(); - - let mut index_writer: IndexWriter = - index.writer_with_num_threads(3, 3 * MEMORY_BUDGET_NUM_BYTES_MIN)?; - - let mut committed_docs: HashSet = HashSet::new(); - let mut uncommitted_docs: HashSet = HashSet::new(); - - for _ in 0..get_num_iterations() { - let random_val = rng.gen_range(0..20); - if random_val == 0 { - index_writer.commit()?; - committed_docs.extend(&uncommitted_docs); - uncommitted_docs.clear(); - reader.reload()?; - let searcher = reader.searcher(); - // check that everything is correct. - check_index_content( - &searcher, - &committed_docs.iter().cloned().collect::>(), - )?; - } else if committed_docs.remove(&random_val) || uncommitted_docs.remove(&random_val) { - let doc_id_term = Term::from_field_u64(id_field, random_val); - index_writer.delete_term(doc_id_term); - } else { - uncommitted_docs.insert(random_val); - let mut doc = TantivyDocument::new(); - doc.add_u64(id_field, random_val); - for i in 1u64..10u64 { - doc.add_u64(multiples_field, random_val * i); - } - doc.add_text(text_field, get_text()); - index_writer.add_document(doc)?; - } - } - Ok(()) -} const LOREM: &str = "Doc Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod \ tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, \ diff --git a/src/index/index.rs b/src/index/index.rs index aab055409e..250181db10 100644 --- a/src/index/index.rs +++ b/src/index/index.rs @@ -20,7 +20,7 @@ use crate::indexer::segment_updater::save_metas; use crate::indexer::{IndexWriter, SingleSegmentIndexWriter}; use crate::reader::{IndexReader, IndexReaderBuilder}; use crate::schema::document::Document; -use crate::schema::{Field, FieldType, Schema, Type}; +use crate::schema::{Field, FieldType, Schema}; use crate::tokenizer::{TextAnalyzer, TokenizerManager}; use crate::SegmentReader; @@ -232,31 +232,7 @@ impl IndexBuilder { } fn validate(&self) -> crate::Result<()> { - if let Some(schema) = self.schema.as_ref() { - if let Some(sort_by_field) = self.index_settings.sort_by_field.as_ref() { - let schema_field = schema.get_field(&sort_by_field.field).map_err(|_| { - TantivyError::InvalidArgument(format!( - "Field to sort index {} not found in schema", - sort_by_field.field - )) - })?; - let entry = schema.get_field_entry(schema_field); - if !entry.is_fast() { - return Err(TantivyError::InvalidArgument(format!( - "Field {} is no fast field. Field needs to be a single value fast field \ - to be used to sort an index", - sort_by_field.field - ))); - } - let supported_field_types = [Type::I64, Type::U64, Type::F64, Type::Date]; - let field_type = entry.field_type().value_type(); - if !supported_field_types.contains(&field_type) { - return Err(TantivyError::InvalidArgument(format!( - "Unsupported field type in sort_by_field: {field_type:?}. 
Supported field \ - types: {supported_field_types:?} ", - ))); - } - } + if let Some(_schema) = self.schema.as_ref() { Ok(()) } else { Err(TantivyError::InvalidArgument( diff --git a/src/index/index_meta.rs b/src/index/index_meta.rs index f478ac2fde..49ef68bad3 100644 --- a/src/index/index_meta.rs +++ b/src/index/index_meta.rs @@ -249,10 +249,6 @@ fn is_true(val: &bool) -> bool { /// index, like presort documents. #[derive(Clone, Debug, Serialize, Deserialize, Eq, PartialEq)] pub struct IndexSettings { - /// Sorts the documents by information - /// provided in `IndexSortByField` - #[serde(skip_serializing_if = "Option::is_none")] - pub sort_by_field: Option, /// The `Compressor` used to compress the doc store. #[serde(default)] pub docstore_compression: Compressor, @@ -275,7 +271,6 @@ fn default_docstore_blocksize() -> usize { impl Default for IndexSettings { fn default() -> Self { Self { - sort_by_field: None, docstore_compression: Compressor::default(), docstore_blocksize: default_docstore_blocksize(), docstore_compress_dedicated_thread: true, @@ -283,22 +278,6 @@ impl Default for IndexSettings { } } -/// Settings to presort the documents in an index -/// -/// Presorting documents can greatly improve performance -/// in some scenarios, by applying top n -/// optimizations. -#[deprecated( - since = "0.22.0", - note = "We plan to remove index sorting in `0.23`. If you need index sorting, please comment on the related issue https://github.com/quickwit-oss/tantivy/issues/2352 and explain your use case." -)] -#[derive(Clone, Debug, Serialize, Deserialize, Eq, PartialEq)] -pub struct IndexSortByField { - /// The field to sort the documents by - pub field: String, - /// The order to sort the documents by - pub order: Order, -} /// The order to sort by #[derive(Clone, Debug, Serialize, Deserialize, Eq, PartialEq)] pub enum Order { @@ -417,7 +396,7 @@ mod tests { use crate::store::Compressor; #[cfg(feature = "zstd-compression")] use crate::store::ZstdCompressor; - use crate::{IndexSettings, IndexSortByField, Order}; + use crate::IndexSettings; #[test] fn test_serialize_metas() { @@ -427,13 +406,7 @@ mod tests { schema_builder.build() }; let index_metas = IndexMeta { - index_settings: IndexSettings { - sort_by_field: Some(IndexSortByField { - field: "text".to_string(), - order: Order::Asc, - }), - ..Default::default() - }, + index_settings: IndexSettings::default(), segments: Vec::new(), schema, opstamp: 0u64, @@ -442,7 +415,7 @@ mod tests { let json = serde_json::ser::to_string(&index_metas).expect("serialization failed"); assert_eq!( json, - r#"{"index_settings":{"sort_by_field":{"field":"text","order":"Asc"},"docstore_compression":"lz4","docstore_blocksize":16384},"segments":[],"schema":[{"name":"text","type":"text","options":{"indexing":{"record":"position","fieldnorms":true,"tokenizer":"default"},"stored":false,"fast":false}}],"opstamp":0}"# + r#"{"index_settings":{"docstore_compression":"lz4","docstore_blocksize":16384},"segments":[],"schema":[{"name":"text","type":"text","options":{"indexing":{"record":"position","fieldnorms":true,"tokenizer":"default"},"stored":false,"fast":false}}],"opstamp":0}"# ); let deser_meta: UntrackedIndexMeta = serde_json::from_str(&json).unwrap(); @@ -461,10 +434,6 @@ mod tests { }; let index_metas = IndexMeta { index_settings: IndexSettings { - sort_by_field: Some(IndexSortByField { - field: "text".to_string(), - order: Order::Asc, - }), docstore_compression: crate::store::Compressor::Zstd(ZstdCompressor { compression_level: Some(4), }), @@ -479,7 +448,7 @@ mod 
tests { let json = serde_json::ser::to_string(&index_metas).expect("serialization failed"); assert_eq!( json, - r#"{"index_settings":{"sort_by_field":{"field":"text","order":"Asc"},"docstore_compression":"zstd(compression_level=4)","docstore_blocksize":1000000},"segments":[],"schema":[{"name":"text","type":"text","options":{"indexing":{"record":"position","fieldnorms":true,"tokenizer":"default"},"stored":false,"fast":false}}],"opstamp":0}"# + r#"{"index_settings":{"docstore_compression":"zstd(compression_level=4)","docstore_blocksize":1000000},"segments":[],"schema":[{"name":"text","type":"text","options":{"indexing":{"record":"position","fieldnorms":true,"tokenizer":"default"},"stored":false,"fast":false}}],"opstamp":0}"# ); let deser_meta: UntrackedIndexMeta = serde_json::from_str(&json).unwrap(); @@ -491,35 +460,35 @@ mod tests { #[test] #[cfg(all(feature = "lz4-compression", feature = "zstd-compression"))] fn test_serialize_metas_invalid_comp() { - let json = r#"{"index_settings":{"sort_by_field":{"field":"text","order":"Asc"},"docstore_compression":"zsstd","docstore_blocksize":1000000},"segments":[],"schema":[{"name":"text","type":"text","options":{"indexing":{"record":"position","fieldnorms":true,"tokenizer":"default"},"stored":false,"fast":false}}],"opstamp":0}"#; + let json = r#"{"index_settings":{"docstore_compression":"zsstd","docstore_blocksize":1000000},"segments":[],"schema":[{"name":"text","type":"text","options":{"indexing":{"record":"position","fieldnorms":true,"tokenizer":"default"},"stored":false,"fast":false}}],"opstamp":0}"#; let err = serde_json::from_str::(json).unwrap_err(); assert_eq!( err.to_string(), "unknown variant `zsstd`, expected one of `none`, `lz4`, `zstd`, \ - `zstd(compression_level=5)` at line 1 column 96" + `zstd(compression_level=5)` at line 1 column 49" .to_string() ); - let json = r#"{"index_settings":{"sort_by_field":{"field":"text","order":"Asc"},"docstore_compression":"zstd(bla=10)","docstore_blocksize":1000000},"segments":[],"schema":[{"name":"text","type":"text","options":{"indexing":{"record":"position","fieldnorms":true,"tokenizer":"default"},"stored":false,"fast":false}}],"opstamp":0}"#; + let json = r#"{"index_settings":{"docstore_compression":"zstd(bla=10)","docstore_blocksize":1000000},"segments":[],"schema":[{"name":"text","type":"text","options":{"indexing":{"record":"position","fieldnorms":true,"tokenizer":"default"},"stored":false,"fast":false}}],"opstamp":0}"#; let err = serde_json::from_str::(json).unwrap_err(); assert_eq!( err.to_string(), - "unknown zstd option \"bla\" at line 1 column 103".to_string() + "unknown zstd option \"bla\" at line 1 column 56".to_string() ); } #[test] #[cfg(not(feature = "zstd-compression"))] fn test_serialize_metas_unsupported_comp() { - let json = r#"{"index_settings":{"sort_by_field":{"field":"text","order":"Asc"},"docstore_compression":"zstd","docstore_blocksize":1000000},"segments":[],"schema":[{"name":"text","type":"text","options":{"indexing":{"record":"position","fieldnorms":true,"tokenizer":"default"},"stored":false,"fast":false}}],"opstamp":0}"#; + let json = r#"{"index_settings":{"docstore_compression":"zstd","docstore_blocksize":1000000},"segments":[],"schema":[{"name":"text","type":"text","options":{"indexing":{"record":"position","fieldnorms":true,"tokenizer":"default"},"stored":false,"fast":false}}],"opstamp":0}"#; let err = serde_json::from_str::(json).unwrap_err(); assert_eq!( err.to_string(), "unsupported variant `zstd`, please enable Tantivy's `zstd-compression` feature at \ - line 1 
column 95" + line 1 column 48" .to_string() ); } @@ -531,7 +500,6 @@ mod tests { assert_eq!( index_settings, IndexSettings { - sort_by_field: None, docstore_compression: Compressor::default(), docstore_compress_dedicated_thread: true, docstore_blocksize: 16_384 diff --git a/src/index/mod.rs b/src/index/mod.rs index fcd9a14751..76dc3ed9b6 100644 --- a/src/index/mod.rs +++ b/src/index/mod.rs @@ -12,7 +12,7 @@ mod segment_reader; pub use self::index::{Index, IndexBuilder}; pub(crate) use self::index_meta::SegmentMetaInventory; -pub use self::index_meta::{IndexMeta, IndexSettings, IndexSortByField, Order, SegmentMeta}; +pub use self::index_meta::{IndexMeta, IndexSettings, Order, SegmentMeta}; pub use self::inverted_index_reader::InvertedIndexReader; pub use self::segment::Segment; pub use self::segment_component::SegmentComponent; diff --git a/src/indexer/doc_id_mapping.rs b/src/indexer/doc_id_mapping.rs index b3c1ea2f08..c4bc6b65be 100644 --- a/src/indexer/doc_id_mapping.rs +++ b/src/indexer/doc_id_mapping.rs @@ -3,15 +3,12 @@ use common::ReadOnlyBitSet; -use super::SegmentWriter; -use crate::schema::{Field, Schema}; -use crate::{DocAddress, DocId, IndexSortByField, TantivyError}; +use crate::DocAddress; #[derive(Copy, Clone, Eq, PartialEq)] pub enum MappingType { Stacked, StackedWithDeletes, - Shuffled, } /// Struct to provide mapping from new doc_id to old doc_id and segment. @@ -46,537 +43,4 @@ impl SegmentDocIdMapping { pub(crate) fn iter_old_doc_addrs(&self) -> impl Iterator + '_ { self.new_doc_id_to_old_doc_addr.iter().copied() } - - /// This flags means the segments are simply stacked in the order of their ordinal. - /// e.g. [(0, 1), .. (n, 1), (0, 2)..., (m, 2)] - /// - /// The different segment may present some deletes, in which case it is expressed by skipping a - /// `DocId`. [(0, 1), (0, 3)] <--- here doc_id=0 and doc_id=1 have been deleted - /// - /// Being trivial is equivalent to having the `new_doc_id_to_old_doc_addr` array sorted. - /// - /// This allows for some optimization. - pub(crate) fn is_trivial(&self) -> bool { - match self.mapping_type { - MappingType::Stacked | MappingType::StackedWithDeletes => true, - MappingType::Shuffled => false, - } - } -} - -/// Struct to provide mapping from old doc_id to new doc_id and vice versa within a segment. -pub struct DocIdMapping { - new_doc_id_to_old: Vec, - old_doc_id_to_new: Vec, -} - -impl DocIdMapping { - pub fn from_new_id_to_old_id(new_doc_id_to_old: Vec) -> Self { - let max_doc = new_doc_id_to_old.len(); - let old_max_doc = new_doc_id_to_old - .iter() - .cloned() - .max() - .map(|n| n + 1) - .unwrap_or(0); - let mut old_doc_id_to_new = vec![0; old_max_doc as usize]; - for i in 0..max_doc { - old_doc_id_to_new[new_doc_id_to_old[i] as usize] = i as DocId; - } - DocIdMapping { - new_doc_id_to_old, - old_doc_id_to_new, - } - } - - /// returns the new doc_id for the old doc_id - pub fn get_new_doc_id(&self, doc_id: DocId) -> DocId { - self.old_doc_id_to_new[doc_id as usize] - } - /// returns the old doc_id for the new doc_id - pub fn get_old_doc_id(&self, doc_id: DocId) -> DocId { - self.new_doc_id_to_old[doc_id as usize] - } - /// iterate over old doc_ids in order of the new doc_ids - pub fn iter_old_doc_ids(&self) -> impl Iterator + Clone + '_ { - self.new_doc_id_to_old.iter().cloned() - } - - pub fn old_to_new_ids(&self) -> &[DocId] { - &self.old_doc_id_to_new[..] - } - - /// Remaps a given array to the new doc ids. 
- pub fn remap<T: Copy>(&self, els: &[T]) -> Vec<T> { - self.new_doc_id_to_old - .iter() - .map(|old_doc| els[*old_doc as usize]) - .collect() - } - pub fn num_new_doc_ids(&self) -> usize { - self.new_doc_id_to_old.len() - } - pub fn num_old_doc_ids(&self) -> usize { - self.old_doc_id_to_new.len() - } -} - -pub(crate) fn expect_field_id_for_sort_field( - schema: &Schema, - sort_by_field: &IndexSortByField, -) -> crate::Result<Field> { - schema.get_field(&sort_by_field.field).map_err(|_| { - TantivyError::InvalidArgument(format!( - "field to sort index by not found: {:?}", - sort_by_field.field - )) - }) -} - -// Generates a document mapping in the form of [index new doc_id] -> old doc_id // TODO detect if field is already sorted and discard mapping -pub(crate) fn get_doc_id_mapping_from_field( - sort_by_field: IndexSortByField, - segment_writer: &SegmentWriter, -) -> crate::Result<DocIdMapping> { - let schema = segment_writer.segment_serializer.segment().schema(); - expect_field_id_for_sort_field(&schema, &sort_by_field)?; // for now expect - let new_doc_id_to_old = segment_writer.fast_field_writers.sort_order( - sort_by_field.field.as_str(), - segment_writer.max_doc(), - sort_by_field.order.is_desc(), - ); - // create new doc_id to old doc_id index (used in fast_field_writers) - Ok(DocIdMapping::from_new_id_to_old_id(new_doc_id_to_old)) -} - -#[cfg(test)] -mod tests_indexsorting { - use common::DateTime; - - use crate::collector::TopDocs; - use crate::indexer::doc_id_mapping::DocIdMapping; - use crate::indexer::NoMergePolicy; - use crate::query::QueryParser; - use crate::schema::*; - use crate::{DocAddress, Index, IndexBuilder, IndexSettings, IndexSortByField, Order}; - - fn create_test_index( - index_settings: Option<IndexSettings>, - text_field_options: TextOptions, - ) -> crate::Result<Index> { - let mut schema_builder = Schema::builder(); - - let my_text_field = schema_builder.add_text_field("text_field", text_field_options); - let my_string_field = schema_builder.add_text_field("string_field", STRING | STORED); - let my_number = - schema_builder.add_u64_field("my_number", NumericOptions::default().set_fast()); - - let multi_numbers = - schema_builder.add_u64_field("multi_numbers", NumericOptions::default().set_fast()); - - let schema = schema_builder.build(); - let mut index_builder = Index::builder().schema(schema); - if let Some(settings) = index_settings { - index_builder = index_builder.settings(settings); - } - let index = index_builder.create_in_ram()?; - - let mut index_writer = index.writer_for_tests()?; - index_writer.add_document(doc!(my_number=>40_u64))?; - index_writer.add_document( - doc!(my_number=>20_u64, multi_numbers => 5_u64, multi_numbers => 6_u64), - )?; - index_writer.add_document(doc!(my_number=>100_u64))?; - index_writer.add_document( - doc!(my_number=>10_u64, my_string_field=> "blublub", my_text_field => "some text"), - )?; - index_writer.add_document(doc!(my_number=>30_u64, multi_numbers => 3_u64 ))?; - index_writer.commit()?; - Ok(index) - } - fn get_text_options() -> TextOptions { - TextOptions::default().set_indexing_options( - TextFieldIndexing::default().set_index_option(IndexRecordOption::Basic), - ) - } - #[test] - fn test_sort_index_test_text_field() -> crate::Result<()> { - // there are different serializers for different settings in postings/recorder.rs - // test remapping for all of them - let options = vec![ - get_text_options(), - get_text_options().set_indexing_options( - TextFieldIndexing::default().set_index_option(IndexRecordOption::WithFreqs), - ), - get_text_options().set_indexing_options( -
TextFieldIndexing::default() - .set_index_option(IndexRecordOption::WithFreqsAndPositions), - ), - ]; - - for option in options { - // let options = get_text_options(); - // no index_sort - let index = create_test_index(None, option.clone())?; - let my_text_field = index.schema().get_field("text_field").unwrap(); - let searcher = index.reader()?.searcher(); - - let query = QueryParser::for_index(&index, vec![my_text_field]).parse_query("text")?; - let top_docs: Vec<(f32, DocAddress)> = - searcher.search(&query, &TopDocs::with_limit(3))?; - assert_eq!( - top_docs.iter().map(|el| el.1.doc_id).collect::>(), - vec![3] - ); - - // sort by field asc - let index = create_test_index( - Some(IndexSettings { - sort_by_field: Some(IndexSortByField { - field: "my_number".to_string(), - order: Order::Asc, - }), - ..Default::default() - }), - option.clone(), - )?; - let my_text_field = index.schema().get_field("text_field").unwrap(); - let reader = index.reader()?; - let searcher = reader.searcher(); - - let query = QueryParser::for_index(&index, vec![my_text_field]).parse_query("text")?; - let top_docs: Vec<(f32, DocAddress)> = - searcher.search(&query, &TopDocs::with_limit(3))?; - assert_eq!( - top_docs.iter().map(|el| el.1.doc_id).collect::>(), - vec![0] - ); - - // test new field norm mapping - { - let my_text_field = index.schema().get_field("text_field").unwrap(); - let fieldnorm_reader = searcher - .segment_reader(0) - .get_fieldnorms_reader(my_text_field)?; - assert_eq!(fieldnorm_reader.fieldnorm(0), 2); // some text - assert_eq!(fieldnorm_reader.fieldnorm(1), 0); - } - // sort by field desc - let index = create_test_index( - Some(IndexSettings { - sort_by_field: Some(IndexSortByField { - field: "my_number".to_string(), - order: Order::Desc, - }), - ..Default::default() - }), - option.clone(), - )?; - let my_string_field = index.schema().get_field("text_field").unwrap(); - let searcher = index.reader()?.searcher(); - - let query = - QueryParser::for_index(&index, vec![my_string_field]).parse_query("text")?; - let top_docs: Vec<(f32, DocAddress)> = - searcher.search(&query, &TopDocs::with_limit(3))?; - assert_eq!( - top_docs.iter().map(|el| el.1.doc_id).collect::>(), - vec![4] - ); - // test new field norm mapping - { - let my_text_field = index.schema().get_field("text_field").unwrap(); - let fieldnorm_reader = searcher - .segment_reader(0) - .get_fieldnorms_reader(my_text_field)?; - assert_eq!(fieldnorm_reader.fieldnorm(0), 0); - assert_eq!(fieldnorm_reader.fieldnorm(1), 0); - assert_eq!(fieldnorm_reader.fieldnorm(2), 0); - assert_eq!(fieldnorm_reader.fieldnorm(3), 0); - assert_eq!(fieldnorm_reader.fieldnorm(4), 2); // some text - } - } - Ok(()) - } - #[test] - fn test_sort_index_get_documents() -> crate::Result<()> { - // default baseline - let index = create_test_index(None, get_text_options())?; - let my_string_field = index.schema().get_field("string_field").unwrap(); - let searcher = index.reader()?.searcher(); - { - assert!(searcher - .doc::(DocAddress::new(0, 0))? - .get_first(my_string_field) - .is_none()); - assert_eq!( - searcher - .doc::(DocAddress::new(0, 3))? 
- .get_first(my_string_field) - .unwrap() - .as_str(), - Some("blublub") - ); - } - // sort by field asc - let index = create_test_index( - Some(IndexSettings { - sort_by_field: Some(IndexSortByField { - field: "my_number".to_string(), - order: Order::Asc, - }), - ..Default::default() - }), - get_text_options(), - )?; - let my_string_field = index.schema().get_field("string_field").unwrap(); - let searcher = index.reader()?.searcher(); - { - assert_eq!( - searcher - .doc::(DocAddress::new(0, 0))? - .get_first(my_string_field) - .unwrap() - .as_str(), - Some("blublub") - ); - let doc = searcher.doc::(DocAddress::new(0, 4))?; - assert!(doc.get_first(my_string_field).is_none()); - } - // sort by field desc - let index = create_test_index( - Some(IndexSettings { - sort_by_field: Some(IndexSortByField { - field: "my_number".to_string(), - order: Order::Desc, - }), - ..Default::default() - }), - get_text_options(), - )?; - let my_string_field = index.schema().get_field("string_field").unwrap(); - let searcher = index.reader()?.searcher(); - { - let doc = searcher.doc::(DocAddress::new(0, 4))?; - assert_eq!( - doc.get_first(my_string_field).unwrap().as_str(), - Some("blublub") - ); - } - Ok(()) - } - - #[test] - fn test_sort_index_test_string_field() -> crate::Result<()> { - let index = create_test_index(None, get_text_options())?; - let my_string_field = index.schema().get_field("string_field").unwrap(); - let searcher = index.reader()?.searcher(); - - let query = QueryParser::for_index(&index, vec![my_string_field]).parse_query("blublub")?; - let top_docs: Vec<(f32, DocAddress)> = searcher.search(&query, &TopDocs::with_limit(3))?; - assert_eq!( - top_docs.iter().map(|el| el.1.doc_id).collect::>(), - vec![3] - ); - - let index = create_test_index( - Some(IndexSettings { - sort_by_field: Some(IndexSortByField { - field: "my_number".to_string(), - order: Order::Asc, - }), - ..Default::default() - }), - get_text_options(), - )?; - let my_string_field = index.schema().get_field("string_field").unwrap(); - let reader = index.reader()?; - let searcher = reader.searcher(); - - let query = QueryParser::for_index(&index, vec![my_string_field]).parse_query("blublub")?; - let top_docs: Vec<(f32, DocAddress)> = searcher.search(&query, &TopDocs::with_limit(3))?; - assert_eq!( - top_docs.iter().map(|el| el.1.doc_id).collect::>(), - vec![0] - ); - - // test new field norm mapping - { - let my_text_field = index.schema().get_field("text_field").unwrap(); - let fieldnorm_reader = searcher - .segment_reader(0) - .get_fieldnorms_reader(my_text_field)?; - assert_eq!(fieldnorm_reader.fieldnorm(0), 2); // some text - assert_eq!(fieldnorm_reader.fieldnorm(1), 0); - } - // sort by field desc - let index = create_test_index( - Some(IndexSettings { - sort_by_field: Some(IndexSortByField { - field: "my_number".to_string(), - order: Order::Desc, - }), - ..Default::default() - }), - get_text_options(), - )?; - let my_string_field = index.schema().get_field("string_field").unwrap(); - let searcher = index.reader()?.searcher(); - - let query = QueryParser::for_index(&index, vec![my_string_field]).parse_query("blublub")?; - let top_docs: Vec<(f32, DocAddress)> = searcher.search(&query, &TopDocs::with_limit(3))?; - assert_eq!( - top_docs.iter().map(|el| el.1.doc_id).collect::>(), - vec![4] - ); - // test new field norm mapping - { - let my_text_field = index.schema().get_field("text_field").unwrap(); - let fieldnorm_reader = searcher - .segment_reader(0) - .get_fieldnorms_reader(my_text_field)?; - 
assert_eq!(fieldnorm_reader.fieldnorm(0), 0); - assert_eq!(fieldnorm_reader.fieldnorm(1), 0); - assert_eq!(fieldnorm_reader.fieldnorm(2), 0); - assert_eq!(fieldnorm_reader.fieldnorm(3), 0); - assert_eq!(fieldnorm_reader.fieldnorm(4), 2); // some text - } - Ok(()) - } - - #[test] - fn test_sort_index_fast_field() -> crate::Result<()> { - let index = create_test_index( - Some(IndexSettings { - sort_by_field: Some(IndexSortByField { - field: "my_number".to_string(), - order: Order::Asc, - }), - ..Default::default() - }), - get_text_options(), - )?; - assert_eq!( - index.settings().sort_by_field.as_ref().unwrap().field, - "my_number".to_string() - ); - - let searcher = index.reader()?.searcher(); - assert_eq!(searcher.segment_readers().len(), 1); - let segment_reader = searcher.segment_reader(0); - let fast_fields = segment_reader.fast_fields(); - - let fast_field = fast_fields - .u64("my_number") - .unwrap() - .first_or_default_col(999); - assert_eq!(fast_field.get_val(0), 10u64); - assert_eq!(fast_field.get_val(1), 20u64); - assert_eq!(fast_field.get_val(2), 30u64); - - let multifield = fast_fields.u64("multi_numbers").unwrap(); - let vals: Vec = multifield.values_for_doc(0u32).collect(); - assert_eq!(vals, &[] as &[u64]); - let vals: Vec<_> = multifield.values_for_doc(1u32).collect(); - assert_eq!(vals, &[5, 6]); - - let vals: Vec<_> = multifield.values_for_doc(2u32).collect(); - assert_eq!(vals, &[3]); - Ok(()) - } - - #[test] - fn test_with_sort_by_date_field() -> crate::Result<()> { - let mut schema_builder = Schema::builder(); - let date_field = schema_builder.add_date_field("date", INDEXED | STORED | FAST); - let schema = schema_builder.build(); - - let settings = IndexSettings { - sort_by_field: Some(IndexSortByField { - field: "date".to_string(), - order: Order::Desc, - }), - ..Default::default() - }; - - let index = Index::builder() - .schema(schema) - .settings(settings) - .create_in_ram()?; - let mut index_writer = index.writer_for_tests()?; - index_writer.set_merge_policy(Box::new(NoMergePolicy)); - - index_writer.add_document(doc!( - date_field => DateTime::from_timestamp_secs(1000), - ))?; - index_writer.add_document(doc!( - date_field => DateTime::from_timestamp_secs(999), - ))?; - index_writer.add_document(doc!( - date_field => DateTime::from_timestamp_secs(1001), - ))?; - index_writer.commit()?; - - let searcher = index.reader()?.searcher(); - assert_eq!(searcher.segment_readers().len(), 1); - let segment_reader = searcher.segment_reader(0); - let fast_fields = segment_reader.fast_fields(); - - let fast_field = fast_fields - .date("date") - .unwrap() - .first_or_default_col(DateTime::from_timestamp_secs(0)); - assert_eq!(fast_field.get_val(0), DateTime::from_timestamp_secs(1001)); - assert_eq!(fast_field.get_val(1), DateTime::from_timestamp_secs(1000)); - assert_eq!(fast_field.get_val(2), DateTime::from_timestamp_secs(999)); - Ok(()) - } - - #[test] - fn test_doc_mapping() { - let doc_mapping = DocIdMapping::from_new_id_to_old_id(vec![3, 2, 5]); - assert_eq!(doc_mapping.get_old_doc_id(0), 3); - assert_eq!(doc_mapping.get_old_doc_id(1), 2); - assert_eq!(doc_mapping.get_old_doc_id(2), 5); - assert_eq!(doc_mapping.get_new_doc_id(0), 0); - assert_eq!(doc_mapping.get_new_doc_id(1), 0); - assert_eq!(doc_mapping.get_new_doc_id(2), 1); - assert_eq!(doc_mapping.get_new_doc_id(3), 0); - assert_eq!(doc_mapping.get_new_doc_id(4), 0); - assert_eq!(doc_mapping.get_new_doc_id(5), 2); - } - - #[test] - fn test_doc_mapping_remap() { - let doc_mapping = 
DocIdMapping::from_new_id_to_old_id(vec![2, 8, 3]); - assert_eq!( - &doc_mapping.remap(&[0, 1000, 2000, 3000, 4000, 5000, 6000, 7000, 8000]), - &[2000, 8000, 3000] - ); - } - - #[test] - fn test_text_sort() -> crate::Result<()> { - let mut schema_builder = SchemaBuilder::new(); - schema_builder.add_text_field("id", STRING | FAST | STORED); - schema_builder.add_text_field("name", TEXT | STORED); - - let resp = IndexBuilder::new() - .schema(schema_builder.build()) - .settings(IndexSettings { - sort_by_field: Some(IndexSortByField { - field: "id".to_string(), - order: Order::Asc, - }), - ..Default::default() - }) - .create_in_ram(); - assert!(resp - .unwrap_err() - .to_string() - .contains("Unsupported field type")); - - Ok(()) - } } diff --git a/src/indexer/index_writer.rs b/src/indexer/index_writer.rs index 657e82925e..0b62a01782 100644 --- a/src/indexer/index_writer.rs +++ b/src/indexer/index_writer.rs @@ -803,7 +803,7 @@ mod tests { use std::collections::{HashMap, HashSet}; use std::net::Ipv6Addr; - use columnar::{Cardinality, Column, MonotonicallyMappableToU128}; + use columnar::{Column, MonotonicallyMappableToU128}; use itertools::Itertools; use proptest::prop_oneof; @@ -813,15 +813,15 @@ mod tests { use crate::error::*; use crate::indexer::index_writer::MEMORY_BUDGET_NUM_BYTES_MIN; use crate::indexer::NoMergePolicy; - use crate::query::{BooleanQuery, Occur, Query, QueryParser, TermQuery}; + use crate::query::{QueryParser, TermQuery}; use crate::schema::{ - self, Facet, FacetOptions, IndexRecordOption, IpAddrOptions, NumericOptions, Schema, + self, Facet, FacetOptions, IndexRecordOption, IpAddrOptions, NumericOptions, TextFieldIndexing, TextOptions, Value, FAST, INDEXED, STORED, STRING, TEXT, }; use crate::store::DOCSTORE_CACHE_CAPACITY; use crate::{ - DateTime, DocAddress, Index, IndexSettings, IndexSortByField, IndexWriter, Order, - ReloadPolicy, TantivyDocument, Term, + DateTime, DocAddress, Index, IndexSettings, IndexWriter, ReloadPolicy, TantivyDocument, + Term, }; const LOREM: &str = "Doc Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do \ @@ -1462,116 +1462,6 @@ mod tests { assert!(text_fast_field.term_ords(1).eq([1].into_iter())); } - #[test] - fn test_delete_with_sort_by_field() -> crate::Result<()> { - let mut schema_builder = schema::Schema::builder(); - let id_field = schema_builder.add_u64_field("id", INDEXED | schema::STORED | FAST); - let schema = schema_builder.build(); - - let settings = IndexSettings { - sort_by_field: Some(IndexSortByField { - field: "id".to_string(), - order: Order::Desc, - }), - ..Default::default() - }; - - let index = Index::builder() - .schema(schema) - .settings(settings) - .create_in_ram()?; - let index_reader = index.reader()?; - let mut index_writer = index.writer_for_tests()?; - - // create and delete docs in same commit - for id in 0u64..5u64 { - index_writer.add_document(doc!(id_field => id))?; - } - for id in 2u64..4u64 { - index_writer.delete_term(Term::from_field_u64(id_field, id)); - } - for id in 5u64..10u64 { - index_writer.add_document(doc!(id_field => id))?; - } - index_writer.commit()?; - index_reader.reload()?; - - let searcher = index_reader.searcher(); - assert_eq!(searcher.segment_readers().len(), 1); - - let segment_reader = searcher.segment_reader(0); - assert_eq!(segment_reader.num_docs(), 8); - assert_eq!(segment_reader.max_doc(), 10); - let fast_field_reader = segment_reader.fast_fields().u64("id")?; - - let in_order_alive_ids: Vec = segment_reader - .doc_ids_alive() - .flat_map(|doc| 
fast_field_reader.values_for_doc(doc)) - .collect(); - assert_eq!(&in_order_alive_ids[..], &[9, 8, 7, 6, 5, 4, 1, 0]); - Ok(()) - } - - #[test] - fn test_delete_query_with_sort_by_field() -> crate::Result<()> { - let mut schema_builder = schema::Schema::builder(); - let id_field = schema_builder.add_u64_field("id", INDEXED | schema::STORED | FAST); - let schema = schema_builder.build(); - - let settings = IndexSettings { - sort_by_field: Some(IndexSortByField { - field: "id".to_string(), - order: Order::Desc, - }), - ..Default::default() - }; - - let index = Index::builder() - .schema(schema) - .settings(settings) - .create_in_ram()?; - let index_reader = index.reader()?; - let mut index_writer = index.writer_for_tests()?; - - // create and delete docs in same commit - for id in 0u64..5u64 { - index_writer.add_document(doc!(id_field => id))?; - } - for id in 1u64..4u64 { - let term = Term::from_field_u64(id_field, id); - let not_term = Term::from_field_u64(id_field, 2); - let term = Box::new(TermQuery::new(term, Default::default())); - let not_term = Box::new(TermQuery::new(not_term, Default::default())); - - let query: BooleanQuery = vec![ - (Occur::Must, term as Box), - (Occur::MustNot, not_term as Box), - ] - .into(); - - index_writer.delete_query(Box::new(query))?; - } - for id in 5u64..10u64 { - index_writer.add_document(doc!(id_field => id))?; - } - index_writer.commit()?; - index_reader.reload()?; - - let searcher = index_reader.searcher(); - assert_eq!(searcher.segment_readers().len(), 1); - - let segment_reader = searcher.segment_reader(0); - assert_eq!(segment_reader.num_docs(), 8); - assert_eq!(segment_reader.max_doc(), 10); - let fast_field_reader = segment_reader.fast_fields().u64("id")?; - let in_order_alive_ids: Vec = segment_reader - .doc_ids_alive() - .flat_map(|doc| fast_field_reader.values_for_doc(doc)) - .collect(); - assert_eq!(&in_order_alive_ids[..], &[9, 8, 7, 6, 5, 4, 2, 0]); - Ok(()) - } - #[derive(Debug, Clone)] enum IndexingOp { AddMultipleDoc { @@ -1718,11 +1608,7 @@ mod tests { id_list } - fn test_operation_strategy( - ops: &[IndexingOp], - sort_index: bool, - force_end_merge: bool, - ) -> crate::Result { + fn test_operation_strategy(ops: &[IndexingOp], force_end_merge: bool) -> crate::Result { let mut schema_builder = schema::Schema::builder(); let json_field = schema_builder.add_json_field("json", FAST | TEXT | STORED); let ip_field = schema_builder.add_ip_addr_field("ip", FAST | INDEXED | STORED); @@ -1758,15 +1644,7 @@ mod tests { ); let facet_field = schema_builder.add_facet_field("facet", FacetOptions::default()); let schema = schema_builder.build(); - let settings = if sort_index { - IndexSettings { - sort_by_field: Some(IndexSortByField { - field: "id_opt".to_string(), - order: Order::Asc, - }), - ..Default::default() - } - } else { + let settings = { IndexSettings { ..Default::default() } @@ -2329,78 +2207,13 @@ mod tests { } } - // Test if index property is in sort order - if sort_index { - // load all id_opt in each segment and check they are in order - - for reader in searcher.segment_readers() { - let (ff_reader, _) = reader.fast_fields().u64_lenient("id_opt").unwrap().unwrap(); - let mut ids_in_segment: Vec = Vec::new(); - - for doc in 0..reader.num_docs() { - ids_in_segment.extend(ff_reader.values_for_doc(doc)); - } - - assert!(is_sorted(&ids_in_segment)); - - fn is_sorted(data: &[T]) -> bool - where T: Ord { - data.windows(2).all(|w| w[0] <= w[1]) - } - } - } Ok(index) } #[test] fn test_fast_field_range() { let ops: Vec<_> = (0..1000).map(|id| 
IndexingOp::add(id)).collect(); - assert!(test_operation_strategy(&ops, false, true).is_ok()); - } - - #[test] - fn test_sort_index_on_opt_field_regression() { - assert!(test_operation_strategy( - &[ - IndexingOp::add(81), - IndexingOp::add(70), - IndexingOp::DeleteDoc { id: 70 } - ], - true, - false - ) - .is_ok()); - } - - #[test] - fn test_simple_multiple_doc() { - assert!(test_operation_strategy( - &[ - IndexingOp::AddMultipleDoc { - id: 7, - num_docs: 800, - value: IndexValue::U64(0), - }, - IndexingOp::AddMultipleDoc { - id: 92, - num_docs: 800, - value: IndexValue::U64(0), - }, - IndexingOp::AddMultipleDoc { - id: 30, - num_docs: 800, - value: IndexValue::U64(0), - }, - IndexingOp::AddMultipleDoc { - id: 33, - num_docs: 800, - value: IndexValue::U64(0), - }, - ], - true, - false - ) - .is_ok()); + assert!(test_operation_strategy(&ops, true).is_ok()); } #[test] @@ -2414,7 +2227,6 @@ mod tests { IndexingOp::Commit, IndexingOp::Merge ], - true, false ) .is_ok()); @@ -2431,7 +2243,6 @@ mod tests { IndexingOp::add(1), IndexingOp::Commit, ], - false, true ) .is_ok()); @@ -2439,186 +2250,50 @@ mod tests { #[test] fn test_minimal_sort_force_end_merge() { - assert!(test_operation_strategy( - &[IndexingOp::add(23), IndexingOp::add(13),], - false, - false - ) - .is_ok()); - } - - #[test] - fn test_minimal_sort() { - let mut schema_builder = Schema::builder(); - let val = schema_builder.add_u64_field("val", FAST); - let id = schema_builder.add_u64_field("id", FAST); - let schema = schema_builder.build(); - let settings = IndexSettings { - sort_by_field: Some(IndexSortByField { - field: "id".to_string(), - order: Order::Asc, - }), - ..Default::default() - }; - let index = Index::builder() - .schema(schema) - .settings(settings) - .create_in_ram() - .unwrap(); - let mut writer = index.writer_for_tests().unwrap(); - writer - .add_document(doc!(id=> 3u64, val=>4u64, val=>4u64)) - .unwrap(); - writer - .add_document(doc!(id=> 2u64, val=>2u64, val=>2u64)) - .unwrap(); - writer - .add_document(doc!(id=> 1u64, val=>1u64, val=>1u64)) - .unwrap(); - writer.commit().unwrap(); - let reader = index.reader().unwrap(); - let searcher = reader.searcher(); - let segment_reader = searcher.segment_reader(0); - let id_col: Column = segment_reader - .fast_fields() - .column_opt("id") - .unwrap() - .unwrap(); - let val_col: Column = segment_reader - .fast_fields() - .column_opt("val") - .unwrap() - .unwrap(); - assert_eq!(id_col.get_cardinality(), Cardinality::Full); - assert_eq!(val_col.get_cardinality(), Cardinality::Multivalued); - assert_eq!(id_col.first(0u32), Some(1u64)); - assert_eq!(id_col.first(1u32), Some(2u64)); - assert!(val_col.values_for_doc(0u32).eq([1u64, 1u64].into_iter())); - assert!(val_col.values_for_doc(1u32).eq([2u64, 2u64].into_iter())); - } - - #[test] - fn test_minimal_sort_force_end_merge_with_delete() { - assert!(test_operation_strategy( - &[ - IndexingOp::add(23), - IndexingOp::add(13), - IndexingOp::DeleteDoc { id: 13 } - ], - true, - true - ) - .is_ok()); + assert!( + test_operation_strategy(&[IndexingOp::add(23), IndexingOp::add(13),], false).is_ok() + ); } #[test] - fn test_minimal_no_sort_no_force_end_merge() { + fn test_minimal_no_force_end_merge() { assert!(test_operation_strategy( &[ IndexingOp::add(23), IndexingOp::add(13), IndexingOp::DeleteDoc { id: 13 } ], - false, false ) .is_ok()); } - #[test] - fn test_minimal_sort_merge() { - assert!(test_operation_strategy(&[IndexingOp::add(3),], true, true).is_ok()); - } - use proptest::prelude::*; proptest! 
{
        #![proptest_config(ProptestConfig::with_cases(20))]

         #[test]
-        fn test_delete_with_sort_proptest_adding(ops in proptest::collection::vec(adding_operation_strategy(), 1..100)) {
-            assert!(test_operation_strategy(&ops[..], true, false).is_ok());
-        }
-
-        #[test]
-        fn test_delete_without_sort_proptest_adding(ops in proptest::collection::vec(adding_operation_strategy(), 1..100)) {
-            assert!(test_operation_strategy(&ops[..], false, false).is_ok());
-        }
-
-        #[test]
-        fn test_delete_with_sort_proptest_with_merge_adding(ops in proptest::collection::vec(adding_operation_strategy(), 1..100)) {
-            assert!(test_operation_strategy(&ops[..], true, true).is_ok());
-        }
-
-        #[test]
-        fn test_delete_without_sort_proptest_with_merge_adding(ops in proptest::collection::vec(adding_operation_strategy(), 1..100)) {
-            assert!(test_operation_strategy(&ops[..], false, true).is_ok());}
-
-        #[test]
-        fn test_delete_with_sort_proptest(ops in proptest::collection::vec(balanced_operation_strategy(), 1..10)) {
-            assert!(test_operation_strategy(&ops[..], true, false).is_ok());
+        fn test_delete_proptest_adding(ops in proptest::collection::vec(adding_operation_strategy(), 1..100)) {
+            assert!(test_operation_strategy(&ops[..], false).is_ok());
         }

         #[test]
-        fn test_delete_without_sort_proptest(ops in proptest::collection::vec(balanced_operation_strategy(), 1..10)) {
-            assert!(test_operation_strategy(&ops[..], false, false).is_ok());
+        fn test_delete_proptest_with_merge_adding(ops in proptest::collection::vec(adding_operation_strategy(), 1..100)) {
+            assert!(test_operation_strategy(&ops[..], true).is_ok());
         }

         #[test]
-        fn test_delete_with_sort_proptest_with_merge(ops in proptest::collection::vec(balanced_operation_strategy(), 1..10)) {
-            assert!(test_operation_strategy(&ops[..], true, true).is_ok());
+        fn test_delete_proptest(ops in proptest::collection::vec(balanced_operation_strategy(), 1..10)) {
+            assert!(test_operation_strategy(&ops[..], false).is_ok());
         }
-
-        #[test]
-        fn test_delete_without_sort_proptest_with_merge(ops in proptest::collection::vec(balanced_operation_strategy(), 1..100)) {
-            assert!(test_operation_strategy(&ops[..], false, true).is_ok());
+
+        #[test]
+        fn test_delete_proptest_with_merge(ops in proptest::collection::vec(balanced_operation_strategy(), 1..100)) {
+            assert!(test_operation_strategy(&ops[..], true).is_ok());
         }
     }

-    #[test]
-    fn test_delete_with_sort_by_field_last_opstamp_is_not_max() -> crate::Result<()> {
-        let mut schema_builder = schema::Schema::builder();
-        let sort_by_field = schema_builder.add_u64_field("sort_by", FAST);
-        let id_field = schema_builder.add_u64_field("id", INDEXED);
-        let schema = schema_builder.build();
-
-        let settings = IndexSettings {
-            sort_by_field: Some(IndexSortByField {
-                field: "sort_by".to_string(),
-                order: Order::Asc,
-            }),
-            ..Default::default()
-        };
-
-        let index = Index::builder()
-            .schema(schema)
-            .settings(settings)
-            .create_in_ram()?;
-        let mut index_writer = index.writer_for_tests()?;
-
-        // We add a doc...
-        index_writer.add_document(doc!(sort_by_field => 2u64, id_field => 0u64))?;
-        // And remove it.
-        index_writer.delete_term(Term::from_field_u64(id_field, 0u64));
-        // We add another doc.
-        index_writer.add_document(doc!(sort_by_field=>1u64, id_field => 0u64))?;
-
-        // The expected result is a segment with
-        // maxdoc = 2
-        // numdoc = 1.
-        index_writer.commit()?;
-
-        let searcher = index.reader()?.searcher();
-        assert_eq!(searcher.segment_readers().len(), 1);
-
-        let segment_reader = searcher.segment_reader(0);
-        assert_eq!(segment_reader.max_doc(), 2);
-        assert_eq!(segment_reader.num_docs(), 1);
-        Ok(())
-    }
-
     #[test]
     fn test_delete_bug_reproduction_ip_addr() {
         use IndexingOp::*;
@@ -2633,7 +2308,7 @@ mod tests {
             IndexingOp::add(4),
             Commit,
         ];
-        test_operation_strategy(&ops[..], false, true).unwrap();
+        test_operation_strategy(&ops[..], true).unwrap();
     }

     #[test]
@@ -2646,7 +2321,7 @@ mod tests {
             Commit,
             Merge,
         ];
-        test_operation_strategy(&ops[..], false, true).unwrap();
+        test_operation_strategy(&ops[..], true).unwrap();
     }

     #[test]
@@ -2658,7 +2333,7 @@ mod tests {
             IndexingOp::add(13),
             Commit,
         ];
-        test_operation_strategy(&ops[..], false, true).unwrap();
+        test_operation_strategy(&ops[..], true).unwrap();
     }

     #[test]
@@ -2669,7 +2344,7 @@ mod tests {
             IndexingOp::add(9),
             IndexingOp::add(10),
         ];
-        test_operation_strategy(&ops[..], false, false).unwrap();
+        test_operation_strategy(&ops[..], false).unwrap();
     }

     #[test]
@@ -2696,7 +2371,6 @@ mod tests {
                 IndexingOp::Commit,
                 IndexingOp::Commit
             ],
-            false,
             false
         )
         .is_ok());
@@ -2716,7 +2390,6 @@ mod tests {
                 IndexingOp::Commit,
                 IndexingOp::Merge,
             ],
-            false,
             true
         )
         .is_ok());
diff --git a/src/indexer/merge_index_test.rs b/src/indexer/merge_index_test.rs
new file mode 100644
index 0000000000..8b8dec3ae7
--- /dev/null
+++ b/src/indexer/merge_index_test.rs
@@ -0,0 +1,147 @@
+#[cfg(test)]
+mod tests {
+    use crate::collector::TopDocs;
+    use crate::fastfield::AliveBitSet;
+    use crate::index::Index;
+    use crate::postings::Postings;
+    use crate::query::QueryParser;
+    use crate::schema::{
+        self, BytesOptions, Facet, FacetOptions, IndexRecordOption, NumericOptions,
+        TextFieldIndexing, TextOptions,
+    };
+    use crate::{DocAddress, DocSet, IndexSettings, IndexWriter, Term};
+
+    fn create_test_index(index_settings: Option<IndexSettings>) -> crate::Result<Index> {
+        let mut schema_builder = schema::Schema::builder();
+        let int_options = NumericOptions::default()
+            .set_fast()
+            .set_stored()
+            .set_indexed();
+        let int_field = schema_builder.add_u64_field("intval", int_options);
+
+        let bytes_options = BytesOptions::default().set_fast().set_indexed();
+        let bytes_field = schema_builder.add_bytes_field("bytes", bytes_options);
+        let facet_field = schema_builder.add_facet_field("facet", FacetOptions::default());
+
+        let multi_numbers =
+            schema_builder.add_u64_field("multi_numbers", NumericOptions::default().set_fast());
+        let text_field_options = TextOptions::default()
+            .set_indexing_options(
+                TextFieldIndexing::default()
+                    .set_index_option(schema::IndexRecordOption::WithFreqsAndPositions),
+            )
+            .set_stored();
+        let text_field = schema_builder.add_text_field("text_field", text_field_options);
+        let schema = schema_builder.build();
+
+        let mut index_builder = Index::builder().schema(schema);
+        if let Some(settings) = index_settings {
+            index_builder = index_builder.settings(settings);
+        }
+        let index = index_builder.create_in_ram()?;
+
+        {
+            let mut index_writer = index.writer_for_tests()?;
+
+            // segment 1 - range 1-3
+            index_writer.add_document(doc!(int_field=>1_u64))?;
+            index_writer.add_document(
+                doc!(int_field=>3_u64, multi_numbers => 3_u64, multi_numbers => 4_u64, bytes_field => vec![1, 2, 3], text_field => "some text", facet_field=> Facet::from("/book/crime")),
+            )?;
+            index_writer.add_document(
+                doc!(int_field=>1_u64, text_field=> "deleteme", text_field => "ok text more text"),
+            )?;
+            index_writer.add_document(
+                doc!(int_field=>2_u64, multi_numbers => 2_u64, multi_numbers => 3_u64, text_field => "ok text more text"),
+            )?;
+
+            index_writer.commit()?;
+            index_writer.add_document(doc!(int_field=>20_u64, multi_numbers => 20_u64))?;
+
+            let in_val = 1u64;
+            index_writer.add_document(doc!(int_field=>in_val, text_field=> "deleteme" , text_field => "ok text more text", facet_field=> Facet::from("/book/crime")))?;
+            index_writer.commit()?;
+            let int_vals = [10u64, 5];
+            index_writer.add_document( // position of this doc after delete in desc sorting = [2], in disjunct case [1]
+                doc!(int_field=>int_vals[0], multi_numbers => 10_u64, multi_numbers => 11_u64, text_field=> "blubber", facet_field=> Facet::from("/book/fantasy")),
+            )?;
+            index_writer.add_document(doc!(int_field=>int_vals[1], text_field=> "deleteme"))?;
+            index_writer.add_document(
+                doc!(int_field=>1_000u64, multi_numbers => 1001_u64, multi_numbers => 1002_u64, bytes_field => vec![5, 5], text_field => "the biggest num")
+            )?;
+
+            index_writer.delete_term(Term::from_field_text(text_field, "deleteme"));
+            index_writer.commit()?;
+        }
+
+        // Merging the segments
+        {
+            let segment_ids = index.searchable_segment_ids()?;
+            let mut index_writer: IndexWriter = index.writer_for_tests()?;
+            index_writer.merge(&segment_ids).wait()?;
+            index_writer.wait_merging_threads()?;
+        }
+        Ok(index)
+    }
+
+    #[test]
+    fn test_merge_index() {
+        let index = create_test_index(Some(IndexSettings {
+            ..Default::default()
+        }))
+        .unwrap();
+
+        let reader = index.reader().unwrap();
+        let searcher = reader.searcher();
+        assert_eq!(searcher.segment_readers().len(), 1);
+        let segment_reader = searcher.segment_readers().last().unwrap();
+
+        let searcher = index.reader().unwrap().searcher();
+        {
+            let my_text_field = index.schema().get_field("text_field").unwrap();
+
+            let do_search = |term: &str| {
+                let query = QueryParser::for_index(&index, vec![my_text_field])
+                    .parse_query(term)
+                    .unwrap();
+                let top_docs: Vec<(f32, DocAddress)> =
+                    searcher.search(&query, &TopDocs::with_limit(3)).unwrap();

+                top_docs.iter().map(|el| el.1.doc_id).collect::<Vec<_>>()
+            };
+
+            assert_eq!(do_search("some"), vec![1]);
+            assert_eq!(do_search("blubber"), vec![3]);
+            assert_eq!(do_search("biggest"), vec![4]);
+        }
+
+        // postings file
+        {
+            let my_text_field = index.schema().get_field("text_field").unwrap();
+            let term_a = Term::from_field_text(my_text_field, "text");
+            let inverted_index = segment_reader.inverted_index(my_text_field).unwrap();
+            let mut postings = inverted_index
+                .read_postings(&term_a, IndexRecordOption::WithFreqsAndPositions)
+                .unwrap()
+                .unwrap();
+            assert_eq!(postings.doc_freq(), 2);
+            let fallback_bitset = AliveBitSet::for_test_from_deleted_docs(&[0], 100);
+            assert_eq!(
+                postings.doc_freq_given_deletes(
+                    segment_reader.alive_bitset().unwrap_or(&fallback_bitset)
+                ),
+                2
+            );
+
+            assert_eq!(postings.term_freq(), 1);
+            let mut output = vec![];
+            postings.positions(&mut output);
+            assert_eq!(output, vec![1]);
+            postings.advance();
+
+            assert_eq!(postings.term_freq(), 2);
+            postings.positions(&mut output);
+            assert_eq!(output, vec![1, 3]);
+        }
+    }
+}
diff --git a/src/indexer/merger.rs b/src/indexer/merger.rs
index f2955d3a3b..63ec869aa3 100644
--- a/src/indexer/merger.rs
+++ b/src/indexer/merger.rs
@@ -1,8 +1,7 @@
 use std::sync::Arc;

 use columnar::{
-    ColumnType, ColumnValues, ColumnarReader, MergeRowOrder, RowAddr, ShuffleMergeOrder,
-    StackMergeOrder,
+    ColumnType, ColumnarReader, MergeRowOrder, RowAddr, ShuffleMergeOrder, StackMergeOrder,
 };
 use common::ReadOnlyBitSet;
 use itertools::Itertools;
@@ -11,7 +10,7 @@ use measure_time::debug_time;

 use crate::directory::WritePtr;
 use crate::docset::{DocSet, TERMINATED};
 use crate::error::DataCorruption;
-use crate::fastfield::{AliveBitSet, FastFieldNotAvailableError};
+use crate::fastfield::AliveBitSet;
 use crate::fieldnorm::{FieldNormReader, FieldNormReaders, FieldNormsSerializer, FieldNormsWriter};
 use crate::index::{Segment, SegmentComponent, SegmentReader};
 use crate::indexer::doc_id_mapping::{MappingType, SegmentDocIdMapping};
@@ -20,9 +19,7 @@ use crate::postings::{InvertedIndexSerializer, Postings, SegmentPostings};
 use crate::schema::{value_type_to_column_type, Field, FieldType, Schema};
 use crate::store::StoreWriter;
 use crate::termdict::{TermMerger, TermOrdinal};
-use crate::{
-    DocAddress, DocId, IndexSettings, IndexSortByField, InvertedIndexReader, Order, SegmentOrdinal,
-};
+use crate::{DocAddress, DocId, InvertedIndexReader};

 /// Segment's max doc must be `< MAX_DOC_LIMIT`.
 ///
@@ -80,7 +77,6 @@ fn estimate_total_num_tokens(readers: &[SegmentReader], field: Field) -> crate::
 }

 pub struct IndexMerger {
-    index_settings: IndexSettings,
     schema: Schema,
     pub(crate) readers: Vec<SegmentReader>,
     max_doc: u32,
@@ -116,7 +112,7 @@ fn convert_to_merge_order(
 ) -> MergeRowOrder {
     match doc_id_mapping.mapping_type() {
         MappingType::Stacked => MergeRowOrder::Stack(StackMergeOrder::stack(columnars)),
-        MappingType::StackedWithDeletes | MappingType::Shuffled => {
+        MappingType::StackedWithDeletes => {
             // RUST/LLVM is amazing. The following conversion is actually a no-op:
             // no allocation, no copy.
             let new_row_id_to_old_row_id: Vec<RowAddr> = doc_id_mapping
@@ -149,13 +145,9 @@ fn extract_fast_field_required_columns(schema: &Schema) -> Vec<(String, ColumnTy
 }

 impl IndexMerger {
-    pub fn open(
-        schema: Schema,
-        index_settings: IndexSettings,
-        segments: &[Segment],
-    ) -> crate::Result<IndexMerger> {
+    pub fn open(schema: Schema, segments: &[Segment]) -> crate::Result<IndexMerger> {
         let alive_bitset = segments.iter().map(|_| None).collect_vec();
-        Self::open_with_custom_alive_set(schema, index_settings, segments, alive_bitset)
+        Self::open_with_custom_alive_set(schema, segments, alive_bitset)
     }

     // Create merge with a custom delete set.
@@ -172,7 +164,6 @@ impl IndexMerger {
     // segments and partitions them e.g. by a value in a field.
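     // For illustration only (not part of this patch): a caller can split
     // documents out of a merge by passing one alive bitset per input
     // segment, built here from a list of deleted doc ids. A minimal sketch,
     // assuming a `merged_segment` handle and reusing
     // `AliveBitSet::for_test_from_deleted_docs` from the tests above:
     //
     //     let alive_bitsets: Vec<Option<AliveBitSet>> = segments
     //         .iter()
     //         .map(|_| Some(AliveBitSet::for_test_from_deleted_docs(&[0], 100)))
     //         .collect();
     //     let merger =
     //         IndexMerger::open_with_custom_alive_set(schema, &segments[..], alive_bitsets)?;
     //     let num_docs = merger.write(SegmentSerializer::for_segment(merged_segment)?)?;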
pub fn open_with_custom_alive_set( schema: Schema, - index_settings: IndexSettings, segments: &[Segment], alive_bitset_opt: Vec>, ) -> crate::Result { @@ -186,9 +177,6 @@ impl IndexMerger { } let max_doc = readers.iter().map(|reader| reader.num_docs()).sum(); - if let Some(sort_by_field) = index_settings.sort_by_field.as_ref() { - readers = Self::sort_readers_by_min_sort_field(readers, sort_by_field)?; - } // sort segments by their natural sort setting if max_doc >= MAX_DOC_LIMIT { let err_msg = format!( @@ -198,37 +186,12 @@ impl IndexMerger { return Err(crate::TantivyError::InvalidArgument(err_msg)); } Ok(IndexMerger { - index_settings, schema, readers, max_doc, }) } - fn sort_readers_by_min_sort_field( - readers: Vec, - sort_by_field: &IndexSortByField, - ) -> crate::Result> { - // presort the readers by their min_values, so that when they are disjunct, we can use - // the regular merge logic (implicitly sorted) - let mut readers_with_min_sort_values = readers - .into_iter() - .map(|reader| { - let accessor = Self::get_sort_field_accessor(&reader, sort_by_field)?; - Ok((reader, accessor.min_value())) - }) - .collect::>>()?; - if sort_by_field.order.is_asc() { - readers_with_min_sort_values.sort_by_key(|(_, min_val)| *min_val); - } else { - readers_with_min_sort_values.sort_by_key(|(_, min_val)| std::cmp::Reverse(*min_val)); - } - Ok(readers_with_min_sort_values - .into_iter() - .map(|(reader, _)| reader) - .collect()) - } - fn write_fieldnorms( &self, mut fieldnorms_serializer: FieldNormsSerializer, @@ -276,128 +239,6 @@ impl IndexMerger { Ok(()) } - /// Checks if the readers are disjunct for their sort property and in the correct order to be - /// able to just stack them. - pub(crate) fn is_disjunct_and_sorted_on_sort_property( - &self, - sort_by_field: &IndexSortByField, - ) -> crate::Result { - let reader_ordinal_and_field_accessors = - self.get_reader_with_sort_field_accessor(sort_by_field)?; - - let everything_is_in_order = reader_ordinal_and_field_accessors - .into_iter() - .map(|(_, col)| Arc::new(col)) - .tuple_windows() - .all(|(field_accessor1, field_accessor2)| { - if sort_by_field.order.is_asc() { - field_accessor1.max_value() <= field_accessor2.min_value() - } else { - field_accessor1.min_value() >= field_accessor2.max_value() - } - }); - Ok(everything_is_in_order) - } - - pub(crate) fn get_sort_field_accessor( - reader: &SegmentReader, - sort_by_field: &IndexSortByField, - ) -> crate::Result> { - reader.schema().get_field(&sort_by_field.field)?; - let (value_accessor, _column_type) = reader - .fast_fields() - .u64_lenient(&sort_by_field.field)? - .ok_or_else(|| FastFieldNotAvailableError { - field_name: sort_by_field.field.to_string(), - })?; - Ok(value_accessor.first_or_default_col(0u64)) - } - /// Collecting value_accessors into a vec to bind the lifetime. - pub(crate) fn get_reader_with_sort_field_accessor( - &self, - sort_by_field: &IndexSortByField, - ) -> crate::Result)>> { - let reader_ordinal_and_field_accessors = self - .readers - .iter() - .enumerate() - .map(|(reader_ordinal, _)| reader_ordinal as SegmentOrdinal) - .map(|reader_ordinal: SegmentOrdinal| { - let value_accessor = Self::get_sort_field_accessor( - &self.readers[reader_ordinal as usize], - sort_by_field, - )?; - Ok((reader_ordinal, value_accessor)) - }) - .collect::>>()?; - Ok(reader_ordinal_and_field_accessors) - } - - /// Generates the doc_id mapping where position in the vec=new - /// doc_id. - /// ReaderWithOrdinal will include the ordinal position of the - /// reader in self.readers. 
- pub(crate) fn generate_doc_id_mapping_with_sort_by_field( - &self, - sort_by_field: &IndexSortByField, - ) -> crate::Result { - let reader_ordinal_and_field_accessors = - self.get_reader_with_sort_field_accessor(sort_by_field)?; - // Loading the field accessor on demand causes a 15x regression - - // create iterators over segment/sort_accessor/doc_id tuple - let doc_id_reader_pair = - reader_ordinal_and_field_accessors - .iter() - .map(|(reader_ord, ff_reader)| { - let reader = &self.readers[*reader_ord as usize]; - reader - .doc_ids_alive() - .map(move |doc_id| (doc_id, reader_ord, ff_reader)) - }); - - let total_num_new_docs = self - .readers - .iter() - .map(|reader| reader.num_docs() as usize) - .sum(); - - let mut sorted_doc_ids: Vec = Vec::with_capacity(total_num_new_docs); - - // create iterator tuple of (old doc_id, reader) in order of the new doc_ids - sorted_doc_ids.extend( - doc_id_reader_pair - .into_iter() - .kmerge_by(|a, b| { - let val1 = a.2.get_val(a.0); - let val2 = b.2.get_val(b.0); - if sort_by_field.order == Order::Asc { - val1 < val2 - } else { - val1 > val2 - } - }) - .map(|(doc_id, &segment_ord, _)| DocAddress { - doc_id, - segment_ord, - }), - ); - - let alive_bitsets: Vec> = self - .readers - .iter() - .map(|segment_reader| { - let alive_bitset = segment_reader.alive_bitset()?; - Some(alive_bitset.bitset().clone()) - }) - .collect(); - Ok(SegmentDocIdMapping::new( - sorted_doc_ids, - MappingType::Shuffled, - alive_bitsets, - )) - } - /// Creates a mapping if the segments are stacked. this is helpful to merge codelines between /// index sorting and the others pub(crate) fn get_doc_id_from_concatenated_data(&self) -> crate::Result { @@ -515,7 +356,6 @@ impl IndexMerger { ); let mut segment_postings_containing_the_term: Vec<(usize, SegmentPostings)> = vec![]; - let mut doc_id_and_positions = vec![]; while merged_terms.advance() { segment_postings_containing_the_term.clear(); @@ -611,37 +451,13 @@ impl IndexMerger { 0u32 }; - // if doc_id_mapping exists, the doc_ids are reordered, they are - // not just stacked. The field serializer expects monotonically increasing - // doc_ids, so we collect and sort them first, before writing. - // - // I think this is not strictly necessary, it would be possible to - // avoid the loading into a vec via some form of kmerge, but then the merge - // logic would deviate much more from the stacking case (unsorted index) - if !doc_id_mapping.is_trivial() { - doc_id_and_positions.push(( - remapped_doc_id, - term_freq, - positions_buffer.to_vec(), - )); - } else { - let delta_positions = delta_computer.compute_delta(&positions_buffer); - field_serializer.write_doc(remapped_doc_id, term_freq, delta_positions); - } + let delta_positions = delta_computer.compute_delta(&positions_buffer); + field_serializer.write_doc(remapped_doc_id, term_freq, delta_positions); } doc = segment_postings.advance(); } } - if !doc_id_mapping.is_trivial() { - doc_id_and_positions.sort_unstable_by_key(|&(doc_id, _, _)| doc_id); - - for (doc_id, term_freq, positions) in &doc_id_and_positions { - let delta_positions = delta_computer.compute_delta(positions); - field_serializer.write_doc(*doc_id, *term_freq, delta_positions); - } - doc_id_and_positions.clear(); - } // closing the term. 
field_serializer.close_term()?; } @@ -670,47 +486,13 @@ impl IndexMerger { Ok(()) } - fn write_storable_fields( - &self, - store_writer: &mut StoreWriter, - doc_id_mapping: &SegmentDocIdMapping, - ) -> crate::Result<()> { + fn write_storable_fields(&self, store_writer: &mut StoreWriter) -> crate::Result<()> { debug_time!("write-storable-fields"); debug!("write-storable-field"); - if !doc_id_mapping.is_trivial() { - debug!("non-trivial-doc-id-mapping"); - - let store_readers: Vec<_> = self - .readers - .iter() - .map(|reader| reader.get_store_reader(50)) - .collect::>()?; - - let mut document_iterators: Vec<_> = store_readers - .iter() - .enumerate() - .map(|(i, store)| store.iter_raw(self.readers[i].alive_bitset())) - .collect(); - - for old_doc_addr in doc_id_mapping.iter_old_doc_addrs() { - let doc_bytes_it = &mut document_iterators[old_doc_addr.segment_ord as usize]; - if let Some(doc_bytes_res) = doc_bytes_it.next() { - let doc_bytes = doc_bytes_res?; - store_writer.store_bytes(&doc_bytes)?; - } else { - return Err(DataCorruption::comment_only(format!( - "unexpected missing document in docstore on merge, doc address \ - {old_doc_addr:?}", - )) - .into()); - } - } - } else { - debug!("trivial-doc-id-mapping"); - for reader in &self.readers { - let store_reader = reader.get_store_reader(1)?; - if reader.has_deletes() + for reader in &self.readers { + let store_reader = reader.get_store_reader(1)?; + if reader.has_deletes() // If there is not enough data in the store, we avoid stacking in order to // avoid creating many small blocks in the doc store. Once we have 5 full blocks, // we start stacking. In the worst case 2/7 of the blocks would be very small. @@ -726,14 +508,13 @@ impl IndexMerger { // take 7 in order to not walk over all checkpoints. || store_reader.block_checkpoints().take(7).count() < 6 || store_reader.decompressor() != store_writer.compressor().into() - { - for doc_bytes_res in store_reader.iter_raw(reader.alive_bitset()) { - let doc_bytes = doc_bytes_res?; - store_writer.store_bytes(&doc_bytes)?; - } - } else { - store_writer.stack(store_reader)?; + { + for doc_bytes_res in store_reader.iter_raw(reader.alive_bitset()) { + let doc_bytes = doc_bytes_res?; + store_writer.store_bytes(&doc_bytes)?; } + } else { + store_writer.stack(store_reader)?; } } Ok(()) @@ -745,18 +526,7 @@ impl IndexMerger { /// # Returns /// The number of documents in the resulting segment. pub fn write(&self, mut serializer: SegmentSerializer) -> crate::Result { - let doc_id_mapping = if let Some(sort_by_field) = self.index_settings.sort_by_field.as_ref() - { - // If the documents are already sorted and stackable, we ignore the mapping and execute - // it as if there was no sorting - if self.is_disjunct_and_sorted_on_sort_property(sort_by_field)? { - self.get_doc_id_from_concatenated_data()? - } else { - self.generate_doc_id_mapping_with_sort_by_field(sort_by_field)? - } - } else { - self.get_doc_id_from_concatenated_data()? 
- }; + let doc_id_mapping = self.get_doc_id_from_concatenated_data()?; debug!("write-fieldnorms"); if let Some(fieldnorms_serializer) = serializer.extract_fieldnorms_serializer() { self.write_fieldnorms(fieldnorms_serializer, &doc_id_mapping)?; @@ -773,7 +543,7 @@ impl IndexMerger { )?; debug!("write-storagefields"); - self.write_storable_fields(serializer.get_store_writer(), &doc_id_mapping)?; + self.write_storable_fields(serializer.get_store_writer())?; debug!("write-fastfields"); self.write_fast_fields(serializer.get_fast_field_write(), doc_id_mapping)?; @@ -805,7 +575,7 @@ mod tests { use crate::time::OffsetDateTime; use crate::{ assert_nearly_equals, schema, DateTime, DocAddress, DocId, DocSet, IndexSettings, - IndexSortByField, IndexWriter, Order, Searcher, + IndexWriter, Searcher, }; #[test] @@ -1278,60 +1048,6 @@ mod tests { test_merge_facets(None, true) } - #[test] - fn test_merge_facets_sort_asc() { - // In the merge case this will go through the doc_id mapping code - test_merge_facets( - Some(IndexSettings { - sort_by_field: Some(IndexSortByField { - field: "intval".to_string(), - order: Order::Desc, - }), - ..Default::default() - }), - true, - ); - // In the merge case this will not go through the doc_id mapping code, because the data - // sorted and disjunct - test_merge_facets( - Some(IndexSettings { - sort_by_field: Some(IndexSortByField { - field: "intval".to_string(), - order: Order::Desc, - }), - ..Default::default() - }), - false, - ); - } - - #[test] - fn test_merge_facets_sort_desc() { - // In the merge case this will go through the doc_id mapping code - test_merge_facets( - Some(IndexSettings { - sort_by_field: Some(IndexSortByField { - field: "intval".to_string(), - order: Order::Desc, - }), - ..Default::default() - }), - true, - ); - // In the merge case this will not go through the doc_id mapping code, because the data - // sorted and disjunct - test_merge_facets( - Some(IndexSettings { - sort_by_field: Some(IndexSortByField { - field: "intval".to_string(), - order: Order::Desc, - }), - ..Default::default() - }), - false, - ); - } - // force_segment_value_overlap forces the int value for sorting to have overlapping min and max // ranges between segments so that merge algorithm can't apply certain optimizations fn test_merge_facets(index_settings: Option, force_segment_value_overlap: bool) { diff --git a/src/indexer/merger_sorted_index_test.rs b/src/indexer/merger_sorted_index_test.rs deleted file mode 100644 index a23dec40b8..0000000000 --- a/src/indexer/merger_sorted_index_test.rs +++ /dev/null @@ -1,579 +0,0 @@ -#[cfg(test)] -mod tests { - use crate::collector::TopDocs; - use crate::fastfield::AliveBitSet; - use crate::index::Index; - use crate::postings::Postings; - use crate::query::QueryParser; - use crate::schema::{ - self, BytesOptions, Facet, FacetOptions, IndexRecordOption, NumericOptions, - TextFieldIndexing, TextOptions, Value, - }; - use crate::{ - DocAddress, DocSet, IndexSettings, IndexSortByField, IndexWriter, Order, TantivyDocument, - Term, - }; - - fn create_test_index_posting_list_issue(index_settings: Option) -> Index { - let mut schema_builder = schema::Schema::builder(); - let int_options = NumericOptions::default().set_fast().set_indexed(); - let int_field = schema_builder.add_u64_field("intval", int_options); - - let facet_field = schema_builder.add_facet_field("facet", FacetOptions::default()); - - let schema = schema_builder.build(); - - let mut index_builder = Index::builder().schema(schema); - if let Some(settings) = index_settings { - 
index_builder = index_builder.settings(settings); - } - let index = index_builder.create_in_ram().unwrap(); - - { - let mut index_writer: IndexWriter = index.writer_for_tests().unwrap(); - index_writer - .add_document(doc!(int_field=>3_u64, facet_field=> Facet::from("/crime"))) - .unwrap(); - index_writer - .add_document(doc!(int_field=>6_u64, facet_field=> Facet::from("/crime"))) - .unwrap(); - index_writer.commit().unwrap(); - index_writer - .add_document(doc!(int_field=>5_u64, facet_field=> Facet::from("/fanta"))) - .unwrap(); - index_writer.commit().unwrap(); - } - - // Merging the segments - { - let segment_ids = index - .searchable_segment_ids() - .expect("Searchable segments failed."); - let mut index_writer: IndexWriter = index.writer_for_tests().unwrap(); - assert!(index_writer.merge(&segment_ids).wait().is_ok()); - assert!(index_writer.wait_merging_threads().is_ok()); - } - index - } - - // force_disjunct_segment_sort_values forces the field, by which the index is sorted have - // disjunct ranges between segments, e.g. values in segment [1-3] [10 - 20] [50 - 500] - fn create_test_index( - index_settings: Option, - force_disjunct_segment_sort_values: bool, - ) -> crate::Result { - let mut schema_builder = schema::Schema::builder(); - let int_options = NumericOptions::default() - .set_fast() - .set_stored() - .set_indexed(); - let int_field = schema_builder.add_u64_field("intval", int_options); - - let bytes_options = BytesOptions::default().set_fast().set_indexed(); - let bytes_field = schema_builder.add_bytes_field("bytes", bytes_options); - let facet_field = schema_builder.add_facet_field("facet", FacetOptions::default()); - - let multi_numbers = - schema_builder.add_u64_field("multi_numbers", NumericOptions::default().set_fast()); - let text_field_options = TextOptions::default() - .set_indexing_options( - TextFieldIndexing::default() - .set_index_option(schema::IndexRecordOption::WithFreqsAndPositions), - ) - .set_stored(); - let text_field = schema_builder.add_text_field("text_field", text_field_options); - let schema = schema_builder.build(); - - let mut index_builder = Index::builder().schema(schema); - if let Some(settings) = index_settings { - index_builder = index_builder.settings(settings); - } - let index = index_builder.create_in_ram()?; - - { - let mut index_writer = index.writer_for_tests()?; - - // segment 1 - range 1-3 - index_writer.add_document(doc!(int_field=>1_u64))?; - index_writer.add_document( - doc!(int_field=>3_u64, multi_numbers => 3_u64, multi_numbers => 4_u64, bytes_field => vec![1, 2, 3], text_field => "some text", facet_field=> Facet::from("/book/crime")), - )?; - index_writer.add_document( - doc!(int_field=>1_u64, text_field=> "deleteme", text_field => "ok text more text"), - )?; - index_writer.add_document( - doc!(int_field=>2_u64, multi_numbers => 2_u64, multi_numbers => 3_u64, text_field => "ok text more text"), - )?; - - index_writer.commit()?; - // segment 2 - range 1-20 , with force_disjunct_segment_sort_values 10-20 - index_writer.add_document(doc!(int_field=>20_u64, multi_numbers => 20_u64))?; - - let in_val = if force_disjunct_segment_sort_values { - 10_u64 - } else { - 1 - }; - index_writer.add_document(doc!(int_field=>in_val, text_field=> "deleteme" , text_field => "ok text more text", facet_field=> Facet::from("/book/crime")))?; - index_writer.commit()?; - // segment 3 - range 5-1000, with force_disjunct_segment_sort_values 50-1000 - let int_vals = if force_disjunct_segment_sort_values { - [100_u64, 50] - } else { - [10, 5] - }; - 
index_writer.add_document( // position of this doc after delete in desc sorting = [2], in disjunct case [1] - doc!(int_field=>int_vals[0], multi_numbers => 10_u64, multi_numbers => 11_u64, text_field=> "blubber", facet_field=> Facet::from("/book/fantasy")), - )?; - index_writer.add_document(doc!(int_field=>int_vals[1], text_field=> "deleteme"))?; - index_writer.add_document( - doc!(int_field=>1_000u64, multi_numbers => 1001_u64, multi_numbers => 1002_u64, bytes_field => vec![5, 5],text_field => "the biggest num") - )?; - - index_writer.delete_term(Term::from_field_text(text_field, "deleteme")); - index_writer.commit()?; - } - - // Merging the segments - { - let segment_ids = index.searchable_segment_ids()?; - let mut index_writer: IndexWriter = index.writer_for_tests()?; - index_writer.merge(&segment_ids).wait()?; - index_writer.wait_merging_threads()?; - } - Ok(index) - } - - #[test] - fn test_merge_sorted_postinglist_sort_issue() { - create_test_index_posting_list_issue(Some(IndexSettings { - sort_by_field: Some(IndexSortByField { - field: "intval".to_string(), - order: Order::Desc, - }), - ..Default::default() - })); - } - - #[test] - fn test_merge_sorted_index_desc_not_disjunct() { - test_merge_sorted_index_desc_(false); - } - - #[test] - fn test_merge_sorted_index_desc_disjunct() { - test_merge_sorted_index_desc_(true); - } - - fn test_merge_sorted_index_desc_(force_disjunct_segment_sort_values: bool) { - let index = create_test_index( - Some(IndexSettings { - sort_by_field: Some(IndexSortByField { - field: "intval".to_string(), - order: Order::Desc, - }), - ..Default::default() - }), - force_disjunct_segment_sort_values, - ) - .unwrap(); - - let int_field = index.schema().get_field("intval").unwrap(); - let reader = index.reader().unwrap(); - - let searcher = reader.searcher(); - assert_eq!(searcher.segment_readers().len(), 1); - let segment_reader = searcher.segment_readers().last().unwrap(); - - let fast_fields = segment_reader.fast_fields(); - let fast_field = fast_fields.u64("intval").unwrap(); - assert_eq!(fast_field.first(5), Some(1u64)); - assert_eq!(fast_field.first(4), Some(2u64)); - assert_eq!(fast_field.first(3), Some(3u64)); - if force_disjunct_segment_sort_values { - assert_eq!(fast_field.first(2), Some(20u64)); - assert_eq!(fast_field.first(1), Some(100u64)); - } else { - assert_eq!(fast_field.first(2), Some(10u64)); - assert_eq!(fast_field.first(1), Some(20u64)); - } - assert_eq!(fast_field.first(0), Some(1_000u64)); - - // test new field norm mapping - { - let my_text_field = index.schema().get_field("text_field").unwrap(); - let fieldnorm_reader = segment_reader.get_fieldnorms_reader(my_text_field).unwrap(); - assert_eq!(fieldnorm_reader.fieldnorm(0), 3); // the biggest num - if force_disjunct_segment_sort_values { - assert_eq!(fieldnorm_reader.fieldnorm(1), 1); // blubber - assert_eq!(fieldnorm_reader.fieldnorm(2), 0); - } else { - assert_eq!(fieldnorm_reader.fieldnorm(1), 0); - assert_eq!(fieldnorm_reader.fieldnorm(2), 1); // blubber - } - assert_eq!(fieldnorm_reader.fieldnorm(3), 2); // some text - assert_eq!(fieldnorm_reader.fieldnorm(5), 0); - } - - let my_text_field = index.schema().get_field("text_field").unwrap(); - let searcher = index.reader().unwrap().searcher(); - { - let my_text_field = index.schema().get_field("text_field").unwrap(); - - let do_search = |term: &str| { - let query = QueryParser::for_index(&index, vec![my_text_field]) - .parse_query(term) - .unwrap(); - let top_docs: Vec<(f32, DocAddress)> = - searcher.search(&query, 
&TopDocs::with_limit(3)).unwrap(); - - top_docs.iter().map(|el| el.1.doc_id).collect::>() - }; - - assert_eq!(do_search("some"), vec![3]); - if force_disjunct_segment_sort_values { - assert_eq!(do_search("blubber"), vec![1]); - } else { - assert_eq!(do_search("blubber"), vec![2]); - } - assert_eq!(do_search("biggest"), vec![0]); - } - - // postings file - { - let my_text_field = index.schema().get_field("text_field").unwrap(); - let term_a = Term::from_field_text(my_text_field, "text"); - let inverted_index = segment_reader.inverted_index(my_text_field).unwrap(); - let mut postings = inverted_index - .read_postings(&term_a, IndexRecordOption::WithFreqsAndPositions) - .unwrap() - .unwrap(); - - assert_eq!(postings.doc_freq(), 2); - let fallback_bitset = AliveBitSet::for_test_from_deleted_docs(&[0], 100); - assert_eq!( - postings.doc_freq_given_deletes( - segment_reader.alive_bitset().unwrap_or(&fallback_bitset) - ), - 2 - ); - - assert_eq!(postings.term_freq(), 1); - let mut output = vec![]; - postings.positions(&mut output); - assert_eq!(output, vec![1]); - postings.advance(); - - assert_eq!(postings.term_freq(), 2); - postings.positions(&mut output); - assert_eq!(output, vec![1, 3]); - } - - // access doc store - { - let blubber_pos = if force_disjunct_segment_sort_values { - 1 - } else { - 2 - }; - let doc = searcher - .doc::(DocAddress::new(0, blubber_pos)) - .unwrap(); - assert_eq!( - doc.get_first(my_text_field).unwrap().as_value().as_str(), - Some("blubber") - ); - let doc = searcher - .doc::(DocAddress::new(0, 0)) - .unwrap(); - assert_eq!( - doc.get_first(int_field).unwrap().as_value().as_u64(), - Some(1000) - ); - } - } - - #[test] - fn test_merge_unsorted_index() { - let index = create_test_index( - Some(IndexSettings { - ..Default::default() - }), - false, - ) - .unwrap(); - - let reader = index.reader().unwrap(); - let searcher = reader.searcher(); - assert_eq!(searcher.segment_readers().len(), 1); - let segment_reader = searcher.segment_readers().last().unwrap(); - - let searcher = index.reader().unwrap().searcher(); - { - let my_text_field = index.schema().get_field("text_field").unwrap(); - - let do_search = |term: &str| { - let query = QueryParser::for_index(&index, vec![my_text_field]) - .parse_query(term) - .unwrap(); - let top_docs: Vec<(f32, DocAddress)> = - searcher.search(&query, &TopDocs::with_limit(3)).unwrap(); - - top_docs.iter().map(|el| el.1.doc_id).collect::>() - }; - - assert_eq!(do_search("some"), vec![1]); - assert_eq!(do_search("blubber"), vec![3]); - assert_eq!(do_search("biggest"), vec![4]); - } - - // postings file - { - let my_text_field = index.schema().get_field("text_field").unwrap(); - let term_a = Term::from_field_text(my_text_field, "text"); - let inverted_index = segment_reader.inverted_index(my_text_field).unwrap(); - let mut postings = inverted_index - .read_postings(&term_a, IndexRecordOption::WithFreqsAndPositions) - .unwrap() - .unwrap(); - assert_eq!(postings.doc_freq(), 2); - let fallback_bitset = AliveBitSet::for_test_from_deleted_docs(&[0], 100); - assert_eq!( - postings.doc_freq_given_deletes( - segment_reader.alive_bitset().unwrap_or(&fallback_bitset) - ), - 2 - ); - - assert_eq!(postings.term_freq(), 1); - let mut output = vec![]; - postings.positions(&mut output); - assert_eq!(output, vec![1]); - postings.advance(); - - assert_eq!(postings.term_freq(), 2); - postings.positions(&mut output); - assert_eq!(output, vec![1, 3]); - } - } - - // #[test] - // fn test_merge_sorted_index_asc() { - // let index = create_test_index( - // 
Some(IndexSettings { - // sort_by_field: Some(IndexSortByField { - // field: "intval".to_string(), - // order: Order::Asc, - // }), - // ..Default::default() - // }), - // false, - // ) - // .unwrap(); - - // let int_field = index.schema().get_field("intval").unwrap(); - // let multi_numbers = index.schema().get_field("multi_numbers").unwrap(); - // let bytes_field = index.schema().get_field("bytes").unwrap(); - // let reader = index.reader().unwrap(); - // let searcher = reader.searcher(); - // assert_eq!(searcher.segment_readers().len(), 1); - // let segment_reader = searcher.segment_readers().last().unwrap(); - - // let fast_fields = segment_reader.fast_fields(); - // let fast_field = fast_fields.u64(int_field).unwrap(); - // assert_eq!(fast_field.get_val(0), 1u64); - // assert_eq!(fast_field.get_val(1), 2u64); - // assert_eq!(fast_field.get_val(2), 3u64); - // assert_eq!(fast_field.get_val(3), 10u64); - // assert_eq!(fast_field.get_val(4), 20u64); - // assert_eq!(fast_field.get_val(5), 1_000u64); - - // let get_vals = |fast_field: &MultiValuedFastFieldReader, doc_id: u32| -> Vec { - // let mut vals = vec![]; - // fast_field.get_vals(doc_id, &mut vals); - // vals - // }; - // let fast_fields = segment_reader.fast_fields(); - // let fast_field = fast_fields.u64s(multi_numbers).unwrap(); - // assert_eq!(&get_vals(&fast_field, 0), &[] as &[u64]); - // assert_eq!(&get_vals(&fast_field, 1), &[2, 3]); - // assert_eq!(&get_vals(&fast_field, 2), &[3, 4]); - // assert_eq!(&get_vals(&fast_field, 3), &[10, 11]); - // assert_eq!(&get_vals(&fast_field, 4), &[20]); - // assert_eq!(&get_vals(&fast_field, 5), &[1001, 1002]); - - // let fast_field = fast_fields.bytes(bytes_field).unwrap(); - // assert_eq!(fast_field.get_bytes(0), &[] as &[u8]); - // assert_eq!(fast_field.get_bytes(2), &[1, 2, 3]); - // assert_eq!(fast_field.get_bytes(5), &[5, 5]); - - // // test new field norm mapping - // { - // let my_text_field = index.schema().get_field("text_field").unwrap(); - // let fieldnorm_reader = segment_reader.get_fieldnorms_reader(my_text_field).unwrap(); - // assert_eq!(fieldnorm_reader.fieldnorm(0), 0); - // assert_eq!(fieldnorm_reader.fieldnorm(1), 4); - // assert_eq!(fieldnorm_reader.fieldnorm(2), 2); // some text - // assert_eq!(fieldnorm_reader.fieldnorm(3), 1); - // assert_eq!(fieldnorm_reader.fieldnorm(5), 3); // the biggest num - // } - - // let searcher = index.reader().unwrap().searcher(); - // { - // let my_text_field = index.schema().get_field("text_field").unwrap(); - - // let do_search = |term: &str| { - // let query = QueryParser::for_index(&index, vec![my_text_field]) - // .parse_query(term) - // .unwrap(); - // let top_docs: Vec<(f32, DocAddress)> = - // searcher.search(&query, &TopDocs::with_limit(3)).unwrap(); - - // top_docs.iter().map(|el| el.1.doc_id).collect::>() - // }; - - // assert_eq!(do_search("some"), vec![2]); - // assert_eq!(do_search("blubber"), vec![3]); - // assert_eq!(do_search("biggest"), vec![5]); - // } - - // // postings file - // { - // let my_text_field = index.schema().get_field("text_field").unwrap(); - // let term_a = Term::from_field_text(my_text_field, "text"); - // let inverted_index = segment_reader.inverted_index(my_text_field).unwrap(); - // let mut postings = inverted_index - // .read_postings(&term_a, IndexRecordOption::WithFreqsAndPositions) - // .unwrap() - // .unwrap(); - - // assert_eq!(postings.doc_freq(), 2); - // let fallback_bitset = AliveBitSet::for_test_from_deleted_docs(&[0], 100); - // assert_eq!( - // postings.doc_freq_given_deletes( - // 
segment_reader.alive_bitset().unwrap_or(&fallback_bitset) - // ), - // 2 - // ); - - // let mut output = vec![]; - // postings.positions(&mut output); - // assert_eq!(output, vec![1, 3]); - // postings.advance(); - - // postings.positions(&mut output); - // assert_eq!(output, vec![1]); - // } - - // // access doc store - // { - // let doc = searcher.doc(DocAddress::new(0, 0)).unwrap(); - // assert_eq!(doc.get_first(int_field).unwrap().as_u64(), Some(1)); - // let doc = searcher.doc(DocAddress::new(0, 1)).unwrap(); - // assert_eq!(doc.get_first(int_field).unwrap().as_u64(), Some(2)); - // let doc = searcher.doc(DocAddress::new(0, 2)).unwrap(); - // assert_eq!(doc.get_first(int_field).unwrap().as_u64(), Some(3)); - // let doc = searcher.doc(DocAddress::new(0, 3)).unwrap(); - // assert_eq!(doc.get_first(int_field).unwrap().as_u64(), Some(10)); - // let doc = searcher.doc(DocAddress::new(0, 4)).unwrap(); - // assert_eq!(doc.get_first(int_field).unwrap().as_u64(), Some(20)); - // let doc = searcher.doc(DocAddress::new(0, 5)).unwrap(); - // assert_eq!(doc.get_first(int_field).unwrap().as_u64(), Some(1_000)); - // } - // } -} - -#[cfg(all(test, feature = "unstable"))] -mod bench_sorted_index_merge { - - use test::{self, Bencher}; - - use crate::index::Index; - use crate::indexer::merger::IndexMerger; - use crate::schema::{NumericOptions, Schema}; - use crate::{IndexSettings, IndexSortByField, IndexWriter, Order}; - fn create_index(sort_by_field: Option) -> Index { - let mut schema_builder = Schema::builder(); - let int_options = NumericOptions::default().set_fast().set_indexed(); - let int_field = schema_builder.add_u64_field("intval", int_options); - let schema = schema_builder.build(); - - let index_builder = Index::builder().schema(schema).settings(IndexSettings { - sort_by_field, - ..Default::default() - }); - let index = index_builder.create_in_ram().unwrap(); - - { - let mut index_writer: IndexWriter = index.writer_for_tests().unwrap(); - let index_doc = |index_writer: &mut IndexWriter, val: u64| { - index_writer.add_document(doc!(int_field=>val)).unwrap(); - }; - // 3 segments with 10_000 values in the fast fields - for _ in 0..3 { - index_doc(&mut index_writer, 5000); // fix to make it unordered - for i in 0..10_000 { - index_doc(&mut index_writer, i); - } - index_writer.commit().unwrap(); - } - } - index - } - - //#[bench] - // fn create_sorted_index_walk_overkmerge_on_merge_fastfield( - // b: &mut Bencher, - //) -> crate::Result<()> { - // let sort_by_field = IndexSortByField { - // field: "intval".to_string(), - // order: Order::Desc, - //}; - // let index = create_index(Some(sort_by_field.clone())); - // let segments = index.searchable_segments().unwrap(); - // let merger: IndexMerger = - // IndexMerger::open(index.schema(), index.settings().clone(), &segments[..])?; - // let doc_id_mapping = merger.generate_doc_id_mapping(&sort_by_field).unwrap(); - // b.iter(|| { - // let sorted_doc_ids = doc_id_mapping.iter_old_doc_addrs().map(|doc_addr| { - // let reader = &merger.readers[doc_addr.segment_ord as usize]; - // let u64_reader: Arc> = reader - //.fast_fields() - //.typed_fast_field_reader("intval") - //.expect( - //"Failed to find a reader for single fast field. 
This is a tantivy bug and \ - // it should never happen.", - //); - //(doc_addr.doc_id, reader, u64_reader) - //}); - /// add values in order of the new doc_ids - // let mut val = 0; - // for (doc_id, _reader, field_reader) in sorted_doc_ids { - // val = field_reader.get_val(doc_id); - //} - - // val - //}); - - // Ok(()) - //} - #[bench] - fn create_sorted_index_create_doc_id_mapping(b: &mut Bencher) -> crate::Result<()> { - let sort_by_field = IndexSortByField { - field: "intval".to_string(), - order: Order::Desc, - }; - let index = create_index(Some(sort_by_field.clone())); - // let field = index.schema().get_field("intval").unwrap(); - let segments = index.searchable_segments().unwrap(); - let merger: IndexMerger = - IndexMerger::open(index.schema(), index.settings().clone(), &segments[..])?; - b.iter(|| { - merger - .generate_doc_id_mapping_with_sort_by_field(&sort_by_field) - .unwrap(); - }); - - Ok(()) - } -} diff --git a/src/indexer/mod.rs b/src/indexer/mod.rs index 9a062a7400..b3dfb2a6f8 100644 --- a/src/indexer/mod.rs +++ b/src/indexer/mod.rs @@ -13,10 +13,10 @@ mod flat_map_with_buffer; pub(crate) mod index_writer; pub(crate) mod index_writer_status; mod log_merge_policy; +mod merge_index_test; mod merge_operation; pub(crate) mod merge_policy; pub(crate) mod merger; -mod merger_sorted_index_test; pub(crate) mod operation; pub(crate) mod prepared_commit; mod segment_entry; diff --git a/src/indexer/segment_serializer.rs b/src/indexer/segment_serializer.rs index cdb7a8ef50..057a51b10c 100644 --- a/src/indexer/segment_serializer.rs +++ b/src/indexer/segment_serializer.rs @@ -18,27 +18,9 @@ pub struct SegmentSerializer { impl SegmentSerializer { /// Creates a new `SegmentSerializer`. - pub fn for_segment( - mut segment: Segment, - is_in_merge: bool, - ) -> crate::Result { - // If the segment is going to be sorted, we stream the docs first to a temporary file. - // In the merge case this is not necessary because we can kmerge the already sorted - // segments - let remapping_required = segment.index().settings().sort_by_field.is_some() && !is_in_merge; + pub fn for_segment(mut segment: Segment) -> crate::Result { let settings = segment.index().settings().clone(); - let store_writer = if remapping_required { - let store_write = segment.open_write(SegmentComponent::TempStore)?; - StoreWriter::new( - store_write, - crate::store::Compressor::None, - // We want fast random access on the docs, so we choose a small block size. - // If this is zero, the skip index will contain too many checkpoints and - // therefore will be relatively slow. - 16000, - settings.docstore_compress_dedicated_thread, - )? - } else { + let store_writer = { let store_write = segment.open_write(SegmentComponent::Store)?; StoreWriter::new( store_write, @@ -72,10 +54,6 @@ impl SegmentSerializer { &self.segment } - pub fn segment_mut(&mut self) -> &mut Segment { - &mut self.segment - } - /// Accessor to the `PostingsSerializer`. pub fn get_postings_serializer(&mut self) -> &mut InvertedIndexSerializer { &mut self.postings_serializer diff --git a/src/indexer/segment_updater.rs b/src/indexer/segment_updater.rs index 12faba9512..87eaf576ff 100644 --- a/src/indexer/segment_updater.rs +++ b/src/indexer/segment_updater.rs @@ -115,11 +115,10 @@ fn merge( .collect(); // An IndexMerger is like a "view" of our merged segments. - let merger: IndexMerger = - IndexMerger::open(index.schema(), index.settings().clone(), &segments[..])?; + let merger: IndexMerger = IndexMerger::open(index.schema(), &segments[..])?; // ... 
we just serialize this index merger in our new segment to merge the segments. - let segment_serializer = SegmentSerializer::for_segment(merged_segment.clone(), true)?; + let segment_serializer = SegmentSerializer::for_segment(merged_segment.clone())?; let num_docs = merger.write(segment_serializer)?; @@ -220,13 +219,9 @@ pub fn merge_filtered_segments>>( )?; let merged_segment = merged_index.new_segment(); let merged_segment_id = merged_segment.id(); - let merger: IndexMerger = IndexMerger::open_with_custom_alive_set( - merged_index.schema(), - merged_index.settings().clone(), - segments, - filter_doc_ids, - )?; - let segment_serializer = SegmentSerializer::for_segment(merged_segment, true)?; + let merger: IndexMerger = + IndexMerger::open_with_custom_alive_set(merged_index.schema(), segments, filter_doc_ids)?; + let segment_serializer = SegmentSerializer::for_segment(merged_segment)?; let num_docs = merger.write(segment_serializer)?; let segment_meta = merged_index.new_segment_meta(merged_segment_id, num_docs); @@ -1067,7 +1062,6 @@ mod tests { )?; let merger: IndexMerger = IndexMerger::open_with_custom_alive_set( merged_index.schema(), - merged_index.settings().clone(), &segments[..], filter_segments, )?; @@ -1083,7 +1077,6 @@ mod tests { Index::create(RamDirectory::default(), target_schema, target_settings)?; let merger: IndexMerger = IndexMerger::open_with_custom_alive_set( merged_index.schema(), - merged_index.settings().clone(), &segments[..], filter_segments, )?; diff --git a/src/indexer/segment_writer.rs b/src/indexer/segment_writer.rs index fabd72d3a6..439f46aee8 100644 --- a/src/indexer/segment_writer.rs +++ b/src/indexer/segment_writer.rs @@ -3,7 +3,6 @@ use common::JsonPathWriter; use itertools::Itertools; use tokenizer_api::BoxTokenStream; -use super::doc_id_mapping::{get_doc_id_mapping_from_field, DocIdMapping}; use super::operation::AddOperation; use crate::fastfield::FastFieldsWriter; use crate::fieldnorm::{FieldNormReaders, FieldNormsWriter}; @@ -16,7 +15,6 @@ use crate::postings::{ }; use crate::schema::document::{Document, Value}; use crate::schema::{FieldEntry, FieldType, Schema, Term, DATE_TIME_PRECISION_INDEXED}; -use crate::store::{StoreReader, StoreWriter}; use crate::tokenizer::{FacetTokenizer, PreTokenizedStream, TextAnalyzer, Tokenizer}; use crate::{DocId, Opstamp, TantivyError}; @@ -41,20 +39,6 @@ fn compute_initial_table_size(per_thread_memory_budget: usize) -> crate::Result< }) } -fn remap_doc_opstamps( - opstamps: Vec, - doc_id_mapping_opt: Option<&DocIdMapping>, -) -> Vec { - if let Some(doc_id_mapping_opt) = doc_id_mapping_opt { - doc_id_mapping_opt - .iter_old_doc_ids() - .map(|doc| opstamps[doc as usize]) - .collect() - } else { - opstamps - } -} - /// A `SegmentWriter` is in charge of creating segment index from a /// set of documents. /// @@ -90,7 +74,7 @@ impl SegmentWriter { let tokenizer_manager = segment.index().tokenizers().clone(); let tokenizer_manager_fast_field = segment.index().fast_field_tokenizer().clone(); let table_size = compute_initial_table_size(memory_budget_in_bytes)?; - let segment_serializer = SegmentSerializer::for_segment(segment, false)?; + let segment_serializer = SegmentSerializer::for_segment(segment)?; let per_field_postings_writers = PerFieldPostingsWriter::for_schema(&schema); let per_field_text_analyzers = schema .fields() @@ -139,15 +123,6 @@ impl SegmentWriter { /// be used afterwards. 
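 // With index sorting removed, the merge path above reduces to opening an
 // IndexMerger over the source segments and streaming it into the new
 // segment. A minimal sketch of that flow, assuming `index`, `segments`
 // and `merged_segment` are in scope as in the merge() function above:
 //
 //     let merger: IndexMerger = IndexMerger::open(index.schema(), &segments[..])?;
 //     let segment_serializer = SegmentSerializer::for_segment(merged_segment.clone())?;
 //     let num_docs = merger.write(segment_serializer)?;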
pub fn finalize(mut self) -> crate::Result> { self.fieldnorms_writer.fill_up_to_max_doc(self.max_doc); - let mapping: Option = self - .segment_serializer - .segment() - .index() - .settings() - .sort_by_field - .clone() - .map(|sort_by_field| get_doc_id_mapping_from_field(sort_by_field, &self)) - .transpose()?; remap_and_write( self.schema, &self.per_field_postings_writers, @@ -155,10 +130,8 @@ impl SegmentWriter { self.fast_field_writers, &self.fieldnorms_writer, self.segment_serializer, - mapping.as_ref(), )?; - let doc_opstamps = remap_doc_opstamps(self.doc_opstamps, mapping.as_ref()); - Ok(doc_opstamps) + Ok(self.doc_opstamps) } /// Returns an estimation of the current memory usage of the segment writer. @@ -419,11 +392,10 @@ fn remap_and_write( fast_field_writers: FastFieldsWriter, fieldnorms_writer: &FieldNormsWriter, mut serializer: SegmentSerializer, - doc_id_map: Option<&DocIdMapping>, ) -> crate::Result<()> { debug!("remap-and-write"); if let Some(fieldnorms_serializer) = serializer.extract_fieldnorms_serializer() { - fieldnorms_writer.serialize(fieldnorms_serializer, doc_id_map)?; + fieldnorms_writer.serialize(fieldnorms_serializer)?; } let fieldnorm_data = serializer .segment() @@ -434,39 +406,10 @@ fn remap_and_write( schema, per_field_postings_writers, fieldnorm_readers, - doc_id_map, serializer.get_postings_serializer(), )?; debug!("fastfield-serialize"); - fast_field_writers.serialize(serializer.get_fast_field_write(), doc_id_map)?; - - // finalize temp docstore and create version, which reflects the doc_id_map - if let Some(doc_id_map) = doc_id_map { - debug!("resort-docstore"); - let store_write = serializer - .segment_mut() - .open_write(SegmentComponent::Store)?; - let settings = serializer.segment().index().settings(); - let store_writer = StoreWriter::new( - store_write, - settings.docstore_compression, - settings.docstore_blocksize, - settings.docstore_compress_dedicated_thread, - )?; - let old_store_writer = std::mem::replace(&mut serializer.store_writer, store_writer); - old_store_writer.close()?; - let store_read = StoreReader::open( - serializer - .segment() - .open_read(SegmentComponent::TempStore)?, - 1, /* The docstore is configured to have one doc per block, and each doc is accessed - * only once: we don't need caching. 
*/ - )?; - for old_doc_id in doc_id_map.iter_old_doc_ids() { - let doc_bytes = store_read.get_document_bytes(old_doc_id)?; - serializer.get_store_writer().store_bytes(&doc_bytes)?; - } - } + fast_field_writers.serialize(serializer.get_fast_field_write())?; debug!("serializer-close"); serializer.close()?; diff --git a/src/lib.rs b/src/lib.rs index ed6463c222..6ad2981c1a 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -222,8 +222,8 @@ pub use crate::core::{Executor, Searcher, SearcherGeneration}; pub use crate::directory::Directory; #[allow(deprecated)] // Remove with index sorting pub use crate::index::{ - Index, IndexBuilder, IndexMeta, IndexSettings, IndexSortByField, InvertedIndexReader, Order, - Segment, SegmentMeta, SegmentReader, + Index, IndexBuilder, IndexMeta, IndexSettings, InvertedIndexReader, Order, Segment, + SegmentMeta, SegmentReader, }; pub use crate::indexer::{IndexWriter, SingleSegmentIndexWriter}; pub use crate::schema::{Document, TantivyDocument, Term}; diff --git a/src/postings/json_postings_writer.rs b/src/postings/json_postings_writer.rs index ed3d5c24f3..2fab8efa35 100644 --- a/src/postings/json_postings_writer.rs +++ b/src/postings/json_postings_writer.rs @@ -3,7 +3,6 @@ use std::io; use common::json_path_writer::JSON_END_OF_PATH; use stacker::Addr; -use crate::indexer::doc_id_mapping::DocIdMapping; use crate::indexer::path_to_unordered_id::OrderedPathId; use crate::postings::postings_writer::SpecializedPostingsWriter; use crate::postings::recorder::{BufferLender, DocIdRecorder, Recorder}; @@ -62,7 +61,6 @@ impl PostingsWriter for JsonPostingsWriter { &self, term_addrs: &[(Field, OrderedPathId, &[u8], Addr)], ordered_id_to_path: &[&str], - doc_id_map: Option<&DocIdMapping>, ctx: &IndexingContext, serializer: &mut FieldSerializer, ) -> io::Result<()> { @@ -87,7 +85,6 @@ impl PostingsWriter for JsonPostingsWriter { SpecializedPostingsWriter::::serialize_one_term( term_buffer.serialized_value_bytes(), *addr, - doc_id_map, &mut buffer_lender, ctx, serializer, @@ -96,7 +93,6 @@ impl PostingsWriter for JsonPostingsWriter { SpecializedPostingsWriter::::serialize_one_term( term_buffer.serialized_value_bytes(), *addr, - doc_id_map, &mut buffer_lender, ctx, serializer, diff --git a/src/postings/postings_writer.rs b/src/postings/postings_writer.rs index 264392889a..fcbf55f98c 100644 --- a/src/postings/postings_writer.rs +++ b/src/postings/postings_writer.rs @@ -5,7 +5,6 @@ use std::ops::Range; use stacker::Addr; use crate::fieldnorm::FieldNormReaders; -use crate::indexer::doc_id_mapping::DocIdMapping; use crate::indexer::path_to_unordered_id::OrderedPathId; use crate::postings::recorder::{BufferLender, Recorder}; use crate::postings::{ @@ -50,7 +49,6 @@ pub(crate) fn serialize_postings( schema: Schema, per_field_postings_writers: &PerFieldPostingsWriter, fieldnorm_readers: FieldNormReaders, - doc_id_map: Option<&DocIdMapping>, serializer: &mut InvertedIndexSerializer, ) -> crate::Result<()> { // Replace unordered ids by ordered ids to be able to sort @@ -86,7 +84,6 @@ pub(crate) fn serialize_postings( postings_writer.serialize( &term_offsets[byte_offsets], &ordered_id_to_path, - doc_id_map, &ctx, &mut field_serializer, )?; @@ -122,7 +119,6 @@ pub(crate) trait PostingsWriter: Send + Sync { &self, term_addrs: &[(Field, OrderedPathId, &[u8], Addr)], ordered_id_to_path: &[&str], - doc_id_map: Option<&DocIdMapping>, ctx: &IndexingContext, serializer: &mut FieldSerializer, ) -> io::Result<()>; @@ -187,7 +183,6 @@ impl SpecializedPostingsWriter { pub(crate) fn serialize_one_term( term: 
&[u8], addr: Addr, - doc_id_map: Option<&DocIdMapping>, buffer_lender: &mut BufferLender, ctx: &IndexingContext, serializer: &mut FieldSerializer, @@ -195,7 +190,7 @@ impl SpecializedPostingsWriter { let recorder: Rec = ctx.term_index.read(addr); let term_doc_freq = recorder.term_doc_freq().unwrap_or(0u32); serializer.new_term(term, term_doc_freq, recorder.has_term_freq())?; - recorder.serialize(&ctx.arena, doc_id_map, serializer, buffer_lender); + recorder.serialize(&ctx.arena, serializer, buffer_lender); serializer.close_term()?; Ok(()) } @@ -229,13 +224,12 @@ impl PostingsWriter for SpecializedPostingsWriter { &self, term_addrs: &[(Field, OrderedPathId, &[u8], Addr)], _ordered_id_to_path: &[&str], - doc_id_map: Option<&DocIdMapping>, ctx: &IndexingContext, serializer: &mut FieldSerializer, ) -> io::Result<()> { let mut buffer_lender = BufferLender::default(); for (_field, _path_id, term, addr) in term_addrs { - Self::serialize_one_term(term, *addr, doc_id_map, &mut buffer_lender, ctx, serializer)?; + Self::serialize_one_term(term, *addr, &mut buffer_lender, ctx, serializer)?; } Ok(()) } diff --git a/src/postings/recorder.rs b/src/postings/recorder.rs index 767441f641..de7a400f5b 100644 --- a/src/postings/recorder.rs +++ b/src/postings/recorder.rs @@ -1,7 +1,6 @@ use common::read_u32_vint; use stacker::{ExpUnrolledLinkedList, MemoryArena}; -use crate::indexer::doc_id_mapping::DocIdMapping; use crate::postings::FieldSerializer; use crate::DocId; @@ -71,7 +70,6 @@ pub(crate) trait Recorder: Copy + Default + Send + Sync + 'static { fn serialize( &self, arena: &MemoryArena, - doc_id_map: Option<&DocIdMapping>, serializer: &mut FieldSerializer<'_>, buffer_lender: &mut BufferLender, ); @@ -115,26 +113,15 @@ impl Recorder for DocIdRecorder { fn serialize( &self, arena: &MemoryArena, - doc_id_map: Option<&DocIdMapping>, serializer: &mut FieldSerializer<'_>, buffer_lender: &mut BufferLender, ) { - let (buffer, doc_ids) = buffer_lender.lend_all(); + let buffer = buffer_lender.lend_u8(); // TODO avoid reading twice. 
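         // The unrolled linked list stores this term's doc ids as
         // vint-encoded deltas. `read_to_end` copies the raw bytes into the
         // lent buffer, and `get_sum_reader` below prefix-sums the decoded
         // deltas back into absolute doc ids before they are handed to
         // `write_doc`.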
self.stack.read_to_end(arena, buffer); - if let Some(doc_id_map) = doc_id_map { - let iter = get_sum_reader(VInt32Reader::new(&buffer[..])); - doc_ids.extend(iter.map(|old_doc_id| doc_id_map.get_new_doc_id(old_doc_id))); - doc_ids.sort_unstable(); - - for doc in doc_ids { - serializer.write_doc(*doc, 0u32, &[][..]); - } - } else { - let iter = get_sum_reader(VInt32Reader::new(&buffer[..])); - for doc_id in iter { - serializer.write_doc(doc_id, 0u32, &[][..]); - } + let iter = get_sum_reader(VInt32Reader::new(&buffer[..])); + for doc_id in iter { + serializer.write_doc(doc_id, 0u32, &[][..]); } } @@ -194,35 +181,18 @@ impl Recorder for TermFrequencyRecorder { fn serialize( &self, arena: &MemoryArena, - doc_id_map: Option<&DocIdMapping>, serializer: &mut FieldSerializer<'_>, buffer_lender: &mut BufferLender, ) { let buffer = buffer_lender.lend_u8(); self.stack.read_to_end(arena, buffer); let mut u32_it = VInt32Reader::new(&buffer[..]); - if let Some(doc_id_map) = doc_id_map { - let mut doc_id_and_tf = vec![]; - let mut prev_doc = 0; - while let Some(delta_doc_id) = u32_it.next() { - let doc_id = prev_doc + delta_doc_id; - prev_doc = doc_id; - let term_freq = u32_it.next().unwrap_or(self.current_tf); - doc_id_and_tf.push((doc_id_map.get_new_doc_id(doc_id), term_freq)); - } - doc_id_and_tf.sort_unstable_by_key(|&(doc_id, _)| doc_id); - - for (doc_id, tf) in doc_id_and_tf { - serializer.write_doc(doc_id, tf, &[][..]); - } - } else { - let mut prev_doc = 0; - while let Some(delta_doc_id) = u32_it.next() { - let doc_id = prev_doc + delta_doc_id; - prev_doc = doc_id; - let term_freq = u32_it.next().unwrap_or(self.current_tf); - serializer.write_doc(doc_id, term_freq, &[][..]); - } + let mut prev_doc = 0; + while let Some(delta_doc_id) = u32_it.next() { + let doc_id = prev_doc + delta_doc_id; + prev_doc = doc_id; + let term_freq = u32_it.next().unwrap_or(self.current_tf); + serializer.write_doc(doc_id, term_freq, &[][..]); } } @@ -268,14 +238,12 @@ impl Recorder for TfAndPositionRecorder { fn serialize( &self, arena: &MemoryArena, - doc_id_map: Option<&DocIdMapping>, serializer: &mut FieldSerializer<'_>, buffer_lender: &mut BufferLender, ) { let (buffer_u8, buffer_positions) = buffer_lender.lend_all(); self.stack.read_to_end(arena, buffer_u8); let mut u32_it = VInt32Reader::new(&buffer_u8[..]); - let mut doc_id_and_positions = vec![]; let mut prev_doc = 0; while let Some(delta_doc_id) = u32_it.next() { let doc_id = prev_doc + delta_doc_id; @@ -294,19 +262,7 @@ impl Recorder for TfAndPositionRecorder { } } } - if let Some(doc_id_map) = doc_id_map { - // this simple variant to remap may consume to much memory - doc_id_and_positions - .push((doc_id_map.get_new_doc_id(doc_id), buffer_positions.to_vec())); - } else { - serializer.write_doc(doc_id, buffer_positions.len() as u32, buffer_positions); - } - } - if doc_id_map.is_some() { - doc_id_and_positions.sort_unstable_by_key(|&(doc_id, _)| doc_id); - for (doc_id, positions) in doc_id_and_positions { - serializer.write_doc(doc_id, positions.len() as u32, &positions); - } + serializer.write_doc(doc_id, buffer_positions.len() as u32, buffer_positions); } }
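With the doc-id map gone, every recorder now serializes its postings in the order they were recorded, decoding vint doc-id deltas on the fly; since doc ids are recorded in increasing order, the sort step the mapping variants needed is no longer required. A minimal self-contained sketch of that decoding, using a plain u32 slice as a hypothetical stand-in for VInt32Reader and FieldSerializer::write_doc:

    /// Decodes interleaved (doc-id delta, term frequency) pairs into
    /// absolute (doc_id, term_freq) pairs, mirroring what
    /// TermFrequencyRecorder::serialize now does unconditionally.
    fn decode(deltas_and_tfs: &[u32]) -> Vec<(u32, u32)> {
        let mut out = Vec::new();
        let mut prev_doc = 0u32;
        let mut it = deltas_and_tfs.iter().copied();
        while let Some(delta_doc_id) = it.next() {
            let doc_id = prev_doc + delta_doc_id;
            prev_doc = doc_id;
            // The real recorder falls back to `self.current_tf` here.
            let term_freq = it.next().unwrap_or(0);
            out.push((doc_id, term_freq));
        }
        out
    }

    fn main() {
        // Doc ids 3, 7, 12 (deltas 3, 4, 5) with term frequencies 2, 1, 4.
        assert_eq!(decode(&[3, 2, 4, 1, 5, 4]), vec![(3, 2), (7, 1), (12, 4)]);
    }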