From 2e3641c2ae970e8b78c256da2dac19b12d566139 Mon Sep 17 00:00:00 2001 From: PSeitz Date: Mon, 27 May 2024 07:33:50 +0200 Subject: [PATCH] return CompactDocValue instead of trait (#2410) The CompactDocValue is easier to handle than the trait in some cases like comparison and conversion --- examples/date_time_field.rs | 3 +- src/fastfield/facet_reader.rs | 6 +- src/indexer/index_writer.rs | 10 +- src/indexer/merger.rs | 17 ++- src/indexer/merger_sorted_index_test.rs | 9 +- src/indexer/segment_writer.rs | 11 +- src/schema/document/default_document.rs | 160 +++++++++++++----------- src/schema/field_type.rs | 4 +- src/store/mod.rs | 4 +- src/store/reader.rs | 4 +- 10 files changed, 137 insertions(+), 91 deletions(-) diff --git a/examples/date_time_field.rs b/examples/date_time_field.rs index 658c66aeae..c00a22f0de 100644 --- a/examples/date_time_field.rs +++ b/examples/date_time_field.rs @@ -4,7 +4,7 @@ use tantivy::collector::TopDocs; use tantivy::query::QueryParser; -use tantivy::schema::{DateOptions, Document, Schema, INDEXED, STORED, STRING}; +use tantivy::schema::{DateOptions, Document, Schema, Value, INDEXED, STORED, STRING}; use tantivy::{Index, IndexWriter, TantivyDocument}; fn main() -> tantivy::Result<()> { @@ -64,6 +64,7 @@ fn main() -> tantivy::Result<()> { assert!(retrieved_doc .get_first(occurred_at) .unwrap() + .as_value() .as_datetime() .is_some(),); assert_eq!( diff --git a/src/fastfield/facet_reader.rs b/src/fastfield/facet_reader.rs index dc62b2c1a4..60f825328b 100644 --- a/src/fastfield/facet_reader.rs +++ b/src/fastfield/facet_reader.rs @@ -62,7 +62,7 @@ impl FacetReader { #[cfg(test)] mod tests { - use crate::schema::{Facet, FacetOptions, SchemaBuilder, STORED}; + use crate::schema::{Facet, FacetOptions, SchemaBuilder, Value, STORED}; use crate::{DocAddress, Index, IndexWriter, TantivyDocument}; #[test] @@ -88,7 +88,9 @@ mod tests { let doc = searcher .doc::(DocAddress::new(0u32, 0u32)) .unwrap(); - let value = doc.get_first(facet_field).and_then(|v| v.as_facet()); + let value = doc + .get_first(facet_field) + .and_then(|v| v.as_value().as_facet()); assert_eq!(value, None); } diff --git a/src/indexer/index_writer.rs b/src/indexer/index_writer.rs index f12310a29a..ee8d2808f2 100644 --- a/src/indexer/index_writer.rs +++ b/src/indexer/index_writer.rs @@ -816,7 +816,7 @@ mod tests { use crate::query::{BooleanQuery, Occur, Query, QueryParser, TermQuery}; use crate::schema::{ self, Facet, FacetOptions, IndexRecordOption, IpAddrOptions, NumericOptions, Schema, - TextFieldIndexing, TextOptions, FAST, INDEXED, STORED, STRING, TEXT, + TextFieldIndexing, TextOptions, Value, FAST, INDEXED, STORED, STRING, TEXT, }; use crate::store::DOCSTORE_CACHE_CAPACITY; use crate::{ @@ -1979,7 +1979,13 @@ mod tests { .unwrap(); // test store iterator for doc in store_reader.iter::(segment_reader.alive_bitset()) { - let id = doc.unwrap().get_first(id_field).unwrap().as_u64().unwrap(); + let id = doc + .unwrap() + .get_first(id_field) + .unwrap() + .as_value() + .as_u64() + .unwrap(); assert!(expected_ids_and_num_occurrences.contains_key(&id)); } // test store random access diff --git a/src/indexer/merger.rs b/src/indexer/merger.rs index 88494e8dfa..6cd190f2a5 100644 --- a/src/indexer/merger.rs +++ b/src/indexer/merger.rs @@ -797,7 +797,7 @@ mod tests { use crate::query::{AllQuery, BooleanQuery, EnableScoring, Scorer, TermQuery}; use crate::schema::{ Facet, FacetOptions, IndexRecordOption, NumericOptions, TantivyDocument, Term, - TextFieldIndexing, INDEXED, TEXT, + TextFieldIndexing, Value, INDEXED, TEXT, }; use crate::time::OffsetDateTime; use crate::{ @@ -909,15 +909,24 @@ mod tests { } { let doc = searcher.doc::(DocAddress::new(0, 0))?; - assert_eq!(doc.get_first(text_field).unwrap().as_str(), Some("af b")); + assert_eq!( + doc.get_first(text_field).unwrap().as_value().as_str(), + Some("af b") + ); } { let doc = searcher.doc::(DocAddress::new(0, 1))?; - assert_eq!(doc.get_first(text_field).unwrap().as_str(), Some("a b c")); + assert_eq!( + doc.get_first(text_field).unwrap().as_value().as_str(), + Some("a b c") + ); } { let doc = searcher.doc::(DocAddress::new(0, 2))?; - assert_eq!(doc.get_first(text_field).unwrap().as_str(), Some("a b c d")); + assert_eq!( + doc.get_first(text_field).unwrap().as_value().as_str(), + Some("a b c d") + ); } { let doc = searcher.doc::(DocAddress::new(0, 3))?; diff --git a/src/indexer/merger_sorted_index_test.rs b/src/indexer/merger_sorted_index_test.rs index 9f345845d4..a23dec40b8 100644 --- a/src/indexer/merger_sorted_index_test.rs +++ b/src/indexer/merger_sorted_index_test.rs @@ -7,7 +7,7 @@ mod tests { use crate::query::QueryParser; use crate::schema::{ self, BytesOptions, Facet, FacetOptions, IndexRecordOption, NumericOptions, - TextFieldIndexing, TextOptions, + TextFieldIndexing, TextOptions, Value, }; use crate::{ DocAddress, DocSet, IndexSettings, IndexSortByField, IndexWriter, Order, TantivyDocument, @@ -280,13 +280,16 @@ mod tests { .doc::(DocAddress::new(0, blubber_pos)) .unwrap(); assert_eq!( - doc.get_first(my_text_field).unwrap().as_str(), + doc.get_first(my_text_field).unwrap().as_value().as_str(), Some("blubber") ); let doc = searcher .doc::(DocAddress::new(0, 0)) .unwrap(); - assert_eq!(doc.get_first(int_field).unwrap().as_u64(), Some(1000)); + assert_eq!( + doc.get_first(int_field).unwrap().as_value().as_u64(), + Some(1000) + ); } } diff --git a/src/indexer/segment_writer.rs b/src/indexer/segment_writer.rs index 4a02b0a19a..16f9e41cd5 100644 --- a/src/indexer/segment_writer.rs +++ b/src/indexer/segment_writer.rs @@ -500,8 +500,8 @@ mod tests { use crate::postings::{Postings, TermInfo}; use crate::query::{PhraseQuery, QueryParser}; use crate::schema::{ - Document, IndexRecordOption, OwnedValue, Schema, TextFieldIndexing, TextOptions, STORED, - STRING, TEXT, + Document, IndexRecordOption, OwnedValue, Schema, TextFieldIndexing, TextOptions, Value, + STORED, STRING, TEXT, }; use crate::store::{Compressor, StoreReader, StoreWriter}; use crate::time::format_description::well_known::Rfc3339; @@ -555,9 +555,12 @@ mod tests { let doc = reader.get::(0).unwrap(); assert_eq!(doc.field_values().count(), 2); - assert_eq!(doc.get_all(text_field).next().unwrap().as_str(), Some("A")); assert_eq!( - doc.get_all(text_field).nth(1).unwrap().as_str(), + doc.get_all(text_field).next().unwrap().as_value().as_str(), + Some("A") + ); + assert_eq!( + doc.get_all(text_field).nth(1).unwrap().as_value().as_str(), Some("title") ); } diff --git a/src/schema/document/default_document.rs b/src/schema/document/default_document.rs index 5b65fc3ebc..6af220db80 100644 --- a/src/schema/document/default_document.rs +++ b/src/schema/document/default_document.rs @@ -157,29 +157,24 @@ impl CompactDoc { } /// field_values accessor - pub fn field_values( - &self, - ) -> impl Iterator>)> { + pub fn field_values(&self) -> impl Iterator)> { self.field_values.iter().map(|field_val| { let field = Field::from_field_id(field_val.field as u32); - let val = self.extract_value(field_val.value_addr).unwrap(); + let val = self.get_compact_doc_value(field_val.value_addr); (field, val) }) } /// Returns all of the `ReferenceValue`s associated the given field - pub fn get_all( - &self, - field: Field, - ) -> impl Iterator>> + '_ { + pub fn get_all(&self, field: Field) -> impl Iterator> + '_ { self.field_values .iter() .filter(move |field_value| Field::from_field_id(field_value.field as u32) == field) - .map(|val| self.extract_value(val.value_addr).unwrap()) + .map(|val| self.get_compact_doc_value(val.value_addr)) } /// Returns the first `ReferenceValue` associated the given field - pub fn get_first(&self, field: Field) -> Option>> { + pub fn get_first(&self, field: Field) -> Option> { self.get_all(field).next() } @@ -299,58 +294,11 @@ impl CompactDoc { } } - fn extract_value( - &self, - ref_value: ValueAddr, - ) -> io::Result>> { - match ref_value.type_id { - ValueType::Null => Ok(ReferenceValueLeaf::Null.into()), - ValueType::Str => { - let str_ref = self.extract_str(ref_value.val_addr); - Ok(ReferenceValueLeaf::Str(str_ref).into()) - } - ValueType::Facet => { - let str_ref = self.extract_str(ref_value.val_addr); - Ok(ReferenceValueLeaf::Facet(str_ref).into()) - } - ValueType::Bytes => { - let data = self.extract_bytes(ref_value.val_addr); - Ok(ReferenceValueLeaf::Bytes(data).into()) - } - ValueType::U64 => self - .read_from::(ref_value.val_addr) - .map(ReferenceValueLeaf::U64) - .map(Into::into), - ValueType::I64 => self - .read_from::(ref_value.val_addr) - .map(ReferenceValueLeaf::I64) - .map(Into::into), - ValueType::F64 => self - .read_from::(ref_value.val_addr) - .map(ReferenceValueLeaf::F64) - .map(Into::into), - ValueType::Bool => Ok(ReferenceValueLeaf::Bool(ref_value.val_addr != 0).into()), - ValueType::Date => self - .read_from::(ref_value.val_addr) - .map(|ts| ReferenceValueLeaf::Date(DateTime::from_timestamp_nanos(ts))) - .map(Into::into), - ValueType::IpAddr => self - .read_from::(ref_value.val_addr) - .map(|num| ReferenceValueLeaf::IpAddr(Ipv6Addr::from_u128(num))) - .map(Into::into), - ValueType::PreTokStr => self - .read_from::(ref_value.val_addr) - .map(Into::into) - .map(ReferenceValueLeaf::PreTokStr) - .map(Into::into), - ValueType::Object => Ok(ReferenceValue::Object(CompactDocObjectIter::new( - self, - ref_value.val_addr, - )?)), - ValueType::Array => Ok(ReferenceValue::Array(CompactDocArrayIter::new( - self, - ref_value.val_addr, - )?)), + /// Get CompactDocValue for address + fn get_compact_doc_value(&self, value_addr: ValueAddr) -> CompactDocValue<'_> { + CompactDocValue { + container: self, + value_addr, } } @@ -410,7 +358,7 @@ impl PartialEq for CompactDoc { let convert_to_comparable_map = |doc: &CompactDoc| { let mut field_value_set: HashMap> = Default::default(); for field_value in doc.field_values.iter() { - let value: OwnedValue = doc.extract_value(field_value.value_addr).unwrap().into(); + let value: OwnedValue = doc.get_compact_doc_value(field_value.value_addr).into(); let value = serde_json::to_string(&value).unwrap(); field_value_set .entry(Field::from_field_id(field_value.field as u32)) @@ -444,7 +392,19 @@ impl DocumentDeserialize for CompactDoc { #[derive(Debug, Clone, Copy)] pub struct CompactDocValue<'a> { container: &'a CompactDoc, - value: ValueAddr, + value_addr: ValueAddr, +} +impl PartialEq for CompactDocValue<'_> { + fn eq(&self, other: &Self) -> bool { + let value1: OwnedValue = (*self).into(); + let value2: OwnedValue = (*other).into(); + value1 == value2 + } +} +impl<'a> From> for OwnedValue { + fn from(value: CompactDocValue) -> Self { + value.as_value().into() + } } impl<'a> Value<'a> for CompactDocValue<'a> { type ArrayIter = CompactDocArrayIter<'a>; @@ -452,7 +412,67 @@ impl<'a> Value<'a> for CompactDocValue<'a> { type ObjectIter = CompactDocObjectIter<'a>; fn as_value(&self) -> ReferenceValue<'a, Self> { - self.container.extract_value(self.value).unwrap() + self.get_ref_value().unwrap() + } +} +impl<'a> CompactDocValue<'a> { + fn get_ref_value(&self) -> io::Result>> { + let addr = self.value_addr.val_addr; + match self.value_addr.type_id { + ValueType::Null => Ok(ReferenceValueLeaf::Null.into()), + ValueType::Str => { + let str_ref = self.container.extract_str(addr); + Ok(ReferenceValueLeaf::Str(str_ref).into()) + } + ValueType::Facet => { + let str_ref = self.container.extract_str(addr); + Ok(ReferenceValueLeaf::Facet(str_ref).into()) + } + ValueType::Bytes => { + let data = self.container.extract_bytes(addr); + Ok(ReferenceValueLeaf::Bytes(data).into()) + } + ValueType::U64 => self + .container + .read_from::(addr) + .map(ReferenceValueLeaf::U64) + .map(Into::into), + ValueType::I64 => self + .container + .read_from::(addr) + .map(ReferenceValueLeaf::I64) + .map(Into::into), + ValueType::F64 => self + .container + .read_from::(addr) + .map(ReferenceValueLeaf::F64) + .map(Into::into), + ValueType::Bool => Ok(ReferenceValueLeaf::Bool(addr != 0).into()), + ValueType::Date => self + .container + .read_from::(addr) + .map(|ts| ReferenceValueLeaf::Date(DateTime::from_timestamp_nanos(ts))) + .map(Into::into), + ValueType::IpAddr => self + .container + .read_from::(addr) + .map(|num| ReferenceValueLeaf::IpAddr(Ipv6Addr::from_u128(num))) + .map(Into::into), + ValueType::PreTokStr => self + .container + .read_from::(addr) + .map(Into::into) + .map(ReferenceValueLeaf::PreTokStr) + .map(Into::into), + ValueType::Object => Ok(ReferenceValue::Object(CompactDocObjectIter::new( + self.container, + addr, + )?)), + ValueType::Array => Ok(ReferenceValue::Array(CompactDocArrayIter::new( + self.container, + addr, + )?)), + } } } @@ -601,9 +621,9 @@ impl<'a> Iterator for CompactDocObjectIter<'a> { let value = ValueAddr::deserialize(&mut self.node_addresses_slice).ok()?; let value = CompactDocValue { container: self.container, - value, + value_addr: value, }; - return Some((key, value)); + Some((key, value)) } } @@ -635,9 +655,9 @@ impl<'a> Iterator for CompactDocArrayIter<'a> { let value = ValueAddr::deserialize(&mut self.node_addresses_slice).ok()?; let value = CompactDocValue { container: self.container, - value, + value_addr: value, }; - return Some(value); + Some(value) } } @@ -668,7 +688,7 @@ impl<'a> Iterator for FieldValueIterRef<'a> { Field::from_field_id(field_value.field as u32), CompactDocValue::<'a> { container: self.container, - value: field_value.value_addr, + value_addr: field_value.value_addr, }, ) }) diff --git a/src/schema/field_type.rs b/src/schema/field_type.rs index baa5200383..70a453dbf7 100644 --- a/src/schema/field_type.rs +++ b/src/schema/field_type.rs @@ -659,9 +659,9 @@ mod tests { let schema = schema_builder.build(); let doc_json = r#"{"date": "2019-10-12T07:20:50.52+02:00"}"#; let doc = TantivyDocument::parse_json(&schema, doc_json).unwrap(); - let date = doc.get_first(date_field).unwrap(); + let date = OwnedValue::from(doc.get_first(date_field).unwrap()); // Time zone is converted to UTC - assert_eq!("Leaf(Date(2019-10-12T05:20:50.52Z))", format!("{date:?}")); + assert_eq!("Date(2019-10-12T05:20:50.52Z)", format!("{date:?}")); } #[test] diff --git a/src/store/mod.rs b/src/store/mod.rs index e5e28be45b..48962bb4b6 100644 --- a/src/store/mod.rs +++ b/src/store/mod.rs @@ -60,7 +60,7 @@ pub mod tests { use crate::directory::{Directory, RamDirectory, WritePtr}; use crate::fastfield::AliveBitSet; use crate::schema::{ - self, Schema, TantivyDocument, TextFieldIndexing, TextOptions, STORED, TEXT, + self, Schema, TantivyDocument, TextFieldIndexing, TextOptions, Value, STORED, TEXT, }; use crate::{Index, IndexWriter, Term}; @@ -122,6 +122,7 @@ pub mod tests { .get::(i)? .get_first(field_title) .unwrap() + .as_value() .as_str() .unwrap(), format!("Doc {i}") @@ -133,6 +134,7 @@ pub mod tests { let title_content = doc .get_first(field_title) .unwrap() + .as_value() .as_str() .unwrap() .to_string(); diff --git a/src/store/reader.rs b/src/store/reader.rs index 1e4432e5f3..dd88b776f7 100644 --- a/src/store/reader.rs +++ b/src/store/reader.rs @@ -403,7 +403,7 @@ mod tests { use super::*; use crate::directory::RamDirectory; - use crate::schema::{Field, TantivyDocument}; + use crate::schema::{Field, TantivyDocument, Value}; use crate::store::tests::write_lorem_ipsum_store; use crate::store::Compressor; use crate::Directory; @@ -411,7 +411,7 @@ mod tests { const BLOCK_SIZE: usize = 16_384; fn get_text_field<'a>(doc: &'a TantivyDocument, field: &'a Field) -> Option<&'a str> { - doc.get_first(*field).and_then(|f| f.as_str()) + doc.get_first(*field).and_then(|f| f.as_value().as_str()) } #[test]