compact doc (#2402)

* compact doc
* add any value type
* pass references when building CompactDoc
* remove OwnedValue from API
* clippy
* clippy
* fail on large documents
* fmt
* cleanup
* cleanup
* implement Value for different types; fix serde_json date Value implementation
* fmt
* cleanup
* fmt
* cleanup
* store positions instead of pos+len
* remove nodes array
* remove mediumvec
* cleanup
* infallible serialize into vec
* remove positions indirection
* remove 24MB limitation in document: use u32 for Addr. Remove the 3 byte addressing limitation and use VInt instead
* cleanup
* extend test
* cleanup, add comments
* rename, remove pub
quickwit-oss · May 21, 2024 · e1679f3 · e1679f3
1 parent 5a80420
commit e1679f3
Show file tree

Hide file tree

Showing 24 changed files with 883 additions and 234 deletions.
diff --git a/benches/index-bench.rs b/benches/index-bench.rs
@@ -18,7 +18,7 @@ fn benchmark(
         benchmark_dynamic_json(b, input, schema, commit, parse_json)
     } else {
         _benchmark(b, input, schema, commit, parse_json, |schema, doc_json| {
-            TantivyDocument::parse_json(&schema, doc_json).unwrap()
+            TantivyDocument::parse_json(schema, doc_json).unwrap()
         })
     }
 }
@@ -90,8 +90,7 @@ fn benchmark_dynamic_json(
 ) {
     let json_field = schema.get_field("json").unwrap();
     _benchmark(b, input, schema, commit, parse_json, |_schema, doc_json| {
-        let json_val: serde_json::Map<String, serde_json::Value> =
-            serde_json::from_str(doc_json).unwrap();
+        let json_val: serde_json::Value = serde_json::from_str(doc_json).unwrap();
         tantivy::doc!(json_field=>json_val)
     })
 }
@@ -138,12 +137,13 @@ pub fn hdfs_index_benchmark(c: &mut Criterion) {
     for (prefix, schema, is_dynamic) in benches {
         for commit in [false, true] {
             let suffix = if commit { "with-commit" } else { "no-commit" };
-            for parse_json in [false] {
+            {
+                let parse_json = false;
                 // for parse_json in [false, true] {
                 let suffix = if parse_json {
                     format!("{}-with-json-parsing", suffix)
                 } else {
-                    format!("{}", suffix)
+                    suffix.to_string()
                 };
 
                 let bench_name = format!("{}{}", prefix, suffix);

diff --git a/common/src/vint.rs b/common/src/vint.rs
@@ -151,7 +151,7 @@ pub fn read_u32_vint_no_advance(data: &[u8]) -> (u32, usize) {
     (result, vlen)
 }
 /// Write a `u32` as a vint payload.
-pub fn write_u32_vint<W: io::Write>(val: u32, writer: &mut W) -> io::Result<()> {
+pub fn write_u32_vint<W: io::Write + ?Sized>(val: u32, writer: &mut W) -> io::Result<()> {
     let mut buf = [0u8; 8];
     let data = serialize_vint_u32(val, &mut buf);
     writer.write_all(data)

diff --git a/examples/date_time_field.rs b/examples/date_time_field.rs
@@ -4,7 +4,7 @@
 
 use tantivy::collector::TopDocs;
 use tantivy::query::QueryParser;
-use tantivy::schema::{DateOptions, Document, OwnedValue, Schema, INDEXED, STORED, STRING};
+use tantivy::schema::{DateOptions, Document, Schema, INDEXED, STORED, STRING};
 use tantivy::{Index, IndexWriter, TantivyDocument};
 
 fn main() -> tantivy::Result<()> {
@@ -61,10 +61,11 @@ fn main() -> tantivy::Result<()> {
         assert_eq!(count_docs.len(), 1);
         for (_score, doc_address) in count_docs {
             let retrieved_doc = searcher.doc::<TantivyDocument>(doc_address)?;
-            assert!(matches!(
-                retrieved_doc.get_first(occurred_at),
-                Some(OwnedValue::Date(_))
-            ));
+            assert!(retrieved_doc
+                .get_first(occurred_at)
+                .unwrap()
+                .as_datetime()
+                .is_some(),);
             assert_eq!(
                 retrieved_doc.to_json(&schema),
                 r#"{"event":["comment"],"occurred_at":["2022-06-22T13:00:00.22Z"]}"#

diff --git a/examples/faceted_search_with_tweaked_score.rs b/examples/faceted_search_with_tweaked_score.rs
@@ -51,7 +51,7 @@ fn main() -> tantivy::Result<()> {
     let reader = index.reader()?;
     let searcher = reader.searcher();
     {
-        let facets = vec![
+        let facets = [
             Facet::from("/ingredient/egg"),
             Facet::from("/ingredient/oil"),
             Facet::from("/ingredient/garlic"),
@@ -94,9 +94,8 @@ fn main() -> tantivy::Result<()> {
                     .doc::<TantivyDocument>(*doc_id)
                     .unwrap()
                     .get_first(title)
-                    .and_then(|v| v.as_str())
+                    .and_then(|v| v.as_str().map(|el| el.to_string()))
                     .unwrap()
-                    .to_owned()
             })
             .collect();
         assert_eq!(titles, vec!["Fried egg", "Egg rolls"]);

diff --git a/src/fastfield/facet_reader.rs b/src/fastfield/facet_reader.rs
@@ -62,7 +62,6 @@ impl FacetReader {
 
 #[cfg(test)]
 mod tests {
-    use crate::schema::document::Value;
     use crate::schema::{Facet, FacetOptions, SchemaBuilder, STORED};
     use crate::{DocAddress, Index, IndexWriter, TantivyDocument};
 

diff --git a/src/indexer/doc_id_mapping.rs b/src/indexer/doc_id_mapping.rs
@@ -306,12 +306,10 @@ mod tests_indexsorting {
         let my_string_field = index.schema().get_field("string_field").unwrap();
         let searcher = index.reader()?.searcher();
         {
-            assert_eq!(
-                searcher
-                    .doc::<TantivyDocument>(DocAddress::new(0, 0))?
-                    .get_first(my_string_field),
-                None
-            );
+            assert!(searcher
+                .doc::<TantivyDocument>(DocAddress::new(0, 0))?
+                .get_first(my_string_field)
+                .is_none());
             assert_eq!(
                 searcher
                     .doc::<TantivyDocument>(DocAddress::new(0, 3))?
@@ -344,7 +342,7 @@ mod tests_indexsorting {
                 Some("blublub")
             );
             let doc = searcher.doc::<TantivyDocument>(DocAddress::new(0, 4))?;
-            assert_eq!(doc.get_first(my_string_field), None);
+            assert!(doc.get_first(my_string_field).is_none());
         }
         // sort by field desc
         let index = create_test_index(

diff --git a/src/indexer/index_writer.rs b/src/indexer/index_writer.rs
@@ -814,7 +814,6 @@ mod tests {
     use crate::indexer::index_writer::MEMORY_BUDGET_NUM_BYTES_MIN;
     use crate::indexer::NoMergePolicy;
     use crate::query::{BooleanQuery, Occur, Query, QueryParser, TermQuery};
-    use crate::schema::document::Value;
     use crate::schema::{
         self, Facet, FacetOptions, IndexRecordOption, IpAddrOptions, NumericOptions, Schema,
         TextFieldIndexing, TextOptions, FAST, INDEXED, STORED, STRING, TEXT,
@@ -2013,7 +2012,7 @@ mod tests {
                     let mut bool2 = doc.get_all(multi_bools);
                     assert_eq!(bool, bool2.next().unwrap().as_bool().unwrap());
                     assert_ne!(bool, bool2.next().unwrap().as_bool().unwrap());
-                    assert_eq!(None, bool2.next())
+                    assert!(bool2.next().is_none())
                 }
             }
         }

diff --git a/src/indexer/merger.rs b/src/indexer/merger.rs
@@ -795,7 +795,6 @@ mod tests {
     use crate::collector::{Count, FacetCollector};
     use crate::index::{Index, SegmentId};
     use crate::query::{AllQuery, BooleanQuery, EnableScoring, Scorer, TermQuery};
-    use crate::schema::document::Value;
     use crate::schema::{
         Facet, FacetOptions, IndexRecordOption, NumericOptions, TantivyDocument, Term,
         TextFieldIndexing, INDEXED, TEXT,

diff --git a/src/indexer/merger_sorted_index_test.rs b/src/indexer/merger_sorted_index_test.rs
@@ -5,7 +5,6 @@ mod tests {
     use crate::index::Index;
     use crate::postings::Postings;
     use crate::query::QueryParser;
-    use crate::schema::document::Value;
     use crate::schema::{
         self, BytesOptions, Facet, FacetOptions, IndexRecordOption, NumericOptions,
         TextFieldIndexing, TextOptions,

diff --git a/src/indexer/segment_writer.rs b/src/indexer/segment_writer.rs
@@ -499,7 +499,6 @@ mod tests {
     use crate::fastfield::FastValue;
     use crate::postings::{Postings, TermInfo};
     use crate::query::{PhraseQuery, QueryParser};
-    use crate::schema::document::Value;
     use crate::schema::{
         Document, IndexRecordOption, OwnedValue, Schema, TextFieldIndexing, TextOptions, STORED,
         STRING, TEXT,
@@ -555,9 +554,12 @@ mod tests {
         let reader = StoreReader::open(directory.open_read(path).unwrap(), 0).unwrap();
         let doc = reader.get::<TantivyDocument>(0).unwrap();
 
-        assert_eq!(doc.field_values().len(), 2);
-        assert_eq!(doc.field_values()[0].value().as_str(), Some("A"));
-        assert_eq!(doc.field_values()[1].value().as_str(), Some("title"));
+        assert_eq!(doc.field_values().count(), 2);
+        assert_eq!(doc.get_all(text_field).next().unwrap().as_str(), Some("A"));
+        assert_eq!(
+            doc.get_all(text_field).nth(1).unwrap().as_str(),
+            Some("title")
+        );
     }
     #[test]
     fn test_simple_json_indexing() {
@@ -641,7 +643,7 @@ mod tests {
         let mut schema_builder = Schema::builder();
         let json_field = schema_builder.add_json_field("json", STORED | TEXT);
         let schema = schema_builder.build();
-        let json_val: serde_json::Map<String, serde_json::Value> = serde_json::from_str(
+        let json_val: serde_json::Value = serde_json::from_str(
             r#"{
             "toto": "titi",
             "float": -0.2,
@@ -669,14 +671,10 @@ mod tests {
                 doc_id: 0u32,
             })
             .unwrap();
-        let serdeser_json_val = serde_json::from_str::<serde_json::Map<String, serde_json::Value>>(
-            &doc.to_json(&schema),
-        )
-        .unwrap()
-        .get("json")
-        .unwrap()[0]
-            .as_object()
+        let serdeser_json_val = serde_json::from_str::<serde_json::Value>(&doc.to_json(&schema))
             .unwrap()
+            .get("json")
+            .unwrap()[0]
             .clone();
         assert_eq!(json_val, serdeser_json_val);
         let segment_reader = searcher.segment_reader(0u32);
@@ -840,7 +838,7 @@ mod tests {
         let mut schema_builder = Schema::builder();
         let json_field = schema_builder.add_json_field("json", STRING);
         let schema = schema_builder.build();
-        let json_val: serde_json::Map<String, serde_json::Value> =
+        let json_val: serde_json::Value =
             serde_json::from_str(r#"{"mykey": "two tokens"}"#).unwrap();
         let doc = doc!(json_field=>json_val);
         let index = Index::create_in_ram(schema);
@@ -880,7 +878,7 @@ mod tests {
         let mut schema_builder = Schema::builder();
         let json_field = schema_builder.add_json_field("json", TEXT);
         let schema = schema_builder.build();
-        let json_val: serde_json::Map<String, serde_json::Value> = serde_json::from_str(
+        let json_val: serde_json::Value = serde_json::from_str(
             r#"{"mykey": [{"field": "hello happy tax payer"}, {"field": "nothello"}]}"#,
         )
         .unwrap();

diff --git a/src/lib.rs b/src/lib.rs
@@ -935,7 +935,7 @@ pub mod tests {
         let mut schema_builder = Schema::builder();
         let json_field = schema_builder.add_json_field("json", STORED | TEXT);
         let schema = schema_builder.build();
-        let json_val: serde_json::Map<String, serde_json::Value> = serde_json::from_str(
+        let json_val: serde_json::Value = serde_json::from_str(
             r#"{
             "signed": 2,
             "float": 2.0,
@@ -1025,13 +1025,16 @@ pub mod tests {
                             text_field => "some other value",
                             other_text_field => "short");
         assert_eq!(document.len(), 3);
-        let values: Vec<&OwnedValue> = document.get_all(text_field).collect();
+        let values: Vec<OwnedValue> = document.get_all(text_field).map(OwnedValue::from).collect();
         assert_eq!(values.len(), 2);
-        assert_eq!(values[0].as_str(), Some("tantivy"));
-        assert_eq!(values[1].as_str(), Some("some other value"));
-        let values: Vec<&OwnedValue> = document.get_all(other_text_field).collect();
+        assert_eq!(values[0].as_ref().as_str(), Some("tantivy"));
+        assert_eq!(values[1].as_ref().as_str(), Some("some other value"));
+        let values: Vec<OwnedValue> = document
+            .get_all(other_text_field)
+            .map(OwnedValue::from)
+            .collect();
         assert_eq!(values.len(), 1);
-        assert_eq!(values[0].as_str(), Some("short"));
+        assert_eq!(values[0].as_ref().as_str(), Some("short"));
     }
 
     #[test]

diff --git a/src/macros.rs b/src/macros.rs
@@ -41,6 +41,7 @@
 /// );
 /// # }
 /// ```
+
 #[macro_export]
 macro_rules! doc(
     () => {
@@ -52,7 +53,7 @@ macro_rules! doc(
         {
             let mut document = $crate::TantivyDocument::default();
             $(
-                document.add_field_value($field, $value);
+                document.add_field_value($field, &$value);
             )*
             document
         }