Skip to content

Commit

Permalink
compact doc (#2402)
Browse files Browse the repository at this point in the history
* compact doc

* add any value type

* pass references when building CompactDoc

* remove OwnedValue from API

* clippy

* clippy

* fail on large documents

* fmt

* cleanup

* cleanup

* implement Value for different types

fix serde_json date Value implementation

* fmt

* cleanup

* fmt

* cleanup

* store positions instead of pos+len

* remove nodes array

* remove mediumvec

* cleanup

* infallible serialize into vec

* remove positions indirection

* remove 24MB limitation in document

use u32 for Addr
Remove the 3 byte addressing limitation and use VInt instead

* cleanup

* extend test

* cleanup, add comments

* rename, remove pub
  • Loading branch information
PSeitz committed May 21, 2024
1 parent 5a80420 commit e1679f3
Show file tree
Hide file tree
Showing 24 changed files with 883 additions and 234 deletions.
10 changes: 5 additions & 5 deletions benches/index-bench.rs
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ fn benchmark(
benchmark_dynamic_json(b, input, schema, commit, parse_json)
} else {
_benchmark(b, input, schema, commit, parse_json, |schema, doc_json| {
TantivyDocument::parse_json(&schema, doc_json).unwrap()
TantivyDocument::parse_json(schema, doc_json).unwrap()
})
}
}
Expand Down Expand Up @@ -90,8 +90,7 @@ fn benchmark_dynamic_json(
) {
let json_field = schema.get_field("json").unwrap();
_benchmark(b, input, schema, commit, parse_json, |_schema, doc_json| {
let json_val: serde_json::Map<String, serde_json::Value> =
serde_json::from_str(doc_json).unwrap();
let json_val: serde_json::Value = serde_json::from_str(doc_json).unwrap();
tantivy::doc!(json_field=>json_val)
})
}
Expand Down Expand Up @@ -138,12 +137,13 @@ pub fn hdfs_index_benchmark(c: &mut Criterion) {
for (prefix, schema, is_dynamic) in benches {
for commit in [false, true] {
let suffix = if commit { "with-commit" } else { "no-commit" };
for parse_json in [false] {
{
let parse_json = false;
// for parse_json in [false, true] {
let suffix = if parse_json {
format!("{}-with-json-parsing", suffix)
} else {
format!("{}", suffix)
suffix.to_string()
};

let bench_name = format!("{}{}", prefix, suffix);
Expand Down
2 changes: 1 addition & 1 deletion common/src/vint.rs
Original file line number Diff line number Diff line change
Expand Up @@ -151,7 +151,7 @@ pub fn read_u32_vint_no_advance(data: &[u8]) -> (u32, usize) {
(result, vlen)
}
/// Write a `u32` as a vint payload.
pub fn write_u32_vint<W: io::Write>(val: u32, writer: &mut W) -> io::Result<()> {
pub fn write_u32_vint<W: io::Write + ?Sized>(val: u32, writer: &mut W) -> io::Result<()> {
let mut buf = [0u8; 8];
let data = serialize_vint_u32(val, &mut buf);
writer.write_all(data)
Expand Down
11 changes: 6 additions & 5 deletions examples/date_time_field.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@

use tantivy::collector::TopDocs;
use tantivy::query::QueryParser;
use tantivy::schema::{DateOptions, Document, OwnedValue, Schema, INDEXED, STORED, STRING};
use tantivy::schema::{DateOptions, Document, Schema, INDEXED, STORED, STRING};
use tantivy::{Index, IndexWriter, TantivyDocument};

fn main() -> tantivy::Result<()> {
Expand Down Expand Up @@ -61,10 +61,11 @@ fn main() -> tantivy::Result<()> {
assert_eq!(count_docs.len(), 1);
for (_score, doc_address) in count_docs {
let retrieved_doc = searcher.doc::<TantivyDocument>(doc_address)?;
assert!(matches!(
retrieved_doc.get_first(occurred_at),
Some(OwnedValue::Date(_))
));
assert!(retrieved_doc
.get_first(occurred_at)
.unwrap()
.as_datetime()
.is_some(),);
assert_eq!(
retrieved_doc.to_json(&schema),
r#"{"event":["comment"],"occurred_at":["2022-06-22T13:00:00.22Z"]}"#
Expand Down
5 changes: 2 additions & 3 deletions examples/faceted_search_with_tweaked_score.rs
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ fn main() -> tantivy::Result<()> {
let reader = index.reader()?;
let searcher = reader.searcher();
{
let facets = vec![
let facets = [
Facet::from("/ingredient/egg"),
Facet::from("/ingredient/oil"),
Facet::from("/ingredient/garlic"),
Expand Down Expand Up @@ -94,9 +94,8 @@ fn main() -> tantivy::Result<()> {
.doc::<TantivyDocument>(*doc_id)
.unwrap()
.get_first(title)
.and_then(|v| v.as_str())
.and_then(|v| v.as_str().map(|el| el.to_string()))
.unwrap()
.to_owned()
})
.collect();
assert_eq!(titles, vec!["Fried egg", "Egg rolls"]);
Expand Down
1 change: 0 additions & 1 deletion src/fastfield/facet_reader.rs
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,6 @@ impl FacetReader {

#[cfg(test)]
mod tests {
use crate::schema::document::Value;
use crate::schema::{Facet, FacetOptions, SchemaBuilder, STORED};
use crate::{DocAddress, Index, IndexWriter, TantivyDocument};

Expand Down
12 changes: 5 additions & 7 deletions src/indexer/doc_id_mapping.rs
Original file line number Diff line number Diff line change
Expand Up @@ -306,12 +306,10 @@ mod tests_indexsorting {
let my_string_field = index.schema().get_field("string_field").unwrap();
let searcher = index.reader()?.searcher();
{
assert_eq!(
searcher
.doc::<TantivyDocument>(DocAddress::new(0, 0))?
.get_first(my_string_field),
None
);
assert!(searcher
.doc::<TantivyDocument>(DocAddress::new(0, 0))?
.get_first(my_string_field)
.is_none());
assert_eq!(
searcher
.doc::<TantivyDocument>(DocAddress::new(0, 3))?
Expand Down Expand Up @@ -344,7 +342,7 @@ mod tests_indexsorting {
Some("blublub")
);
let doc = searcher.doc::<TantivyDocument>(DocAddress::new(0, 4))?;
assert_eq!(doc.get_first(my_string_field), None);
assert!(doc.get_first(my_string_field).is_none());
}
// sort by field desc
let index = create_test_index(
Expand Down
3 changes: 1 addition & 2 deletions src/indexer/index_writer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -814,7 +814,6 @@ mod tests {
use crate::indexer::index_writer::MEMORY_BUDGET_NUM_BYTES_MIN;
use crate::indexer::NoMergePolicy;
use crate::query::{BooleanQuery, Occur, Query, QueryParser, TermQuery};
use crate::schema::document::Value;
use crate::schema::{
self, Facet, FacetOptions, IndexRecordOption, IpAddrOptions, NumericOptions, Schema,
TextFieldIndexing, TextOptions, FAST, INDEXED, STORED, STRING, TEXT,
Expand Down Expand Up @@ -2013,7 +2012,7 @@ mod tests {
let mut bool2 = doc.get_all(multi_bools);
assert_eq!(bool, bool2.next().unwrap().as_bool().unwrap());
assert_ne!(bool, bool2.next().unwrap().as_bool().unwrap());
assert_eq!(None, bool2.next())
assert!(bool2.next().is_none())
}
}
}
Expand Down
1 change: 0 additions & 1 deletion src/indexer/merger.rs
Original file line number Diff line number Diff line change
Expand Up @@ -795,7 +795,6 @@ mod tests {
use crate::collector::{Count, FacetCollector};
use crate::index::{Index, SegmentId};
use crate::query::{AllQuery, BooleanQuery, EnableScoring, Scorer, TermQuery};
use crate::schema::document::Value;
use crate::schema::{
Facet, FacetOptions, IndexRecordOption, NumericOptions, TantivyDocument, Term,
TextFieldIndexing, INDEXED, TEXT,
Expand Down
1 change: 0 additions & 1 deletion src/indexer/merger_sorted_index_test.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@ mod tests {
use crate::index::Index;
use crate::postings::Postings;
use crate::query::QueryParser;
use crate::schema::document::Value;
use crate::schema::{
self, BytesOptions, Facet, FacetOptions, IndexRecordOption, NumericOptions,
TextFieldIndexing, TextOptions,
Expand Down
26 changes: 12 additions & 14 deletions src/indexer/segment_writer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -499,7 +499,6 @@ mod tests {
use crate::fastfield::FastValue;
use crate::postings::{Postings, TermInfo};
use crate::query::{PhraseQuery, QueryParser};
use crate::schema::document::Value;
use crate::schema::{
Document, IndexRecordOption, OwnedValue, Schema, TextFieldIndexing, TextOptions, STORED,
STRING, TEXT,
Expand Down Expand Up @@ -555,9 +554,12 @@ mod tests {
let reader = StoreReader::open(directory.open_read(path).unwrap(), 0).unwrap();
let doc = reader.get::<TantivyDocument>(0).unwrap();

assert_eq!(doc.field_values().len(), 2);
assert_eq!(doc.field_values()[0].value().as_str(), Some("A"));
assert_eq!(doc.field_values()[1].value().as_str(), Some("title"));
assert_eq!(doc.field_values().count(), 2);
assert_eq!(doc.get_all(text_field).next().unwrap().as_str(), Some("A"));
assert_eq!(
doc.get_all(text_field).nth(1).unwrap().as_str(),
Some("title")
);
}
#[test]
fn test_simple_json_indexing() {
Expand Down Expand Up @@ -641,7 +643,7 @@ mod tests {
let mut schema_builder = Schema::builder();
let json_field = schema_builder.add_json_field("json", STORED | TEXT);
let schema = schema_builder.build();
let json_val: serde_json::Map<String, serde_json::Value> = serde_json::from_str(
let json_val: serde_json::Value = serde_json::from_str(
r#"{
"toto": "titi",
"float": -0.2,
Expand Down Expand Up @@ -669,14 +671,10 @@ mod tests {
doc_id: 0u32,
})
.unwrap();
let serdeser_json_val = serde_json::from_str::<serde_json::Map<String, serde_json::Value>>(
&doc.to_json(&schema),
)
.unwrap()
.get("json")
.unwrap()[0]
.as_object()
let serdeser_json_val = serde_json::from_str::<serde_json::Value>(&doc.to_json(&schema))
.unwrap()
.get("json")
.unwrap()[0]
.clone();
assert_eq!(json_val, serdeser_json_val);
let segment_reader = searcher.segment_reader(0u32);
Expand Down Expand Up @@ -840,7 +838,7 @@ mod tests {
let mut schema_builder = Schema::builder();
let json_field = schema_builder.add_json_field("json", STRING);
let schema = schema_builder.build();
let json_val: serde_json::Map<String, serde_json::Value> =
let json_val: serde_json::Value =
serde_json::from_str(r#"{"mykey": "two tokens"}"#).unwrap();
let doc = doc!(json_field=>json_val);
let index = Index::create_in_ram(schema);
Expand Down Expand Up @@ -880,7 +878,7 @@ mod tests {
let mut schema_builder = Schema::builder();
let json_field = schema_builder.add_json_field("json", TEXT);
let schema = schema_builder.build();
let json_val: serde_json::Map<String, serde_json::Value> = serde_json::from_str(
let json_val: serde_json::Value = serde_json::from_str(
r#"{"mykey": [{"field": "hello happy tax payer"}, {"field": "nothello"}]}"#,
)
.unwrap();
Expand Down
15 changes: 9 additions & 6 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -935,7 +935,7 @@ pub mod tests {
let mut schema_builder = Schema::builder();
let json_field = schema_builder.add_json_field("json", STORED | TEXT);
let schema = schema_builder.build();
let json_val: serde_json::Map<String, serde_json::Value> = serde_json::from_str(
let json_val: serde_json::Value = serde_json::from_str(
r#"{
"signed": 2,
"float": 2.0,
Expand Down Expand Up @@ -1025,13 +1025,16 @@ pub mod tests {
text_field => "some other value",
other_text_field => "short");
assert_eq!(document.len(), 3);
let values: Vec<&OwnedValue> = document.get_all(text_field).collect();
let values: Vec<OwnedValue> = document.get_all(text_field).map(OwnedValue::from).collect();
assert_eq!(values.len(), 2);
assert_eq!(values[0].as_str(), Some("tantivy"));
assert_eq!(values[1].as_str(), Some("some other value"));
let values: Vec<&OwnedValue> = document.get_all(other_text_field).collect();
assert_eq!(values[0].as_ref().as_str(), Some("tantivy"));
assert_eq!(values[1].as_ref().as_str(), Some("some other value"));
let values: Vec<OwnedValue> = document
.get_all(other_text_field)
.map(OwnedValue::from)
.collect();
assert_eq!(values.len(), 1);
assert_eq!(values[0].as_str(), Some("short"));
assert_eq!(values[0].as_ref().as_str(), Some("short"));
}

#[test]
Expand Down
3 changes: 2 additions & 1 deletion src/macros.rs
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@
/// );
/// # }
/// ```

#[macro_export]
macro_rules! doc(
() => {
Expand All @@ -52,7 +53,7 @@ macro_rules! doc(
{
let mut document = $crate::TantivyDocument::default();
$(
document.add_field_value($field, $value);
document.add_field_value($field, &$value);
)*
document
}
Expand Down
Loading

0 comments on commit e1679f3

Please sign in to comment.