From f2a54c388c81c6da3235da480c82a107637c3d14 Mon Sep 17 00:00:00 2001
From: muji
Date: Thu, 11 Aug 2022 14:08:01 +0800
Subject: [PATCH 1/8] Pass document by reference.

---
 src/index.rs                           | 22 +++++++++++-----------
 src/lib.rs                             |  2 +-
 src/query.rs                           | 14 +++++++-------
 src/query/score/default/zero_to_one.rs |  4 ++--
 tests/integrations_tests.rs            | 10 +++++-----
 5 files changed, 26 insertions(+), 26 deletions(-)

diff --git a/src/index.rs b/src/index.rs
index 7e47b70..11f72a8 100644
--- a/src/index.rs
+++ b/src/index.rs
@@ -290,7 +290,7 @@ pub fn add_document_to_index(
     tokenizer: Tokenizer,
     filter: Filter,
     key: T,
-    doc: D,
+    doc: &D,
 ) {
     let docs = &mut index.docs;
     let fields = &mut index.fields;
@@ -298,7 +298,7 @@ pub fn add_document_to_index(
     let mut term_counts: HashMap<String, Vec<usize>> = HashMap::new();
     let mut all_terms: Vec<String> = Vec::new();
     for i in 0..fields.len() {
-        if let Some(field_value) = field_accessors[i](&doc) {
+        if let Some(field_value) = field_accessors[i](doc) {
             let fields_len = fields.len();
             let mut field_details = fields.get_mut(i).unwrap();
@@ -593,7 +593,7 @@ mod tests {
             text: "a b c".to_string(),
         };

-        add_document_to_index(&mut index, &field_accessors, tokenizer, filter, doc.id, doc);
+        add_document_to_index(&mut index, &field_accessors, tokenizer, filter, doc.id, &doc);

         assert_eq!(index.docs.len(), 1);
         let (_, added_doc) = index.docs.iter().next().unwrap();
@@ -655,7 +655,7 @@ mod tests {
             tokenizer,
             filter,
             doc_1.id,
-            doc_1.clone(),
+            &doc_1,
         );

         add_document_to_index(
@@ -664,7 +664,7 @@ mod tests {
             tokenizer,
             filter,
             doc_2.id,
-            doc_2.clone(),
+            &doc_2,
         );

         assert_eq!(index.docs.len(), 2);
@@ -725,7 +725,7 @@ mod tests {
                 tokenizer,
                 filter,
                 doc_1.id,
-                doc_1,
+                &doc_1,
             );
         }
     }
@@ -751,7 +751,7 @@ mod tests {
             tokenizer,
             filter,
             doc.id,
-            doc,
+            &doc,
         )
     }
@@ -871,14 +871,14 @@ mod tests {
             text: "abe".to_string(),
         };

-        add_document_to_index(&mut index, &field_accessors, tokenizer, filter, doc.id, doc);
+        add_document_to_index(&mut index, &field_accessors, tokenizer, filter, doc.id, &doc);
         add_document_to_index(
             &mut index,
             &field_accessors,
             tokenizer,
             filter,
             doc_2.id,
-            doc_2,
+            &doc_2,
         );
         assert_eq!(count_nodes(&index), 5); //
     }
@@ -899,14 +899,14 @@ mod tests {
             text: "ab ef".to_string(),
         };

-        add_document_to_index(&mut index, &field_accessors, tokenizer, filter, doc.id, doc);
+        add_document_to_index(&mut index, &field_accessors, tokenizer, filter, doc.id, &doc);
         add_document_to_index(
             &mut index,
             &field_accessors,
             tokenizer,
             filter,
             doc_2.id,
-            doc_2,
+            &doc_2,
         );
         assert_eq!(count_nodes(&index), 7); //
     }
diff --git a/src/lib.rs b/src/lib.rs
index 93dfe92..d68768b 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -73,7 +73,7 @@ pub mod test_util {
                 id: i,
                 title: title.to_string(),
             };
-            add_document_to_index(&mut index, &[title_extract], tokenizer, filter, doc.id, doc);
+            add_document_to_index(&mut index, &[title_extract], tokenizer, filter, doc.id, &doc);
         }
         index
     }
diff --git a/src/query.rs b/src/query.rs
index 73539f0..acf5f7b 100644
--- a/src/query.rs
+++ b/src/query.rs
@@ -258,7 +258,7 @@ mod tests {
                 tokenizer,
                 filter,
                 doc.id,
-                doc,
+                &doc,
             );
         }
         let result = query(
@@ -301,7 +301,7 @@ mod tests {
                 tokenizer,
                 filter,
                 doc.id,
-                doc,
+                &doc,
             );
         }
@@ -357,7 +357,7 @@ mod tests {
                 tokenizer,
                 filter,
                 doc.id,
-                doc,
+                &doc,
             );
         }
@@ -401,7 +401,7 @@ mod tests {
                 tokenizer,
                 filter,
                 doc.id,
-                doc,
+                &doc,
             );
         }
@@ -446,7 +446,7 @@ mod tests {
                 tokenizer,
                 filter,
                 doc.id,
-                doc,
+                &doc,
             );
         }
@@ -505,7 +505,7 @@ mod tests {
                 tokenizer,
                 filter,
                 doc.id,
-                doc,
+                &doc,
             );
         }
         let exp = expand_term(&index, &"a".to_string(), &index.arena_index);
@@ -535,7 +535,7 @@ mod tests {
                 tokenizer,
                 filter,
                 doc.id,
-                doc,
+                &doc,
             );
         }
         let exp = expand_term(&index, &"x".to_string(), &index.arena_index);
diff --git a/src/query/score/default/zero_to_one.rs b/src/query/score/default/zero_to_one.rs
index ad8e062..85511a5 100644
--- a/src/query/score/default/zero_to_one.rs
+++ b/src/query/score/default/zero_to_one.rs
@@ -340,7 +340,7 @@ mod tests {
             tokenizer,
             filter,
             doc.id,
-            doc,
+            &doc,
         );
     }
@@ -391,7 +391,7 @@ mod tests {
             tokenizer,
             filter,
             doc.id,
-            doc,
+            &doc,
         );
     }
diff --git a/tests/integrations_tests.rs b/tests/integrations_tests.rs
index 83d7ce6..630de1e 100644
--- a/tests/integrations_tests.rs
+++ b/tests/integrations_tests.rs
@@ -57,7 +57,7 @@ pub fn test_add_query_delete_bm25() {
         tokenizer,
         filter,
         doc_1.id,
-        doc_1.clone(),
+        &doc_1,
     );

     add_document_to_index(
@@ -66,7 +66,7 @@ pub fn test_add_query_delete_bm25() {
         tokenizer,
         filter,
         doc_2.id,
-        doc_2,
+        &doc_2,
     );

     // Search, expected 2 results
@@ -144,7 +144,7 @@ pub fn test_add_query_delete_zero_to_one() {
         tokenizer,
         filter,
         doc_1.id,
-        doc_1.clone(),
+        &doc_1,
     );

     add_document_to_index(
@@ -153,7 +153,7 @@ pub fn test_add_query_delete_zero_to_one() {
         tokenizer,
         filter,
         doc_2.id,
-        doc_2,
+        &doc_2,
    );

     // Search, expected 2 results
@@ -215,6 +215,6 @@ pub fn it_is_thread_safe() {
         tokenizer,
         filter,
         doc_1.id,
-        doc_1.clone(),
+        &doc_1,
     );
 }
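Note on PATCH 1/8: after this change `add_document_to_index` borrows the document instead of taking it by value, so call sites drop their `.clone()` and the caller keeps ownership. A minimal sketch of the patched call; the `Doc` struct, `title_extract` accessor, and the `create_index` setup (and its module path) are assumed here for illustration, they are not part of this series:

    // Assumed import paths; only add_document_to_index appears in these diffs.
    use probly_search::index::{add_document_to_index, create_index};

    struct Doc {
        id: usize,
        title: String,
    }

    fn title_extract(d: &Doc) -> Option<&str> {
        Some(d.title.as_str())
    }

    // At this point in the series, Tokenizer and Filter still return owned
    // Strings; PATCH 8/8 later switches them to borrowed &str.
    fn tokenizer(s: &str) -> Vec<String> {
        s.split(' ').map(|t| t.to_owned()).collect()
    }

    fn filter(s: &str) -> String {
        s.to_owned()
    }

    fn main() {
        // One indexed field (the title), keyed by usize.
        let mut index = create_index::<usize>(1);
        let doc = Doc {
            id: 0,
            title: "a b c".to_string(),
        };
        // The document is only borrowed, so no clone is needed and
        // `doc` remains usable after indexing.
        add_document_to_index(&mut index, &[title_extract], tokenizer, filter, doc.id, &doc);
        assert_eq!(index.docs.len(), 1);
    }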
From 98f809f273428c0487dbd2af2815661a398d8dce Mon Sep 17 00:00:00 2001
From: muji
Date: Thu, 11 Aug 2022 15:36:28 +0800
Subject: [PATCH 2/8] Run cargo fmt.

---
 src/index.rs | 27 ++++++++++++++++++++++++---
 src/lib.rs   |  9 ++++++++-
 2 files changed, 32 insertions(+), 4 deletions(-)

diff --git a/src/index.rs b/src/index.rs
index 11f72a8..58f8fb1 100644
--- a/src/index.rs
+++ b/src/index.rs
@@ -593,7 +593,14 @@ mod tests {
             text: "a b c".to_string(),
         };

-        add_document_to_index(&mut index, &field_accessors, tokenizer, filter, doc.id, &doc);
+        add_document_to_index(
+            &mut index,
+            &field_accessors,
+            tokenizer,
+            filter,
+            doc.id,
+            &doc,
+        );

         assert_eq!(index.docs.len(), 1);
         let (_, added_doc) = index.docs.iter().next().unwrap();
@@ -871,7 +878,14 @@ mod tests {
             text: "abe".to_string(),
         };

-        add_document_to_index(&mut index, &field_accessors, tokenizer, filter, doc.id, &doc);
+        add_document_to_index(
+            &mut index,
+            &field_accessors,
+            tokenizer,
+            filter,
+            doc.id,
+            &doc,
+        );
         add_document_to_index(
             &mut index,
             &field_accessors,
             tokenizer,
             filter,
             doc_2.id,
@@ -899,7 +913,14 @@ mod tests {
             text: "ab ef".to_string(),
         };

-        add_document_to_index(&mut index, &field_accessors, tokenizer, filter, doc.id, &doc);
+        add_document_to_index(
+            &mut index,
+            &field_accessors,
+            tokenizer,
+            filter,
+            doc.id,
+            &doc,
+        );
         add_document_to_index(
             &mut index,
             &field_accessors,
             tokenizer,
             filter,
             doc_2.id,
diff --git a/src/lib.rs b/src/lib.rs
index d68768b..167e70a 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -73,7 +73,14 @@ pub mod test_util {
                 id: i,
                 title: title.to_string(),
             };
-            add_document_to_index(&mut index, &[title_extract], tokenizer, filter, doc.id, &doc);
+            add_document_to_index(
+                &mut index,
+                &[title_extract],
+                tokenizer,
+                filter,
+                doc.id,
+                &doc,
+            );
         }
         index
     }
From 391e17965e30f777ad153b3fe467ec6723197b5f Mon Sep 17 00:00:00 2001
From: muji
Date: Thu, 11 Aug 2022 15:37:45 +0800
Subject: [PATCH 3/8] Fix benchmark.

---
 benches/test_benchmark.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/benches/test_benchmark.rs b/benches/test_benchmark.rs
index fdd64bc..66b8439 100644
--- a/benches/test_benchmark.rs
+++ b/benches/test_benchmark.rs
@@ -62,6 +62,6 @@ fn add_all_documents(
             id: i,
             title: s.to_owned(),
         };
-        add_document_to_index(&mut index, extractor, tokenizer, filter, d.id, d);
+        add_document_to_index(&mut index, extractor, tokenizer, filter, d.id, &d);
     }
 }

From 559ab7380776f4388081d841736c2866258c6a3c Mon Sep 17 00:00:00 2001
From: muji
Date: Thu, 11 Aug 2022 17:19:33 +0800
Subject: [PATCH 4/8] Fix clippy warning.

---
 src/query/score/default/zero_to_one.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/query/score/default/zero_to_one.rs b/src/query/score/default/zero_to_one.rs
index 85511a5..9b7141b 100644
--- a/src/query/score/default/zero_to_one.rs
+++ b/src/query/score/default/zero_to_one.rs
@@ -71,7 +71,7 @@ impl ScoreCalculator

From: muji
Date: Thu, 11 Aug 2022 17:21:45 +0800
Subject: [PATCH 5/8] Update README.

---
 README.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 65396ef..7e2d697 100644
--- a/README.md
+++ b/README.md
@@ -95,7 +95,7 @@ add_document_to_index(
     tokenizer,
     filter,
     doc_1.id,
-    doc_1.clone(),
+    &doc_1,
 );

 add_document_to_index(
@@ -104,7 +104,7 @@ add_document_to_index(
     tokenizer,
     filter,
     doc_2.id,
-    doc_2,
+    &doc_2,
 );

 // Search, expected 2 results

From fd76dc0de1a1aeefddefa2918b7965ccd15b9d68 Mon Sep 17 00:00:00 2001
From: muji
Date: Thu, 11 Aug 2022 17:23:28 +0800
Subject: [PATCH 6/8] Bump version number.

---
 Cargo.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Cargo.toml b/Cargo.toml
index 41b50a8..ee371de 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,7 +1,7 @@
 [package]
 name = "probly-search"
 description = "A lightweight full-text search engine with a fully customizable scoring function"
-version = "1.2.4"
+version = "2.0.0-alpha-1"
 authors = ["marcus-pousette"]
 edition = "2018"
 license = "MIT"

From 2207438ab05a6b75b6363363233ebf4b3d7c4486 Mon Sep 17 00:00:00 2001
From: muji
Date: Fri, 12 Aug 2022 12:06:33 +0800
Subject: [PATCH 7/8] Run cargo fmt.

---
 src/query/score/default/zero_to_one.rs | 19 ++++++++-----------
 1 file changed, 8 insertions(+), 11 deletions(-)

diff --git a/src/query/score/default/zero_to_one.rs b/src/query/score/default/zero_to_one.rs
index 9b7141b..b8d9b37 100644
--- a/src/query/score/default/zero_to_one.rs
+++ b/src/query/score/default/zero_to_one.rs
@@ -70,17 +70,14 @@ impl ScoreCalculator

From: muji
Date: Fri, 12 Aug 2022 12:22:00 +0800
Subject: [PATCH 8/8] Use &str in Tokenizer and Filter.
---
 benches/test_benchmark.rs     | 10 ++++------
 src/index.rs                  | 18 ++++++++----------
 src/lib.rs                    | 10 ++++------
 src/query.rs                  | 21 ++++++++++-----------
 src/query/score/calculator.rs |  2 +-
 src/utils.rs                  |  4 ++--
 tests/integrations_tests.rs   | 10 ++++------
 7 files changed, 33 insertions(+), 42 deletions(-)

diff --git a/benches/test_benchmark.rs b/benches/test_benchmark.rs
index 66b8439..ad2fe83 100644
--- a/benches/test_benchmark.rs
+++ b/benches/test_benchmark.rs
@@ -8,13 +8,11 @@ struct DocX {
     title: String,
 }

-fn filter(s: &str) -> String {
-    s.to_owned()
+fn filter(s: &str) -> &str {
+    s
 }
-fn tokenizer(s: &str) -> Vec<String> {
-    s.split(' ')
-        .map(|slice| slice.to_owned())
-        .collect::<Vec<String>>()
+fn tokenizer(s: &str) -> Vec<&str> {
+    s.split(' ').collect::<Vec<&str>>()
 }

 pub fn test_speed(c: &mut Criterion) {
diff --git a/src/index.rs b/src/index.rs
index 58f8fb1..f3f6685 100644
--- a/src/index.rs
+++ b/src/index.rs
@@ -81,7 +81,7 @@
 Document Details object stores additional information about documents.

 * typeparam `T` Document key.
 */
-#[derive(Debug, PartialEq)]
+#[derive(Debug, PartialEq, Eq)]
 pub struct DocumentDetails<T> {
     /**
     Document key. It can be a simple unique ID or a direct reference to original document.
     */
@@ -308,16 +308,16 @@ pub fn add_document_to_index(
     // filter and count terms, ignore empty strings
     let mut filtered_terms_count = 0;
     for mut term in terms {
-        term = filter(&term);
+        term = filter(term);
         if !term.is_empty() {
             all_terms.push(term.to_owned());
             filtered_terms_count += 1;
-            let counts = term_counts.get_mut(&term);
+            let counts = term_counts.get_mut(term);
             match counts {
                 None => {
                     let mut new_count = vec![0; fields_len];
                     new_count[i] += 1;
-                    term_counts.insert(term, new_count);
+                    term_counts.insert(term.to_owned(), new_count);
                 }
                 Some(c) => {
                     c[i] += 1;
@@ -565,14 +565,12 @@ mod tests {
         text: String,
     }

-    fn tokenizer(s: &str) -> Vec<String> {
-        s.split(' ')
-            .map(|slice| slice.to_owned())
-            .collect::<Vec<String>>()
+    fn tokenizer(s: &str) -> Vec<&str> {
+        s.split(' ').collect::<Vec<&str>>()
     }

-    fn filter(s: &str) -> String {
-        s.to_owned()
+    fn filter(s: &str) -> &str {
+        s
     }
     fn field_accessor(doc: &Doc) -> Option<&str> {
         Some(doc.text.as_str())
     }
diff --git a/src/lib.rs b/src/lib.rs
index 167e70a..c027d79 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -19,17 +19,15 @@ pub mod test_util {
         id: usize,
         title: String,
     }
-    fn tokenizer(s: &str) -> Vec<String> {
-        s.split(' ')
-            .map(|slice| slice.to_owned().to_lowercase())
-            .collect::<Vec<String>>()
+    fn tokenizer(s: &str) -> Vec<&str> {
+        s.split(' ').collect::<Vec<&str>>()
     }
     fn title_extract(d: &Doc) -> Option<&str> {
         Some(d.title.as_str())
     }

-    fn filter(s: &str) -> String {
-        s.to_owned()
+    fn filter(s: &str) -> &str {
+        s
     }

     pub fn test_score<'arena, M, S: ScoreCalculator>(
diff --git a/src/query.rs b/src/query.rs
index acf5f7b..c204797 100644
--- a/src/query.rs
+++ b/src/query.rs
@@ -79,7 +79,7 @@ pub fn query(
     for (query_term_index, query_term_pre_filter) in query_terms.iter().enumerate() {
         let query_term = filter(query_term_pre_filter);
         if !query_term.is_empty() {
-            let expanded_terms = expand_term(index, &query_term, &index.arena_index);
+            let expanded_terms = expand_term(index, query_term, &index.arena_index);
             let mut visited_documents_for_term: HashSet<T> = HashSet::new();
             for query_term_expanded in expanded_terms {
                 let term_node_option =

                     if document_frequency > 0 {
                         let term_expansion_data = TermData {
                             query_term_index,
-                            all_query_terms: &query_terms,
-                            query_term: &query_term,
+                            all_query_terms: query_terms.clone(),
+                            query_term,
                             query_term_expanded: &query_term_expanded,
                         };
                         let pre_calculations = &score_calculator.before_each(
@@ -224,13 +224,12 @@ mod tests {
         Some(d.text.as_str())
     }

-    pub fn tokenizer(s: &str) -> Vec<String> {
-        s.split(' ')
-            .map(|slice| slice.to_owned())
-            .collect::<Vec<String>>()
+    pub fn tokenizer(s: &str) -> Vec<&str> {
+        s.split(' ').collect::<Vec<&str>>()
     }
-    pub fn filter(s: &str) -> String {
-        s.to_owned()
+
+    pub fn filter(s: &str) -> &str {
+        s
     }

     pub mod query {
@@ -405,9 +404,9 @@ mod tests {
         );
     }

-    fn custom_filter(s: &str) -> String {
+    fn custom_filter(s: &str) -> &str {
         if s == "a" {
-            return "".to_string();
+            return "";
         }
         filter(s)
     }
diff --git a/src/query/score/calculator.rs b/src/query/score/calculator.rs
index e1738f1..697442c 100644
--- a/src/query/score/calculator.rs
+++ b/src/query/score/calculator.rs
@@ -15,7 +15,7 @@ pub struct TermData<'a> {
     pub query_term_expanded: &'a str,

     // All available query terms
-    pub all_query_terms: &'a Vec<String>,
+    pub all_query_terms: Vec<&'a str>,
 }

 pub struct FieldData<'a> {
diff --git a/src/utils.rs b/src/utils.rs
index 846ef0f..69280be 100644
--- a/src/utils.rs
+++ b/src/utils.rs
@@ -1,3 +1,3 @@
 pub type FieldAccessor<D> = fn(&D) -> Option<&str>;
-pub type Tokenizer = fn(&str) -> Vec<String>;
-pub type Filter = fn(&str) -> String;
+pub type Tokenizer = fn(&str) -> Vec<&str>;
+pub type Filter = fn(&str) -> &str;
diff --git a/tests/integrations_tests.rs b/tests/integrations_tests.rs
index 630de1e..06c143c 100644
--- a/tests/integrations_tests.rs
+++ b/tests/integrations_tests.rs
@@ -15,10 +15,8 @@ struct Doc {
     description: String,
 }

-fn tokenizer(s: &str) -> Vec<String> {
-    s.split(' ')
-        .map(|slice| slice.to_owned())
-        .collect::<Vec<String>>()
+fn tokenizer(s: &str) -> Vec<&str> {
+    s.split(' ').collect::<Vec<&str>>()
 }
 fn title_extract(d: &Doc) -> Option<&str> {
     Some(d.title.as_str())
 }
@@ -28,8 +26,8 @@ fn description_extract(d: &Doc) -> Option<&str> {
     Some(d.description.as_str())
 }

-fn filter(s: &str) -> String {
-    s.to_owned()
+fn filter(s: &str) -> &str {
+    s
 }

 #[test]
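Closing note on the series: PATCH 1/8 and PATCH 8/8 together rework the ingestion API so that nothing is copied until the index actually stores a term. Documents are passed as `&D`, tokens are returned as `Vec<&str>` borrowing from the field text, and `filter` maps `&str` to `&str`; the remaining allocation is the `term.to_owned()` the index performs when inserting a term. One visible consequence in the src/lib.rs diff: a borrowing tokenizer cannot return transformed tokens, which is why the `.to_lowercase()` call disappears from the test tokenizer. A minimal sketch of callbacks written against the new `Tokenizer` and `Filter` type aliases from src/utils.rs:

    // Zero-allocation callbacks: both borrow from the input text,
    // matching pub type Tokenizer = fn(&str) -> Vec<&str> and
    // pub type Filter = fn(&str) -> &str.
    fn tokenizer(s: &str) -> Vec<&str> {
        s.split(' ').collect()
    }

    fn filter(s: &str) -> &str {
        // Return "" to drop a token, as custom_filter in the
        // query.rs tests does for the term "a".
        s
    }

The `TermData` change in src/query/score/calculator.rs follows the same logic: `all_query_terms` becomes `Vec<&'a str>`, so the `query_terms.clone()` in query.rs copies only string slices rather than heap-allocated Strings.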