From 81fc5501d10bbbc7a77f5ed5d1b2d514206caf4a Mon Sep 17 00:00:00 2001 From: Marcus Pousette Date: Sat, 20 Aug 2022 23:25:57 +0200 Subject: [PATCH] refactor --- benches/test_benchmark.rs | 2 +- src/index.rs | 59 +++++++++++---------------- src/lib.rs | 19 ++++----- src/query.rs | 69 ++++++-------------------------- src/score/calculator.rs | 9 ++--- src/score/default/bm25.rs | 4 +- src/score/default/zero_to_one.rs | 1 - tests/integrations_tests.rs | 26 ++---------- 8 files changed, 55 insertions(+), 134 deletions(-) diff --git a/benches/test_benchmark.rs b/benches/test_benchmark.rs index a09f955..ddf2e4d 100644 --- a/benches/test_benchmark.rs +++ b/benches/test_benchmark.rs @@ -58,6 +58,6 @@ fn add_all_documents( id: i, title: s.to_owned(), }; - index.add_document(extractor, tokenizer, d.id, &d); + index.add_document(extractor, tokenizer, d.id, &d); } } diff --git a/src/index.rs b/src/index.rs index 6097863..c235097 100644 --- a/src/index.rs +++ b/src/index.rs @@ -1,12 +1,12 @@ use std::{ + borrow::Cow, fmt::{Debug, Formatter}, hash::Hash, - usize, borrow::Cow - + usize, }; -use hashbrown::{HashMap, HashSet}; use crate::{FieldAccessor, Tokenizer}; +use hashbrown::{HashMap, HashSet}; extern crate typed_generational_arena; use typed_generational_arena::StandardArena; use typed_generational_arena::StandardIndex as ArenaIndex; @@ -74,21 +74,20 @@ impl Index { self.removed.as_ref() } - /// Adds a document to the index. pub fn add_document( &mut self, field_accessors: &[FieldAccessor], tokenizer: Tokenizer, key: T, - doc: & D, + doc: &D, ) { let docs = &mut self.docs; let fields = &mut self.fields; let mut field_length = vec![0; fields.len()]; let mut term_counts: HashMap, Vec> = HashMap::new(); let mut all_terms: Vec> = Vec::new(); - + for i in 0..fields.len() { if let Some(field_value) = field_accessors[i](doc) { let fields_len = fields.len(); @@ -100,25 +99,15 @@ impl Index { // filter and count terms, ignore empty strings let mut filtered_terms_count = 0; for term in terms { - if !term.is_empty() { + if !term.is_empty() { filtered_terms_count += 1; - /* let counts = term_counts - .entry(term.clone()) - .or_insert_with(|| vec![0; fields_len]); */ - - let counts = match term_counts.get_mut(&term) - { - Some(counts) => { - counts}, - None => { - term_counts.insert(term.clone(), vec![0; fields_len]); - term_counts.get_mut(&term).unwrap() - } - }; - counts[i] += 1; - all_terms.push(term); - - } + all_terms.push(term.clone()); + let counts = term_counts + .entry(term) + .or_insert_with(|| vec![0; fields_len]); + + counts[i] += 1; + } } field_details.sum += filtered_terms_count; @@ -467,7 +456,7 @@ fn create_inverted_index_nodes( mod tests { use super::*; - use crate::test_util::{ tokenizer}; + use crate::test_util::tokenizer; /// Count the amount of nodes of the index. /// @@ -516,7 +505,7 @@ mod tests { text: "a b c".to_string(), }; - index.add_document(&field_accessors, tokenizer, doc.id, &doc); + index.add_document(&field_accessors, tokenizer, doc.id, &doc); assert_eq!(index.docs.len(), 1); let (_, added_doc) = index.docs.iter().next().unwrap(); @@ -572,9 +561,9 @@ mod tests { text: "b c d".to_string(), }; - index.add_document(&field_accessors, tokenizer, doc_1.id, &doc_1); + index.add_document(&field_accessors, tokenizer, doc_1.id, &doc_1); - index.add_document(&field_accessors, tokenizer, doc_2.id, &doc_2); + index.add_document(&field_accessors, tokenizer, doc_2.id, &doc_2); assert_eq!(index.docs.len(), 2); assert_eq!( @@ -598,7 +587,7 @@ mod tests { assert_eq!(&root.char, &char::from_u32(0).unwrap()); assert_eq!(&root.next.is_none(), &true); assert_eq!(&root.first_doc.is_none(), &true); - + let first_child = index.arena_index.get(root.first_child.unwrap()).unwrap(); assert_eq!(&first_child.char, &char::from_u32(100).unwrap()); assert_eq!(&first_child.first_child.is_none(), &true); @@ -628,7 +617,7 @@ mod tests { text: "a b".to_string(), // double space could introduce empty tokens }; - index.add_document(&field_accessors, tokenizer, doc_1.id, &doc_1); + index.add_document(&field_accessors, tokenizer, doc_1.id, &doc_1); } } @@ -646,7 +635,7 @@ mod tests { }]; for doc in docs { - index.add_document(&[field_accessor], tokenizer, doc.id, &doc) + index.add_document(&[field_accessor], tokenizer, doc.id, &doc) } index.remove_document(1); @@ -765,8 +754,8 @@ mod tests { text: "abe".to_string(), }; - index.add_document(&field_accessors, tokenizer, doc.id, &doc); - index.add_document(&field_accessors, tokenizer, doc_2.id, &doc_2); + index.add_document(&field_accessors, tokenizer, doc.id, &doc); + index.add_document(&field_accessors, tokenizer, doc_2.id, &doc_2); assert_eq!(count_nodes(&index), 5); // } @@ -786,8 +775,8 @@ mod tests { text: "ab ef".to_string(), }; - index.add_document(&field_accessors, tokenizer, doc.id, &doc); - index.add_document(&field_accessors, tokenizer, doc_2.id, &doc_2); + index.add_document(&field_accessors, tokenizer, doc.id, &doc); + index.add_document(&field_accessors, tokenizer, doc_2.id, &doc_2); assert_eq!(count_nodes(&index), 7); // } diff --git a/src/lib.rs b/src/lib.rs index 5059c24..f0db0d3 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -8,10 +8,10 @@ pub use index::*; pub use query::QueryResult; /// Function that extracts a field value from a document. -pub type FieldAccessor< D> = fn(& D) -> Option<&str>; +pub type FieldAccessor = fn(&D) -> Option<&str>; /// Function used to tokenize a field. -pub type Tokenizer = fn( &str) -> Vec>; +pub type Tokenizer = fn(&str) -> Vec>; #[cfg(test)] pub mod test_util { @@ -31,15 +31,15 @@ pub mod test_util { pub text: String, } - pub fn title_extract<'a>(d: &'a Doc) -> Option<&'a str> { + pub fn title_extract(d: &Doc) -> Option<&str> { Some(d.title.as_str()) } - pub fn text_extract<'a>(d: &'a Doc) -> Option<&'a str> { + pub fn text_extract(d: &Doc) -> Option<&str> { Some(d.text.as_str()) } - pub fn tokenizer<'a>(s: &'a str) -> Vec> { + pub fn tokenizer(s: &str) -> Vec> { s.split(' ').map(Cow::from).collect::>() } @@ -50,12 +50,7 @@ pub mod test_util { expected: Vec>, ) { let fields_len = idx.fields.len(); - let mut results = idx.query( - q, - score_calculator, - tokenizer, - &vec![1.; fields_len], - ); + let mut results = idx.query(q, score_calculator, tokenizer, &vec![1.; fields_len]); results.sort_by(|a, b| { let mut sort = b.score.partial_cmp(&a.score).unwrap(); sort = sort.then_with(|| a.key.partial_cmp(&b.key).unwrap()); @@ -82,7 +77,7 @@ pub mod test_util { title: title.to_string(), text: String::new(), }; - index.add_document(&[title_extract], tokenizer, doc.id, &doc); + index.add_document(&[title_extract], tokenizer, doc.id, &doc); } index } diff --git a/src/query.rs b/src/query.rs index f8a6ee9..254b87b 100644 --- a/src/query.rs +++ b/src/query.rs @@ -1,12 +1,9 @@ -use std::{ - fmt::Debug, - hash::Hash, -}; use hashbrown::{HashMap, HashSet}; +use std::{fmt::Debug, hash::Hash}; use typed_generational_arena::StandardArena; -use crate::{score::*, Index, InvertedIndexNode, Tokenizer}; +use crate::{score::*, Index, InvertedIndexNode, Tokenizer}; /// Result type for querying an index. #[derive(Debug, PartialEq)] @@ -23,17 +20,17 @@ impl Index { /// All token separators work as a disjunction operator. pub fn query<'a, M, S: ScoreCalculator>( &self, - query: &'a str, + query: &'a str, score_calculator: &mut S, tokenizer: Tokenizer, fields_boost: &[f64], ) -> Vec> { let removed = self.removed_documents(); - let query_terms = tokenizer(query);/* .iter().map(|term| term.to_string()).collect() */ - + let query_terms = tokenizer(query); /* .iter().map(|term| term.to_string()).collect() */ + let mut scores = HashMap::new(); let query_terms_len = query_terms.len(); - + for (query_term_index, query_term) in query_terms.iter().enumerate() { if !query_term.is_empty() { let expanded_terms = self.expand_term(query_term.as_ref(), &self.arena_index); @@ -94,7 +91,7 @@ impl Index { } } } - } + } } let mut result = Vec::new(); @@ -171,7 +168,6 @@ pub(crate) mod tests { use crate::test_util::*; use crate::Index; - use std::borrow::Cow; fn approx_equal(a: f64, b: f64, dp: u8) -> bool { let p: f64 = 10f64.powf(-(dp as f64)); @@ -198,18 +194,12 @@ pub(crate) mod tests { }, ]; for doc in docs { - index.add_document( - &[title_extract, text_extract], - tokenizer, - doc.id, - &doc, - ); + index.add_document(&[title_extract, text_extract], tokenizer, doc.id, &doc); } let result = index.query( &"a".to_string(), &mut crate::score::bm25::new(), tokenizer, - &[1., 1.], ); assert_eq!(result.len(), 1); @@ -237,20 +227,13 @@ pub(crate) mod tests { ]; for doc in docs { - index.add_document( - &[title_extract, text_extract], - tokenizer, - - doc.id, - &doc, - ); + index.add_document(&[title_extract, text_extract], tokenizer, doc.id, &doc); } let result = index.query( &"c".to_string(), &mut crate::score::bm25::new(), tokenizer, - &[1., 1.], ); @@ -291,20 +274,13 @@ pub(crate) mod tests { ]; for doc in docs { - index.add_document( - &[title_extract, text_extract], - tokenizer, - - doc.id, - &doc, - ); + index.add_document(&[title_extract, text_extract], tokenizer, doc.id, &doc); } let result = index.query( &"h".to_string(), &mut crate::score::bm25::new(), tokenizer, - &[1., 1.], ); assert_eq!(result.len(), 1); @@ -332,20 +308,13 @@ pub(crate) mod tests { ]; for doc in docs { - index.add_document( - &[title_extract, text_extract], - tokenizer, - - doc.id, - &doc, - ); + index.add_document(&[title_extract, text_extract], tokenizer, doc.id, &doc); } let result = index.query( &"a d".to_string(), &mut crate::score::bm25::new(), tokenizer, - &[1., 1.], ); assert_eq!(result.len(), 2); @@ -388,13 +357,7 @@ pub(crate) mod tests { ]; for doc in docs { - index.add_document( - &[title_extract, text_extract], - tokenizer, - - doc.id, - &doc, - ); + index.add_document(&[title_extract, text_extract], tokenizer, doc.id, &doc); } let exp = index.expand_term(&"a".to_string(), &index.arena_index); assert_eq!(exp, vec!["adef".to_string(), "abc".to_string()]); @@ -417,13 +380,7 @@ pub(crate) mod tests { ]; for doc in docs { - index.add_document( - &[title_extract, text_extract], - tokenizer, - - doc.id, - &doc, - ); + index.add_document(&[title_extract, text_extract], tokenizer, doc.id, &doc); } let exp = index.expand_term(&"x".to_string(), &index.arena_index); assert_eq!(exp, Vec::new() as Vec); diff --git a/src/score/calculator.rs b/src/score/calculator.rs index c462d84..2183796 100644 --- a/src/score/calculator.rs +++ b/src/score/calculator.rs @@ -2,9 +2,9 @@ use crate::{ index::{DocumentDetails, DocumentPointer, FieldDetails, InvertedIndexNode}, QueryResult, }; -use std::{ fmt::Debug}; -use typed_generational_arena::StandardIndex as ArenaIndex; use hashbrown::HashMap; +use std::fmt::Debug; +use typed_generational_arena::StandardIndex as ArenaIndex; pub struct TermData<'a> { // Current query term index. @@ -14,9 +14,8 @@ pub struct TermData<'a> { // Current expanded term from the expanded terms generated // from the current query term `query_term` pub query_term_expanded: &'a str, - - // All available query terms - pub query_terms_len: usize + // Total number of query terms present in the query this TermData was created from + pub query_terms_len: usize, } pub struct FieldData<'a> { diff --git a/src/score/default/bm25.rs b/src/score/default/bm25.rs index 5008b50..4882af3 100644 --- a/src/score/default/bm25.rs +++ b/src/score/default/bm25.rs @@ -2,8 +2,8 @@ https://en.wikipedia.org/wiki/Okapi_BM25 */ -use std::{fmt::Debug}; -use hashbrown::{HashMap}; +use hashbrown::HashMap; +use std::fmt::Debug; use crate::{ index::{DocumentDetails, DocumentPointer, InvertedIndexNode}, diff --git a/src/score/default/zero_to_one.rs b/src/score/default/zero_to_one.rs index a8e0084..4a70020 100644 --- a/src/score/default/zero_to_one.rs +++ b/src/score/default/zero_to_one.rs @@ -380,7 +380,6 @@ mod tests { x.add_document( &[title_extract, description_extract], tokenizer, - doc.id, &doc, ); diff --git a/tests/integrations_tests.rs b/tests/integrations_tests.rs index 0270e67..7068fd5 100644 --- a/tests/integrations_tests.rs +++ b/tests/integrations_tests.rs @@ -24,7 +24,6 @@ fn description_extract(d: &Doc) -> Option<&str> { Some(d.description.as_str()) } - #[test] pub fn test_add_query_delete_bm25() { // Create index with 2 fields @@ -46,7 +45,6 @@ pub fn test_add_query_delete_bm25() { index.add_document( &[title_extract, description_extract], tokenizer, - doc_1.id, &doc_1, ); @@ -54,13 +52,12 @@ pub fn test_add_query_delete_bm25() { index.add_document( &[title_extract, description_extract], tokenizer, - doc_2.id, &doc_2, ); // Search, expected 2 results - let mut result = index.query("abc", &mut bm25::new(), tokenizer, &[1., 1.]); + let mut result = index.query("abc", &mut bm25::new(), tokenizer, &[1., 1.]); assert_eq!(result.len(), 2); assert_eq!( result[0], @@ -84,7 +81,7 @@ pub fn test_add_query_delete_bm25() { index.vacuum(); // Search, expect 1 result - result = index.query("abc", &mut bm25::new(), tokenizer, &[1., 1.]); + result = index.query("abc", &mut bm25::new(), tokenizer, &[1., 1.]); assert_eq!(result.len(), 1); assert_eq!( result[0], @@ -114,7 +111,6 @@ pub fn test_add_query_delete_zero_to_one() { index.add_document( &[title_extract, description_extract], tokenizer, - doc_1.id, &doc_1, ); @@ -122,19 +118,12 @@ pub fn test_add_query_delete_zero_to_one() { index.add_document( &[title_extract, description_extract], tokenizer, - doc_2.id, &doc_2, ); // Search, expected 2 results - let mut result = index.query( - "abc", - &mut zero_to_one::new(), - tokenizer, - - &[1., 1.], - ); + let mut result = index.query("abc", &mut zero_to_one::new(), tokenizer, &[1., 1.]); assert_eq!(result.len(), 2); assert_eq!(result[0], QueryResult { key: 0, score: 1. }); assert_eq!( @@ -148,13 +137,7 @@ pub fn test_add_query_delete_zero_to_one() { index.remove_document(doc_1.id); // Search, expect 1 result - result = index.query( - "abc", - &mut zero_to_one::new(), - tokenizer, - - &[1., 1.], - ); + result = index.query("abc", &mut zero_to_one::new(), tokenizer, &[1., 1.]); assert_eq!(result.len(), 1); assert_eq!( result[0], @@ -179,7 +162,6 @@ pub fn it_is_thread_safe() { idx.add_document( &[title_extract, description_extract], tokenizer, - doc_1.id, &doc_1, );