From 3ee915e2e9363fdb6568b6631932e865844e8785 Mon Sep 17 00:00:00 2001 From: muji Date: Sat, 20 Aug 2022 09:23:18 +0800 Subject: [PATCH 1/9] Query is no longer mutable. --- src/query.rs | 86 +++++++++++++++++++-------------------- src/score/default/bm25.rs | 4 +- 2 files changed, 44 insertions(+), 46 deletions(-) diff --git a/src/query.rs b/src/query.rs index 6887f9f..826354d 100644 --- a/src/query.rs +++ b/src/query.rs @@ -33,7 +33,7 @@ impl Index { returns Array of QueryResult structs */ pub fn query>( - &mut self, + &self, query: &str, score_calculator: &mut S, tokenizer: Tokenizer, @@ -55,52 +55,49 @@ impl Index { &self.arena_index, ); if let Some(term_node_index) = term_node_option { - let document_frequency = - self.disconnect_and_count_documents(term_node_index, removed); + let document_frequency = 1; let term_node = self.arena_index.get(term_node_index).unwrap(); if let Some(term_node_option_first_doc) = term_node.first_doc { - if document_frequency > 0 { - let term_expansion_data = TermData { - query_term_index, - all_query_terms: query_terms.clone(), - query_term, - query_term_expanded: &query_term_expanded, - }; - let pre_calculations = &score_calculator.before_each( - &term_expansion_data, - document_frequency, - &self.docs, - ); - - let mut pointer = Some(term_node_option_first_doc); - while let Some(p) = pointer { - let pointer_borrowed = self.arena_doc.get(p).unwrap(); - let key = &pointer_borrowed.details_key; - if removed.is_none() || !removed.unwrap().contains(key) { - let fields = &self.fields; - let score = &score_calculator.score( - pre_calculations.as_ref(), - pointer_borrowed, - self.docs.get(key).unwrap(), - &term_node_index, - &FieldData { - fields_boost, - fields, - }, - &term_expansion_data, + let term_expansion_data = TermData { + query_term_index, + all_query_terms: query_terms.clone(), + query_term, + query_term_expanded: &query_term_expanded, + }; + let pre_calculations = &score_calculator.before_each( + &term_expansion_data, + document_frequency, + &self.docs, + ); + + let mut pointer = Some(term_node_option_first_doc); + while let Some(p) = pointer { + let pointer_borrowed = self.arena_doc.get(p).unwrap(); + let key = &pointer_borrowed.details_key; + if removed.is_none() || !removed.unwrap().contains(key) { + let fields = &self.fields; + let score = &score_calculator.score( + pre_calculations.as_ref(), + pointer_borrowed, + self.docs.get(key).unwrap(), + &term_node_index, + &FieldData { + fields_boost, + fields, + }, + &term_expansion_data, + ); + if let Some(s) = score { + let new_score = max_score_merger( + s, + scores.get(key), + visited_documents_for_term.contains(key), ); - if let Some(s) = score { - let new_score = max_score_merger( - s, - scores.get(key), - visited_documents_for_term.contains(key), - ); - scores.insert(*key, new_score); - } + scores.insert(*key, new_score); } - visited_documents_for_term.insert(*key); - pointer = pointer_borrowed.next; } + visited_documents_for_term.insert(*key); + pointer = pointer_borrowed.next; } } } @@ -266,9 +263,10 @@ pub(crate) mod tests { &[1., 1.], None, ); + assert_eq!(result.len(), 2); assert_eq!( - approx_equal(result.get(0).unwrap().score, 0.1823215567939546, 8), + approx_equal(result.get(0).unwrap().score, 0.6931471805599453, 8), true ); assert_eq!( @@ -276,7 +274,7 @@ pub(crate) mod tests { true ); assert_eq!( - approx_equal(result.get(1).unwrap().score, 0.1823215567939546, 8), + approx_equal(result.get(1).unwrap().score, 0.6931471805599453, 8), true ); assert_eq!( diff --git a/src/score/default/bm25.rs b/src/score/default/bm25.rs index 5409d9e..239a048 100644 --- a/src/score/default/bm25.rs +++ b/src/score/default/bm25.rs @@ -127,11 +127,11 @@ mod tests { vec![ QueryResult { key: 0, - score: 0.1823215567939546, + score: 0.6931471805599453, }, QueryResult { key: 1, - score: 0.1823215567939546, + score: 0.6931471805599453, }, ], ); From 39e2c211cded209cd4c29eb52e964bd056ae86b1 Mon Sep 17 00:00:00 2001 From: muji Date: Sat, 20 Aug 2022 09:43:05 +0800 Subject: [PATCH 2/9] Removed documents is an implementation detail. --- README.md | 7 ++---- src/index.rs | 46 +++++++++++++++++++++---------------- src/lib.rs | 1 - src/query.rs | 8 ++----- tests/integrations_tests.rs | 23 +++++-------------- 5 files changed, 36 insertions(+), 49 deletions(-) diff --git a/README.md b/README.md index d2a9b5c..3fafa2c 100644 --- a/README.md +++ b/README.md @@ -109,7 +109,6 @@ let mut result = index.query( tokenizer, filter, &[1., 1.], - None, ); assert_eq!(result.len(), 2); assert_eq!( @@ -128,11 +127,10 @@ assert_eq!( ); // Remove documents from index -let mut removed_docs = HashSet::new(); -index.remove_document(&mut removed_docs, doc_1.id); +index.remove_document(doc_1.id); // Vacuum to remove completely -index.vacuum(&mut removed_docs); +index.vacuum(); // Search, expect 1 result result = index.query( @@ -141,7 +139,6 @@ result = index.query( tokenizer, filter, &[1., 1.], - Some(&removed_docs), ); assert_eq!(result.len(), 1); assert_eq!( diff --git a/src/index.rs b/src/index.rs index ffb8ae1..ed27aa0 100644 --- a/src/index.rs +++ b/src/index.rs @@ -26,27 +26,19 @@ pub struct Index { pub(crate) arena_index: StandardArena>, pub(crate) arena_doc: StandardArena>, + + /// Documents that have been removed from the index but + /// need to be purged. + removed: Option>, } impl Index { - /** - Creates an Index. - * typeparam `T` Document key. - * `fieldsNum` Number of fields. - * returns `Index` - */ + /// Creates an index. pub fn new(fields_num: usize) -> Self { Self::new_with_capacity(fields_num, 1000, 10000) } - /** - Creates an Index. - * typeparam `T` Document key. - * `fieldsNum` Number of fields. - * `expected_index_size` Expected node count of index tree. - * `expected_documents_count` Expected amount of documents added - * returns `Index` - */ + /// Creates an index with the expected capacity. pub fn new_with_capacity( fields_num: usize, expected_index_size: usize, @@ -63,6 +55,7 @@ impl Index { fields, arena_doc, arena_index, + removed: None, } } @@ -74,6 +67,12 @@ impl Index { self.arena_index.get_mut(self.root).unwrap() } + /// Collection of documents that have been removed from the index + /// but not yet purged. + pub(crate) fn removed_documents(&self) -> Option<&HashSet> { + self.removed.as_ref() + } + /// Adds a document to the index. pub fn add_document( &mut self, @@ -166,7 +165,13 @@ impl Index { } /// Remove document from the index. - pub fn remove_document(&mut self, removed: &mut HashSet, key: T) { + pub fn remove_document(&mut self, key: T) { + if self.removed.is_none() { + self.removed = Some(Default::default()); + } + let removed = self.removed.as_mut().unwrap(); + + //let mut removed = HashSet::new(); let fields = &mut self.fields; let doc_details_option = self.docs.get(&key); let mut remove_key = false; @@ -193,9 +198,11 @@ impl Index { } /// Cleans up removed documents from the index. - pub fn vacuum(&mut self, removed: &mut HashSet) { - self.vacuum_node(self.root, removed); + pub fn vacuum(&mut self) { + let mut removed = self.removed.take().unwrap_or_default(); + self.vacuum_node(self.root, &removed); removed.clear(); + self.removed = None; } /// Recursively cleans up removed documents from the index. @@ -616,7 +623,6 @@ mod tests { let mut index = Index::::new(1); assert_eq!(index.arena_doc.is_empty(), true); - let mut removed = HashSet::new(); let docs = vec![Doc { id: 1, text: "a".to_string(), @@ -626,8 +632,8 @@ mod tests { index.add_document(&[field_accessor], tokenizer, filter, doc.id, &doc) } - index.remove_document(&mut removed, 1); - index.vacuum(&mut removed); + index.remove_document(1); + index.vacuum(); assert_eq!(index.docs.len(), 0); assert_eq!(index.fields.len(), 1); diff --git a/src/lib.rs b/src/lib.rs index b315a72..ca26257 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -60,7 +60,6 @@ pub mod test_util { tokenizer, filter, &vec![1.; fields_len], - None, ); results.sort_by(|a, b| { let mut sort = b.score.partial_cmp(&a.score).unwrap(); diff --git a/src/query.rs b/src/query.rs index 826354d..9359097 100644 --- a/src/query.rs +++ b/src/query.rs @@ -39,8 +39,9 @@ impl Index { tokenizer: Tokenizer, filter: Filter, fields_boost: &[f64], - removed: Option<&HashSet>, + //removed: Option<&HashSet>, ) -> Vec> { + let removed = self.removed_documents(); let query_terms = tokenizer(query); let mut scores = HashMap::new(); for (query_term_index, query_term_pre_filter) in query_terms.iter().enumerate() { @@ -219,7 +220,6 @@ pub(crate) mod tests { tokenizer, filter, &[1., 1.], - None, ); assert_eq!(result.len(), 1); assert_eq!( @@ -261,7 +261,6 @@ pub(crate) mod tests { tokenizer, filter, &[1., 1.], - None, ); assert_eq!(result.len(), 2); @@ -316,7 +315,6 @@ pub(crate) mod tests { tokenizer, filter, &[1., 1.], - None, ); assert_eq!(result.len(), 1); assert_eq!( @@ -364,7 +362,6 @@ pub(crate) mod tests { tokenizer, custom_filter, &[1., 1.], - None, ); assert_eq!(result.len(), 0); } @@ -401,7 +398,6 @@ pub(crate) mod tests { tokenizer, filter, &[1., 1.], - None, ); assert_eq!(result.len(), 2); assert_eq!( diff --git a/tests/integrations_tests.rs b/tests/integrations_tests.rs index 9d55a47..fca8da8 100644 --- a/tests/integrations_tests.rs +++ b/tests/integrations_tests.rs @@ -1,4 +1,4 @@ -use std::{collections::HashSet, sync::Mutex}; +use std::sync::Mutex; use probly_search::{ score::{bm25, zero_to_one}, @@ -63,7 +63,7 @@ pub fn test_add_query_delete_bm25() { ); // Search, expected 2 results - let mut result = index.query(&"abc", &mut bm25::new(), tokenizer, filter, &[1., 1.], None); + let mut result = index.query(&"abc", &mut bm25::new(), tokenizer, filter, &[1., 1.]); assert_eq!(result.len(), 2); assert_eq!( result[0], @@ -81,21 +81,13 @@ pub fn test_add_query_delete_bm25() { ); // Remove documents from index - let mut removed_docs = HashSet::new(); - index.remove_document(&mut removed_docs, doc_1.id); + index.remove_document(doc_1.id); // Vacuum to remove completely - index.vacuum(&mut removed_docs); + index.vacuum(); // Search, expect 1 result - result = index.query( - &"abc", - &mut bm25::new(), - tokenizer, - filter, - &[1., 1.], - Some(&removed_docs), - ); + result = index.query(&"abc", &mut bm25::new(), tokenizer, filter, &[1., 1.]); assert_eq!(result.len(), 1); assert_eq!( result[0], @@ -145,7 +137,6 @@ pub fn test_add_query_delete_zero_to_one() { tokenizer, filter, &[1., 1.], - None, ); assert_eq!(result.len(), 2); assert_eq!(result[0], QueryResult { key: 0, score: 1. }); @@ -157,8 +148,7 @@ pub fn test_add_query_delete_zero_to_one() { } ); - let mut removed_docs = HashSet::new(); - index.remove_document(&mut removed_docs, doc_1.id); + index.remove_document(doc_1.id); // Search, expect 1 result result = index.query( @@ -167,7 +157,6 @@ pub fn test_add_query_delete_zero_to_one() { tokenizer, filter, &[1., 1.], - Some(&removed_docs), ); assert_eq!(result.len(), 1); assert_eq!( From 9e6d1769ad2d371bedca96d98bbc1f5d1624a1a5 Mon Sep 17 00:00:00 2001 From: muji Date: Sat, 20 Aug 2022 10:03:40 +0800 Subject: [PATCH 3/9] Bump alpha version sequence. Bump rust edition. --- Cargo.toml | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index ee371de..444967f 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,9 +1,9 @@ [package] name = "probly-search" description = "A lightweight full-text search engine with a fully customizable scoring function" -version = "2.0.0-alpha-1" +version = "2.0.0-alpha-2" authors = ["marcus-pousette "] -edition = "2018" +edition = "2021" license = "MIT" homepage = "https://github.com/quantleaf/probly-search" repository = "https://github.com/quantleaf/probly-search" @@ -28,9 +28,6 @@ crate-type = ["cdylib", "rlib"] name = "test_benchmark" harness = false - [profile.dev] opt-level = 0 debug = true - - From 0a8037a2eaca36fe234ad857e3a7f6b056f99ad9 Mon Sep 17 00:00:00 2001 From: muji Date: Sat, 20 Aug 2022 10:17:46 +0800 Subject: [PATCH 4/9] Restore document frequency count in query. --- src/index.rs | 12 ++++++++++++ src/query.rs | 25 ++++++------------------- src/score/default/bm25.rs | 4 ++-- 3 files changed, 20 insertions(+), 21 deletions(-) diff --git a/src/index.rs b/src/index.rs index ed27aa0..f67b557 100644 --- a/src/index.rs +++ b/src/index.rs @@ -285,6 +285,18 @@ impl Index { document_frequency } + /// Count the document frequency. + pub(crate) fn count_documents(&self, node_index: ArenaIndex>) -> usize { + let node = self.arena_index.get(node_index).unwrap(); + let mut pointer_option = node.first_doc; + let mut document_frequency = 0; + while let Some(pointer) = pointer_option { + document_frequency += 1; + pointer_option = self.arena_doc.get(pointer).unwrap().next; + } + document_frequency + } + /// Finds inverted index node that matches the `term`. pub(crate) fn find_inverted_index_node( node: ArenaIndex>, diff --git a/src/query.rs b/src/query.rs index 9359097..e40d70f 100644 --- a/src/query.rs +++ b/src/query.rs @@ -17,21 +17,9 @@ pub struct QueryResult { } impl Index { - /** - Performs a search with a simple free text query. - All token separators work as a disjunction operator. - Arguments - * typeparam `T` Document key. - * `index`. - * `query` Query string. - * `score_calculator` A struct that implements the ScoreCalculator trait to provide score calculations. - * `tokenizer Tokenizer is a function that breaks a text into words, phrases, symbols, or other meaningful elements called tokens. - * `filter` Filter is a function that processes tokens and returns terms, terms are used in Inverted Index to index documents. - * `fields_boost` Fields boost factors. - * `remove`d Set of removed document keys. - - returns Array of QueryResult structs - */ + /// Performs a search with a simple free text query. + /// + /// All token separators work as a disjunction operator. pub fn query>( &self, query: &str, @@ -39,7 +27,6 @@ impl Index { tokenizer: Tokenizer, filter: Filter, fields_boost: &[f64], - //removed: Option<&HashSet>, ) -> Vec> { let removed = self.removed_documents(); let query_terms = tokenizer(query); @@ -56,7 +43,7 @@ impl Index { &self.arena_index, ); if let Some(term_node_index) = term_node_option { - let document_frequency = 1; + let document_frequency = self.count_documents(term_node_index); let term_node = self.arena_index.get(term_node_index).unwrap(); if let Some(term_node_option_first_doc) = term_node.first_doc { let term_expansion_data = TermData { @@ -265,7 +252,7 @@ pub(crate) mod tests { assert_eq!(result.len(), 2); assert_eq!( - approx_equal(result.get(0).unwrap().score, 0.6931471805599453, 8), + approx_equal(result.get(0).unwrap().score, 0.1823215567939546, 8), true ); assert_eq!( @@ -273,7 +260,7 @@ pub(crate) mod tests { true ); assert_eq!( - approx_equal(result.get(1).unwrap().score, 0.6931471805599453, 8), + approx_equal(result.get(1).unwrap().score, 0.1823215567939546, 8), true ); assert_eq!( diff --git a/src/score/default/bm25.rs b/src/score/default/bm25.rs index 239a048..5409d9e 100644 --- a/src/score/default/bm25.rs +++ b/src/score/default/bm25.rs @@ -127,11 +127,11 @@ mod tests { vec![ QueryResult { key: 0, - score: 0.6931471805599453, + score: 0.1823215567939546, }, QueryResult { key: 1, - score: 0.6931471805599453, + score: 0.1823215567939546, }, ], ); From a63082bc37f5b08b5cb56f9b39801f31bcdcf6d5 Mon Sep 17 00:00:00 2001 From: muji Date: Sat, 20 Aug 2022 10:53:48 +0800 Subject: [PATCH 5/9] Use Cow for filter function. --- benches/test_benchmark.rs | 5 +++-- src/index.rs | 25 +++++++++++++------------ src/lib.rs | 9 ++++++--- src/query.rs | 10 ++++++---- src/score/calculator.rs | 12 +++++------- tests/integrations_tests.rs | 6 +++--- 6 files changed, 36 insertions(+), 31 deletions(-) diff --git a/benches/test_benchmark.rs b/benches/test_benchmark.rs index 68bfd1d..3a29a1c 100644 --- a/benches/test_benchmark.rs +++ b/benches/test_benchmark.rs @@ -1,5 +1,6 @@ use criterion::{criterion_group, criterion_main, Criterion}; use probly_search::Index; +use std::borrow::Cow; criterion_group!(benches, test_speed); criterion_main!(benches); @@ -8,8 +9,8 @@ struct DocX { title: String, } -fn filter(s: &str) -> &str { - s +fn filter(s: &str) -> Cow<'_, str> { + Cow::from(s) } fn tokenizer(s: &str) -> Vec<&str> { s.split(' ').collect::>() diff --git a/src/index.rs b/src/index.rs index f67b557..ddacc7e 100644 --- a/src/index.rs +++ b/src/index.rs @@ -85,8 +85,8 @@ impl Index { let docs = &mut self.docs; let fields = &mut self.fields; let mut field_length = vec![0; fields.len()]; - let mut term_counts: HashMap<&str, Vec> = HashMap::new(); - let mut all_terms: Vec<&str> = Vec::new(); + let mut term_counts: HashMap> = HashMap::new(); + let mut all_terms: Vec = Vec::new(); for i in 0..fields.len() { if let Some(field_value) = field_accessors[i](doc) { let fields_len = fields.len(); @@ -97,12 +97,13 @@ impl Index { // filter and count terms, ignore empty strings let mut filtered_terms_count = 0; - for mut term in terms { - term = filter(term); + for term in terms { + let filtered = filter(term); + let term = filtered.as_ref().to_owned(); if !term.is_empty() { - all_terms.push(term); + all_terms.push(term.clone()); filtered_terms_count += 1; - let counts = term_counts.get_mut(term); + let counts = term_counts.get_mut(&term); match counts { None => { let mut new_count = vec![0; fields_len]; @@ -129,7 +130,7 @@ impl Index { let node = self.arena_index.get(node_index).unwrap(); if node.first_child.is_none() { node_index = - create_inverted_index_nodes(&mut self.arena_index, node_index, term, &i); + create_inverted_index_nodes(&mut self.arena_index, node_index, &term, &i); break; } let next_node = Index::::find_inverted_index_node_child_nodes_by_char( @@ -142,7 +143,7 @@ impl Index { node_index = create_inverted_index_nodes( &mut self.arena_index, node_index, - term, + &term, &i, ); break; @@ -157,7 +158,7 @@ impl Index { DocumentPointer { next: None, details_key: key.to_owned(), - term_frequency: term_counts[term].to_owned(), + term_frequency: term_counts[&term].to_owned(), }, &mut self.arena_doc, ) @@ -454,8 +455,8 @@ fn create_inverted_index_nodes( #[cfg(test)] mod tests { - use super::*; + use std::borrow::Cow; /// Count the amount of nodes of the index. /// @@ -489,8 +490,8 @@ mod tests { s.split(' ').collect::>() } - fn filter(s: &str) -> &str { - s + fn filter(s: &str) -> Cow<'_, str> { + Cow::from(s) } fn field_accessor(doc: &Doc) -> Option<&str> { Some(doc.text.as_str()) diff --git a/src/lib.rs b/src/lib.rs index ca26257..b01dc6c 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,3 +1,5 @@ +use std::borrow::Cow; + mod index; mod query; pub mod score; @@ -12,12 +14,13 @@ pub type FieldAccessor = fn(&D) -> Option<&str>; pub type Tokenizer = fn(&str) -> Vec<&str>; /// Function used to filter fields. -pub type Filter = fn(&str) -> &str; +pub type Filter = fn(&str) -> Cow<'_, str>; #[cfg(test)] pub mod test_util { use crate::{score::ScoreCalculator, Index, QueryResult}; + use std::borrow::Cow; fn approx_equal(a: f64, b: f64, dp: u8) -> bool { let p: f64 = 10f64.powf(-(dp as f64)); @@ -43,8 +46,8 @@ pub mod test_util { s.split(' ').collect::>() } - pub fn filter(s: &str) -> &str { - s + pub fn filter(s: &str) -> Cow<'_, str> { + Cow::from(s) } pub fn test_score<'arena, M, S: ScoreCalculator>( diff --git a/src/query.rs b/src/query.rs index e40d70f..8b870bd 100644 --- a/src/query.rs +++ b/src/query.rs @@ -33,8 +33,9 @@ impl Index { let mut scores = HashMap::new(); for (query_term_index, query_term_pre_filter) in query_terms.iter().enumerate() { let query_term = filter(query_term_pre_filter); + let query_term = query_term.as_ref(); if !query_term.is_empty() { - let expanded_terms = self.expand_term(query_term, &self.arena_index); + let expanded_terms = self.expand_term(query_term.as_ref(), &self.arena_index); let mut visited_documents_for_term = HashSet::new(); for query_term_expanded in expanded_terms { let term_node_option = Index::::find_inverted_index_node( @@ -164,9 +165,10 @@ fn max_score_merger( #[cfg(test)] pub(crate) mod tests { - use crate::Index; use crate::test_util::*; + use crate::Index; + use std::borrow::Cow; fn approx_equal(a: f64, b: f64, dp: u8) -> bool { let p: f64 = 10f64.powf(-(dp as f64)); @@ -337,9 +339,9 @@ pub(crate) mod tests { ); } - fn custom_filter(s: &str) -> &str { + fn custom_filter(s: &str) -> Cow<'_, str> { if s == "a" { - return ""; + return Cow::from(""); } filter(s) } diff --git a/src/score/calculator.rs b/src/score/calculator.rs index 4ea2c4c..f8b5066 100644 --- a/src/score/calculator.rs +++ b/src/score/calculator.rs @@ -6,14 +6,13 @@ use std::{collections::HashMap, fmt::Debug}; use typed_generational_arena::StandardIndex as ArenaIndex; pub struct TermData<'a> { - // The current query term + // Current query term index. pub query_term_index: usize, - + /// Current query term. pub query_term: &'a str, - - // The current expanded term from the expanded terms generated from the current query term `query_term` + // Current expanded term from the expanded terms generated + // from the current query term `query_term` pub query_term_expanded: &'a str, - // All available query terms pub all_query_terms: Vec<&'a str>, } @@ -21,8 +20,7 @@ pub struct TermData<'a> { pub struct FieldData<'a> { /// `fields_boost` expected boost from query arguments pub fields_boost: &'a [f64], - - /// Statistics about each field + /// Statistics about each field. pub fields: &'a [FieldDetails], } diff --git a/tests/integrations_tests.rs b/tests/integrations_tests.rs index fca8da8..03c7c2b 100644 --- a/tests/integrations_tests.rs +++ b/tests/integrations_tests.rs @@ -1,4 +1,4 @@ -use std::sync::Mutex; +use std::{borrow::Cow, sync::Mutex}; use probly_search::{ score::{bm25, zero_to_one}, @@ -23,8 +23,8 @@ fn description_extract(d: &Doc) -> Option<&str> { Some(d.description.as_str()) } -fn filter(s: &str) -> &str { - s +fn filter(s: &str) -> Cow<'_, str> { + Cow::from(s) } #[test] From 7b5fe60264308207df09691e7b9b1c31f9fcac08 Mon Sep 17 00:00:00 2001 From: muji Date: Sat, 20 Aug 2022 16:58:14 +0800 Subject: [PATCH 6/9] Restore document_frequency test. --- src/query.rs | 76 +++++++++++++++++++++++++++------------------------- 1 file changed, 39 insertions(+), 37 deletions(-) diff --git a/src/query.rs b/src/query.rs index 8b870bd..2e695a9 100644 --- a/src/query.rs +++ b/src/query.rs @@ -47,46 +47,48 @@ impl Index { let document_frequency = self.count_documents(term_node_index); let term_node = self.arena_index.get(term_node_index).unwrap(); if let Some(term_node_option_first_doc) = term_node.first_doc { - let term_expansion_data = TermData { - query_term_index, - all_query_terms: query_terms.clone(), - query_term, - query_term_expanded: &query_term_expanded, - }; - let pre_calculations = &score_calculator.before_each( - &term_expansion_data, - document_frequency, - &self.docs, - ); - - let mut pointer = Some(term_node_option_first_doc); - while let Some(p) = pointer { - let pointer_borrowed = self.arena_doc.get(p).unwrap(); - let key = &pointer_borrowed.details_key; - if removed.is_none() || !removed.unwrap().contains(key) { - let fields = &self.fields; - let score = &score_calculator.score( - pre_calculations.as_ref(), - pointer_borrowed, - self.docs.get(key).unwrap(), - &term_node_index, - &FieldData { - fields_boost, - fields, - }, - &term_expansion_data, - ); - if let Some(s) = score { - let new_score = max_score_merger( - s, - scores.get(key), - visited_documents_for_term.contains(key), + if document_frequency > 0 { + let term_expansion_data = TermData { + query_term_index, + all_query_terms: query_terms.clone(), + query_term, + query_term_expanded: &query_term_expanded, + }; + let pre_calculations = &score_calculator.before_each( + &term_expansion_data, + document_frequency, + &self.docs, + ); + + let mut pointer = Some(term_node_option_first_doc); + while let Some(p) = pointer { + let pointer_borrowed = self.arena_doc.get(p).unwrap(); + let key = &pointer_borrowed.details_key; + if removed.is_none() || !removed.unwrap().contains(key) { + let fields = &self.fields; + let score = &score_calculator.score( + pre_calculations.as_ref(), + pointer_borrowed, + self.docs.get(key).unwrap(), + &term_node_index, + &FieldData { + fields_boost, + fields, + }, + &term_expansion_data, ); - scores.insert(*key, new_score); + if let Some(s) = score { + let new_score = max_score_merger( + s, + scores.get(key), + visited_documents_for_term.contains(key), + ); + scores.insert(*key, new_score); + } } + visited_documents_for_term.insert(*key); + pointer = pointer_borrowed.next; } - visited_documents_for_term.insert(*key); - pointer = pointer_borrowed.next; } } } From 0af49e8cb798a1f12b755ac0301b9a377318e55b Mon Sep 17 00:00:00 2001 From: muji Date: Sat, 20 Aug 2022 17:18:05 +0800 Subject: [PATCH 7/9] Tidy logic for computing word counts. In Index::add_document(). --- src/index.rs | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) diff --git a/src/index.rs b/src/index.rs index ddacc7e..d72ebfe 100644 --- a/src/index.rs +++ b/src/index.rs @@ -103,17 +103,9 @@ impl Index { if !term.is_empty() { all_terms.push(term.clone()); filtered_terms_count += 1; - let counts = term_counts.get_mut(&term); - match counts { - None => { - let mut new_count = vec![0; fields_len]; - new_count[i] += 1; - term_counts.insert(term, new_count); - } - Some(c) => { - c[i] += 1; - } - } + let counts = term_counts.entry(term) + .or_insert(vec![0; fields_len]); + counts[i] += 1; } } From 2e35ea65ac9115c636a5c757ca5ea05d0763b510 Mon Sep 17 00:00:00 2001 From: muji Date: Sat, 20 Aug 2022 17:25:58 +0800 Subject: [PATCH 8/9] Use Cow for tokenizer. --- benches/test_benchmark.rs | 5 +++-- src/index.rs | 17 ++++++----------- src/lib.rs | 6 +++--- src/score/calculator.rs | 4 ++-- tests/integrations_tests.rs | 5 +++-- 5 files changed, 17 insertions(+), 20 deletions(-) diff --git a/benches/test_benchmark.rs b/benches/test_benchmark.rs index 3a29a1c..e7a62af 100644 --- a/benches/test_benchmark.rs +++ b/benches/test_benchmark.rs @@ -12,8 +12,9 @@ struct DocX { fn filter(s: &str) -> Cow<'_, str> { Cow::from(s) } -fn tokenizer(s: &str) -> Vec<&str> { - s.split(' ').collect::>() + +fn tokenizer(s: &str) -> Vec> { + s.split(' ').map(Cow::from).collect::>() } pub fn test_speed(c: &mut Criterion) { diff --git a/src/index.rs b/src/index.rs index d72ebfe..2b6608d 100644 --- a/src/index.rs +++ b/src/index.rs @@ -98,13 +98,14 @@ impl Index { // filter and count terms, ignore empty strings let mut filtered_terms_count = 0; for term in terms { - let filtered = filter(term); + let filtered = filter(term.as_ref()); let term = filtered.as_ref().to_owned(); if !term.is_empty() { all_terms.push(term.clone()); filtered_terms_count += 1; - let counts = term_counts.entry(term) - .or_insert(vec![0; fields_len]); + let counts = term_counts + .entry(term) + .or_insert_with(|| vec![0; fields_len]); counts[i] += 1; } } @@ -448,7 +449,8 @@ fn create_inverted_index_nodes( #[cfg(test)] mod tests { use super::*; - use std::borrow::Cow; + + use crate::test_util::{filter, tokenizer}; /// Count the amount of nodes of the index. /// @@ -478,13 +480,6 @@ mod tests { text: String, } - fn tokenizer(s: &str) -> Vec<&str> { - s.split(' ').collect::>() - } - - fn filter(s: &str) -> Cow<'_, str> { - Cow::from(s) - } fn field_accessor(doc: &Doc) -> Option<&str> { Some(doc.text.as_str()) } diff --git a/src/lib.rs b/src/lib.rs index b01dc6c..24b0310 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -11,7 +11,7 @@ pub use query::QueryResult; pub type FieldAccessor = fn(&D) -> Option<&str>; /// Function used to tokenize a field. -pub type Tokenizer = fn(&str) -> Vec<&str>; +pub type Tokenizer = fn(&str) -> Vec>; /// Function used to filter fields. pub type Filter = fn(&str) -> Cow<'_, str>; @@ -42,8 +42,8 @@ pub mod test_util { Some(d.text.as_str()) } - pub fn tokenizer(s: &str) -> Vec<&str> { - s.split(' ').collect::>() + pub fn tokenizer(s: &str) -> Vec> { + s.split(' ').map(Cow::from).collect::>() } pub fn filter(s: &str) -> Cow<'_, str> { diff --git a/src/score/calculator.rs b/src/score/calculator.rs index f8b5066..fe87e29 100644 --- a/src/score/calculator.rs +++ b/src/score/calculator.rs @@ -2,7 +2,7 @@ use crate::{ index::{DocumentDetails, DocumentPointer, FieldDetails, InvertedIndexNode}, QueryResult, }; -use std::{collections::HashMap, fmt::Debug}; +use std::{borrow::Cow, collections::HashMap, fmt::Debug}; use typed_generational_arena::StandardIndex as ArenaIndex; pub struct TermData<'a> { @@ -14,7 +14,7 @@ pub struct TermData<'a> { // from the current query term `query_term` pub query_term_expanded: &'a str, // All available query terms - pub all_query_terms: Vec<&'a str>, + pub all_query_terms: Vec>, } pub struct FieldData<'a> { diff --git a/tests/integrations_tests.rs b/tests/integrations_tests.rs index 03c7c2b..4b24749 100644 --- a/tests/integrations_tests.rs +++ b/tests/integrations_tests.rs @@ -12,9 +12,10 @@ struct Doc { description: String, } -fn tokenizer(s: &str) -> Vec<&str> { - s.split(' ').collect::>() +fn tokenizer(s: &str) -> Vec> { + s.split(' ').map(Cow::from).collect::>() } + fn title_extract(d: &Doc) -> Option<&str> { Some(d.title.as_str()) } From 326c5bf6b18c1482f8fb34d670350f25598cbad8 Mon Sep 17 00:00:00 2001 From: muji Date: Sat, 20 Aug 2022 17:52:51 +0800 Subject: [PATCH 9/9] Respect removed documents in count_documents(). --- src/index.rs | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/src/index.rs b/src/index.rs index 2b6608d..70bdcc9 100644 --- a/src/index.rs +++ b/src/index.rs @@ -285,7 +285,13 @@ impl Index { let mut pointer_option = node.first_doc; let mut document_frequency = 0; while let Some(pointer) = pointer_option { - document_frequency += 1; + let is_removed = match &self.removed { + Some(set) => set.contains(&self.arena_doc.get(pointer).unwrap().details_key), + None => false, + }; + if !is_removed { + document_frequency += 1; + } pointer_option = self.arena_doc.get(pointer).unwrap().next; } document_frequency