Merge 81fc550 into 67e52f9
marcus-pousette committed Aug 20, 2022
2 parents 67e52f9 + 81fc550 commit 348ecc7
Showing 10 changed files with 55 additions and 183 deletions.
1 change: 1 addition & 0 deletions Cargo.toml
@@ -15,6 +15,7 @@ exclude = [".github/**", ".gitignore", ".rustfmt.toml"]
 
 [dependencies]
 typed-generational-arena = "0.2"
+hashbrown = "0.12.3"
 
 [dev-dependencies]
 lazy_static = "1.4.0"
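
The manifest change is just the new hashbrown dependency. As a minimal sketch (not part of this commit) of why the import swap in src/index.rs below is drop-in, hashbrown's HashMap and HashSet mirror the std::collections API:

    use hashbrown::{HashMap, HashSet};

    fn main() {
        // Same entry()/contains() API as std::collections, so only the
        // `use` line in src/index.rs needs to change.
        let mut counts: HashMap<&str, usize> = HashMap::new();
        *counts.entry("term").or_insert(0) += 1;

        let mut seen: HashSet<&str> = HashSet::new();
        seen.insert("term");

        assert_eq!(counts["term"], 1);
        assert!(seen.contains("term"));
    }

hashbrown is the SwissTable implementation that std's HashMap is itself built on; depending on it directly gets its faster default (ahash) hasher instead of SipHash.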
5 changes: 0 additions & 5 deletions README.md
@@ -64,11 +64,6 @@ fn description_extract(d: &Doc) -> Option<&str> {
     Some(d.description.as_str())
 }
 
-// A no-op filter
-fn filter(s: &str) -> &str {
-    s
-}
-
 // Create index with 2 fields
 let mut index = Index::<usize>::new(2);
 
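
With the no-op filter gone from the README example, `add_document` takes only the field accessors and the tokenizer. Any per-term cleanup now belongs in the tokenizer itself; a hedged sketch (illustrative, not from this repository) matching the `Tokenizer` alias from src/lib.rs:

    use std::borrow::Cow;

    // Matches `pub type Tokenizer = fn(&str) -> Vec<Cow<'_, str>>` from
    // src/lib.rs: split, normalize, and drop empty tokens in one pass.
    fn tokenizer(s: &str) -> Vec<Cow<'_, str>> {
        s.split(' ')
            .filter(|t| !t.is_empty())
            .map(|t| {
                if t.chars().any(|c| c.is_uppercase()) {
                    Cow::Owned(t.to_lowercase()) // allocate only when needed
                } else {
                    Cow::Borrowed(t)
                }
            })
            .collect()
    }

    fn main() {
        assert_eq!(tokenizer("a  B c"), vec!["a", "b", "c"]);
    }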
6 changes: 1 addition & 5 deletions benches/test_benchmark.rs
@@ -9,10 +9,6 @@ struct DocX {
     title: String,
 }
 
-fn filter(s: &str) -> Cow<'_, str> {
-    Cow::from(s)
-}
-
 fn tokenizer(s: &str) -> Vec<Cow<'_, str>> {
     s.split(' ').map(Cow::from).collect::<Vec<_>>()
 }
@@ -62,6 +58,6 @@ fn add_all_documents(
             id: i,
             title: s.to_owned(),
         };
-        index.add_document(extractor, tokenizer, filter, d.id, &d);
+        index.add_document(extractor, tokenizer, d.id, &d);
     }
 }
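
For reference, a sketch of a complete call site after this change, built only from signatures visible in this diff. The `probly_search` import is a placeholder; this page does not show the crate's actual name or module layout:

    use std::borrow::Cow;
    // Placeholder import: the crate name and module layout are assumed,
    // not shown on this page.
    use probly_search::Index;

    struct DocX {
        id: usize,
        title: String,
    }

    fn title_extract(d: &DocX) -> Option<&str> {
        Some(d.title.as_str())
    }

    fn tokenizer(s: &str) -> Vec<Cow<'_, str>> {
        s.split(' ').map(Cow::from).collect()
    }

    fn main() {
        // One indexed field, as in the benchmark above.
        let mut index = Index::<usize>::new(1);
        let d = DocX {
            id: 0,
            title: "hello world".to_owned(),
        };
        // No `filter` argument anymore; tokenizer output is used as-is.
        index.add_document(&[title_extract], tokenizer, d.id, &d);
    }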
36 changes: 18 additions & 18 deletions src/index.rs
@@ -1,11 +1,12 @@
 use std::{
-    collections::{HashMap, HashSet},
+    borrow::Cow,
     fmt::{Debug, Formatter},
     hash::Hash,
     usize,
 };
 
-use crate::{FieldAccessor, Filter, Tokenizer};
+use crate::{FieldAccessor, Tokenizer};
+use hashbrown::{HashMap, HashSet};
 extern crate typed_generational_arena;
 use typed_generational_arena::StandardArena;
 use typed_generational_arena::StandardIndex as ArenaIndex;
@@ -78,15 +79,15 @@ impl<T: Eq + Hash + Copy + Debug> Index<T> {
         &mut self,
         field_accessors: &[FieldAccessor<D>],
         tokenizer: Tokenizer,
-        filter: Filter,
         key: T,
         doc: &D,
     ) {
         let docs = &mut self.docs;
         let fields = &mut self.fields;
         let mut field_length = vec![0; fields.len()];
-        let mut term_counts: HashMap<String, Vec<usize>> = HashMap::new();
-        let mut all_terms: Vec<String> = Vec::new();
+        let mut term_counts: HashMap<Cow<str>, Vec<usize>> = HashMap::new();
+        let mut all_terms: Vec<Cow<str>> = Vec::new();
+
         for i in 0..fields.len() {
             if let Some(field_value) = field_accessors[i](doc) {
                 let fields_len = fields.len();
@@ -98,14 +99,13 @@ impl<T: Eq + Hash + Copy + Debug> Index<T> {
                 // filter and count terms, ignore empty strings
                 let mut filtered_terms_count = 0;
                 for term in terms {
-                    let filtered = filter(term.as_ref());
-                    let term = filtered.as_ref().to_owned();
                     if !term.is_empty() {
-                        all_terms.push(term.clone());
                         filtered_terms_count += 1;
+                        all_terms.push(term.clone());
                         let counts = term_counts
                             .entry(term)
                             .or_insert_with(|| vec![0; fields_len]);
+
                         counts[i] += 1;
                     }
                 }
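
One subtlety in the reordered loop above: `entry(term)` consumes the `Cow` key, so the push into `all_terms` has to clone the term before the map takes ownership. A standalone sketch of that ordering constraint:

    use std::borrow::Cow;
    use std::collections::HashMap;

    fn main() {
        let text = String::from("a");
        let term: Cow<str> = Cow::Borrowed(&text);
        let mut all_terms: Vec<Cow<str>> = Vec::new();
        let mut term_counts: HashMap<Cow<str>, Vec<usize>> = HashMap::new();

        // `entry(term)` moves the Cow into the map, so the copy kept in
        // `all_terms` must be cloned off first.
        all_terms.push(term.clone());
        let counts = term_counts.entry(term).or_insert_with(|| vec![0; 1]);
        counts[0] += 1;

        assert_eq!(all_terms.len(), 1);
        assert_eq!(term_counts[&Cow::Borrowed("a")], vec![1]);
    }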
@@ -456,7 +456,7 @@ fn create_inverted_index_nodes<T: Clone>(
 mod tests {
     use super::*;
 
-    use crate::test_util::{filter, tokenizer};
+    use crate::test_util::tokenizer;
 
     /// Count the amount of nodes of the index.
     ///
@@ -505,7 +505,7 @@ mod tests {
             text: "a b c".to_string(),
         };
 
-        index.add_document(&field_accessors, tokenizer, filter, doc.id, &doc);
+        index.add_document(&field_accessors, tokenizer, doc.id, &doc);
 
         assert_eq!(index.docs.len(), 1);
         let (_, added_doc) = index.docs.iter().next().unwrap();
@@ -561,9 +561,9 @@ mod tests {
             text: "b c d".to_string(),
         };
 
-        index.add_document(&field_accessors, tokenizer, filter, doc_1.id, &doc_1);
+        index.add_document(&field_accessors, tokenizer, doc_1.id, &doc_1);
 
-        index.add_document(&field_accessors, tokenizer, filter, doc_2.id, &doc_2);
+        index.add_document(&field_accessors, tokenizer, doc_2.id, &doc_2);
 
         assert_eq!(index.docs.len(), 2);
         assert_eq!(
@@ -617,7 +617,7 @@ mod tests {
             text: "a  b".to_string(), // double space could introduce empty tokens
         };
 
-        index.add_document(&field_accessors, tokenizer, filter, doc_1.id, &doc_1);
+        index.add_document(&field_accessors, tokenizer, doc_1.id, &doc_1);
     }
 }
 
@@ -635,7 +635,7 @@ mod tests {
         }];
 
         for doc in docs {
-            index.add_document(&[field_accessor], tokenizer, filter, doc.id, &doc)
+            index.add_document(&[field_accessor], tokenizer, doc.id, &doc)
         }
 
         index.remove_document(1);
@@ -754,8 +754,8 @@ mod tests {
             text: "abe".to_string(),
         };
 
-        index.add_document(&field_accessors, tokenizer, filter, doc.id, &doc);
-        index.add_document(&field_accessors, tokenizer, filter, doc_2.id, &doc_2);
+        index.add_document(&field_accessors, tokenizer, doc.id, &doc);
+        index.add_document(&field_accessors, tokenizer, doc_2.id, &doc_2);
         assert_eq!(count_nodes(&index), 5);
     }
 
@@ -775,8 +775,8 @@ mod tests {
             text: "ab ef".to_string(),
         };
 
-        index.add_document(&field_accessors, tokenizer, filter, doc.id, &doc);
-        index.add_document(&field_accessors, tokenizer, filter, doc_2.id, &doc_2);
+        index.add_document(&field_accessors, tokenizer, doc.id, &doc);
+        index.add_document(&field_accessors, tokenizer, doc_2.id, &doc_2);
         assert_eq!(count_nodes(&index), 7);
     }
 
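Taken together, the changes to add_document mean terms are now counted as `Cow<str>` keys instead of freshly allocated `String`s. A self-contained sketch of that counting scheme, using std's HashMap for a dependency-free example (the commit itself uses hashbrown's, which exposes the same API):

    use std::borrow::Cow;
    use std::collections::HashMap;

    // Per-field term counts keyed by Cow<str>: borrowed from the field
    // text, so no String allocation per term.
    fn count_terms<'a>(fields: &[&'a str]) -> HashMap<Cow<'a, str>, Vec<usize>> {
        let fields_len = fields.len();
        let mut term_counts: HashMap<Cow<'a, str>, Vec<usize>> = HashMap::new();
        for (i, field) in fields.iter().enumerate() {
            for term in field.split(' ') {
                if !term.is_empty() {
                    let counts = term_counts
                        .entry(Cow::Borrowed(term))
                        .or_insert_with(|| vec![0; fields_len]);
                    counts[i] += 1;
                }
            }
        }
        term_counts
    }

    fn main() {
        let counts = count_terms(&["a b a", "b c"]);
        assert_eq!(counts[&Cow::Borrowed("a")], vec![2, 0]);
        assert_eq!(counts[&Cow::Borrowed("b")], vec![1, 1]);
    }
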
19 changes: 3 additions & 16 deletions src/lib.rs
@@ -13,9 +13,6 @@ pub type FieldAccessor<D> = fn(&D) -> Option<&str>;
 /// Function used to tokenize a field.
 pub type Tokenizer = fn(&str) -> Vec<Cow<'_, str>>;
 
-/// Function used to filter fields.
-pub type Filter = fn(&str) -> Cow<'_, str>;
-
 #[cfg(test)]
 pub mod test_util {
 
@@ -42,28 +39,18 @@ pub mod test_util {
         Some(d.text.as_str())
     }
 
-    pub fn tokenizer(s: &str) -> Vec<Cow<'_, str>> {
+    pub fn tokenizer(s: &str) -> Vec<Cow<str>> {
         s.split(' ').map(Cow::from).collect::<Vec<_>>()
     }
 
-    pub fn filter(s: &str) -> Cow<'_, str> {
-        Cow::from(s)
-    }
-
     pub fn test_score<'arena, M, S: ScoreCalculator<usize, M>>(
         idx: &mut Index<usize>,
         score_calculator: &mut S,
         q: &str,
         expected: Vec<QueryResult<usize>>,
     ) {
        let fields_len = idx.fields.len();
-        let mut results = idx.query(
-            q,
-            score_calculator,
-            tokenizer,
-            filter,
-            &vec![1.; fields_len],
-        );
+        let mut results = idx.query(q, score_calculator, tokenizer, &vec![1.; fields_len]);
        results.sort_by(|a, b| {
            let mut sort = b.score.partial_cmp(&a.score).unwrap();
            sort = sort.then_with(|| a.key.partial_cmp(&b.key).unwrap());
@@ -90,7 +77,7 @@ pub mod test_util {
             title: title.to_string(),
             text: String::new(),
         };
-        index.add_document(&[title_extract], tokenizer, filter, doc.id, &doc);
+        index.add_document(&[title_extract], tokenizer, doc.id, &doc);
     }
     index
 }
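
After this commit the public callback surface in src/lib.rs is down to two fn-pointer aliases. A small self-contained sketch of how they compose; the aliases are copied from the diff above, the rest is illustrative:

    use std::borrow::Cow;

    // The two aliases that remain public (the `Filter` alias is gone).
    type FieldAccessor<D> = fn(&D) -> Option<&str>;
    type Tokenizer = fn(&str) -> Vec<Cow<'_, str>>;

    struct Doc {
        text: String,
    }

    fn text_extract(d: &Doc) -> Option<&str> {
        Some(d.text.as_str())
    }

    fn tokenize(s: &str) -> Vec<Cow<'_, str>> {
        s.split(' ').map(Cow::from).collect()
    }

    fn main() {
        let accessor: FieldAccessor<Doc> = text_extract;
        let tokenizer: Tokenizer = tokenize;
        let doc = Doc { text: "a b".to_owned() };
        let terms = tokenizer(accessor(&doc).unwrap());
        assert_eq!(terms, vec!["a", "b"]);
    }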