add_document cow keys
marcus-pousette committed Aug 20, 2022
1 parent 67e52f9 commit cb63e36
Showing 9 changed files with 88 additions and 138 deletions.
5 changes: 0 additions & 5 deletions README.md
@@ -64,11 +64,6 @@ fn description_extract(d: &Doc) -> Option<&str> {
Some(d.description.as_str())
}

// A no-op filter
fn filter(s: &str) -> &str {
s
}

// Create index with 2 fields
let mut index = Index::<usize>::new(2);

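For orientation, here is a minimal sketch of the README example after this change, with the no-op filter dropped from the add_document call. Only description_extract and Index::<usize>::new(2) appear verbatim above; the crate path, the Doc struct, and title_extract are assumptions based on the rest of this diff (see src/lib.rs below), not part of the commit:

use std::borrow::Cow;
// Crate path assumed; the README's use statements are not shown in this diff.
use probly_search::Index;

// Hypothetical document type; the README's actual struct may differ.
struct Doc {
    id: usize,
    title: String,
    description: String,
}

fn title_extract(d: &Doc) -> Option<&str> {
    Some(d.title.as_str())
}

fn description_extract(d: &Doc) -> Option<&str> {
    Some(d.description.as_str())
}

fn tokenizer(s: &str) -> Vec<Cow<'_, str>> {
    s.split(' ').map(Cow::from).collect()
}

fn main() {
    // Create index with 2 fields
    let mut index = Index::<usize>::new(2);

    let doc = Doc {
        id: 0,
        title: "abc".to_string(),
        description: "dfg".to_string(),
    };

    // The filter argument is gone; normalization now belongs in the tokenizer.
    index.add_document(&[title_extract, description_extract], tokenizer, doc.id, &doc);
}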
6 changes: 1 addition & 5 deletions benches/test_benchmark.rs
@@ -9,10 +9,6 @@ struct DocX {
title: String,
}

fn filter(s: &str) -> Cow<'_, str> {
Cow::from(s)
}

fn tokenizer(s: &str) -> Vec<Cow<'_, str>> {
s.split(' ').map(Cow::from).collect::<Vec<_>>()
}
@@ -62,6 +58,6 @@ fn add_all_documents(
id: i,
title: s.to_owned(),
};
index.add_document(extractor, tokenizer, filter, d.id, &d);
index.add_document(extractor, tokenizer, d.id, &d);
}
}
65 changes: 38 additions & 27 deletions src/index.rs
@@ -1,11 +1,12 @@
use std::{
collections::{HashMap, HashSet},
fmt::{Debug, Formatter},
hash::Hash,
usize,
usize, borrow::Cow

};

use crate::{FieldAccessor, Filter, Tokenizer};
use std::collections::{HashMap, HashSet};
use crate::{FieldAccessor, Tokenizer};
extern crate typed_generational_arena;
use typed_generational_arena::StandardArena;
use typed_generational_arena::StandardIndex as ArenaIndex;
@@ -73,20 +74,21 @@ impl<T: Eq + Hash + Copy + Debug> Index<T> {
self.removed.as_ref()
}


/// Adds a document to the index.
pub fn add_document<D>(
&mut self,
field_accessors: &[FieldAccessor<D>],
tokenizer: Tokenizer,
filter: Filter,
key: T,
doc: &D,
doc: & D,
) {
let docs = &mut self.docs;
let fields = &mut self.fields;
let mut field_length = vec![0; fields.len()];
let mut term_counts: HashMap<String, Vec<usize>> = HashMap::new();
let mut all_terms: Vec<String> = Vec::new();
let mut term_counts: HashMap<Cow<str>, Vec<usize>> = HashMap::new();
let mut all_terms: Vec<Cow<str>> = Vec::new();

for i in 0..fields.len() {
if let Some(field_value) = field_accessors[i](doc) {
let fields_len = fields.len();
@@ -98,16 +100,25 @@ impl<T: Eq + Hash + Copy + Debug> Index<T> {
// filter and count terms, ignore empty strings
let mut filtered_terms_count = 0;
for term in terms {
let filtered = filter(term.as_ref());
let term = filtered.as_ref().to_owned();
if !term.is_empty() {
all_terms.push(term.clone());
if !term.is_empty() {
filtered_terms_count += 1;
let counts = term_counts
.entry(term)
.or_insert_with(|| vec![0; fields_len]);
counts[i] += 1;
}
/* let counts = term_counts
.entry(term.clone())
.or_insert_with(|| vec![0; fields_len]); */

let counts = match term_counts.get_mut(&term)
{
Some(counts) => {
counts},
None => {
term_counts.insert(term.clone(), vec![0; fields_len]);
term_counts.get_mut(&term).unwrap()
}
};
counts[i] += 1;
all_terms.push(term);

}
}

field_details.sum += filtered_terms_count;
@@ -456,7 +467,7 @@ fn create_inverted_index_nodes<T: Clone>(
mod tests {
use super::*;

use crate::test_util::{filter, tokenizer};
use crate::test_util::{ tokenizer};

/// Count the amount of nodes of the index.
///
@@ -505,7 +516,7 @@ mod tests {
text: "a b c".to_string(),
};

index.add_document(&field_accessors, tokenizer, filter, doc.id, &doc);
index.add_document(&field_accessors, tokenizer, doc.id, &doc);

assert_eq!(index.docs.len(), 1);
let (_, added_doc) = index.docs.iter().next().unwrap();
@@ -561,9 +572,9 @@ mod tests {
text: "b c d".to_string(),
};

index.add_document(&field_accessors, tokenizer, filter, doc_1.id, &doc_1);
index.add_document(&field_accessors, tokenizer, doc_1.id, &doc_1);

index.add_document(&field_accessors, tokenizer, filter, doc_2.id, &doc_2);
index.add_document(&field_accessors, tokenizer, doc_2.id, &doc_2);

assert_eq!(index.docs.len(), 2);
assert_eq!(
@@ -587,7 +598,7 @@ mod tests {
assert_eq!(&root.char, &char::from_u32(0).unwrap());
assert_eq!(&root.next.is_none(), &true);
assert_eq!(&root.first_doc.is_none(), &true);

let first_child = index.arena_index.get(root.first_child.unwrap()).unwrap();
assert_eq!(&first_child.char, &char::from_u32(100).unwrap());
assert_eq!(&first_child.first_child.is_none(), &true);
@@ -617,7 +628,7 @@ mod tests {
text: "a b".to_string(), // double space could introduce empty tokens
};

index.add_document(&field_accessors, tokenizer, filter, doc_1.id, &doc_1);
index.add_document(&field_accessors, tokenizer, doc_1.id, &doc_1);
}
}

@@ -635,7 +646,7 @@ mod tests {
}];

for doc in docs {
index.add_document(&[field_accessor], tokenizer, filter, doc.id, &doc)
index.add_document(&[field_accessor], tokenizer, doc.id, &doc)
}

index.remove_document(1);
@@ -754,8 +765,8 @@ mod tests {
text: "abe".to_string(),
};

index.add_document(&field_accessors, tokenizer, filter, doc.id, &doc);
index.add_document(&field_accessors, tokenizer, filter, doc_2.id, &doc_2);
index.add_document(&field_accessors, tokenizer, doc.id, &doc);
index.add_document(&field_accessors, tokenizer, doc_2.id, &doc_2);
assert_eq!(count_nodes(&index), 5); //
}

@@ -775,8 +786,8 @@ mod tests {
text: "ab ef".to_string(),
};

index.add_document(&field_accessors, tokenizer, filter, doc.id, &doc);
index.add_document(&field_accessors, tokenizer, filter, doc_2.id, &doc_2);
index.add_document(&field_accessors, tokenizer, doc.id, &doc);
index.add_document(&field_accessors, tokenizer, doc_2.id, &doc_2);
assert_eq!(count_nodes(&index), 7); //
}

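The heart of this commit is the add_document hunk above: term_counts and all_terms are now keyed by Cow<str> instead of String, and the entry()-based update (left commented out in the diff) is replaced by a match on get_mut. Presumably the reason is that entry() takes ownership of its key, which would force term.clone() on every token, while the match clones the Cow only when a term is seen for the first time. A standalone sketch of the same pattern, with hypothetical helper names, not code from the crate:

use std::borrow::Cow;
use std::collections::HashMap;

// Count per-field occurrences of each term, cloning a term's Cow key
// only the first time that term is inserted into the map.
fn count_terms<'a>(
    terms: &[Cow<'a, str>],
    field: usize,
    fields_len: usize,
) -> HashMap<Cow<'a, str>, Vec<usize>> {
    let mut term_counts: HashMap<Cow<'a, str>, Vec<usize>> = HashMap::new();
    for term in terms {
        if term.is_empty() {
            continue;
        }
        // term_counts.entry(term.clone()) would clone on every iteration;
        // probing with get_mut first defers the clone to the miss path.
        let counts = match term_counts.get_mut(term) {
            Some(counts) => counts,
            None => {
                term_counts.insert(term.clone(), vec![0; fields_len]);
                term_counts.get_mut(term).unwrap()
            }
        };
        counts[field] += 1;
    }
    term_counts
}

// Example: two fields, counting terms seen in field 0.
// let counts = count_terms(&[Cow::from("a"), Cow::from("b"), Cow::from("a")], 0, 2);
// assert_eq!(counts[&Cow::from("a")], vec![2, 0]);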
20 changes: 6 additions & 14 deletions src/lib.rs
@@ -8,13 +8,10 @@ pub use index::*;
pub use query::QueryResult;

/// Function that extracts a field value from a document.
pub type FieldAccessor<D> = fn(&D) -> Option<&str>;
pub type FieldAccessor< D> = fn(& D) -> Option<&str>;

/// Function used to tokenize a field.
pub type Tokenizer = fn(&str) -> Vec<Cow<'_, str>>;

/// Function used to filter fields.
pub type Filter = fn(&str) -> Cow<'_, str>;
pub type Tokenizer = fn( &str) -> Vec<Cow<'_, str>>;

#[cfg(test)]
pub mod test_util {
@@ -34,22 +31,18 @@ pub mod test_util {
pub text: String,
}

pub fn title_extract(d: &Doc) -> Option<&str> {
pub fn title_extract<'a>(d: &'a Doc) -> Option<&'a str> {
Some(d.title.as_str())
}

pub fn text_extract(d: &Doc) -> Option<&str> {
pub fn text_extract<'a>(d: &'a Doc) -> Option<&'a str> {
Some(d.text.as_str())
}

pub fn tokenizer(s: &str) -> Vec<Cow<'_, str>> {
pub fn tokenizer<'a>(s: &'a str) -> Vec<Cow<'a, str>> {
s.split(' ').map(Cow::from).collect::<Vec<_>>()
}

pub fn filter(s: &str) -> Cow<'_, str> {
Cow::from(s)
}

pub fn test_score<'arena, M, S: ScoreCalculator<usize, M>>(
idx: &mut Index<usize>,
score_calculator: &mut S,
@@ -61,7 +54,6 @@ pub mod test_util {
q,
score_calculator,
tokenizer,
filter,
&vec![1.; fields_len],
);
results.sort_by(|a, b| {
@@ -90,7 +82,7 @@ pub mod test_util {
title: title.to_string(),
text: String::new(),
};
index.add_document(&[title_extract], tokenizer, filter, doc.id, &doc);
index.add_document(&[title_extract], tokenizer, doc.id, &doc);
}
index
}
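With the Filter alias removed from src/lib.rs, per-token normalization presumably has to live inside the tokenizer, which the Cow-based Tokenizer signature already allows: borrow tokens that need no change and allocate only for the ones that do. A hypothetical example, not part of this commit:

use std::borrow::Cow;

// A tokenizer that also lowercases, covering what a separate filter used to do.
// Tokens that are already lowercase are returned borrowed; only tokens that
// actually change allocate a new String.
fn lowercasing_tokenizer(s: &str) -> Vec<Cow<'_, str>> {
    s.split(' ')
        .map(|token| {
            if token.chars().any(|c| c.is_uppercase()) {
                Cow::Owned(token.to_lowercase())
            } else {
                Cow::Borrowed(token)
            }
        })
        .collect()
}

Because the signature matches the crate's Tokenizer alias, fn(&str) -> Vec<Cow<'_, str>>, such a function can be passed to add_document or query wherever the plain whitespace tokenizer is used above.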