Commit 2e35ea6
Use Cow for tokenizer.
tmpfs committed Aug 20, 2022
1 parent 0af49e8 commit 2e35ea6
Showing 5 changed files with 17 additions and 20 deletions.
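
The point of the change: a `Vec<&str>` tokenizer can only ever borrow from its input, while `Vec<Cow<'_, str>>` lets a tokenizer allocate only when a token actually needs transforming. A minimal sketch of what the new signature enables (the lowercasing tokenizer below is illustrative, not code from this commit):

use std::borrow::Cow;

// Hypothetical tokenizer (not part of this commit): borrows tokens
// untouched, but allocates an owned, lowercased copy only when a
// token actually contains uppercase characters.
fn normalizing_tokenizer(s: &str) -> Vec<Cow<'_, str>> {
    s.split(' ')
        .map(|t| {
            if t.chars().any(char::is_uppercase) {
                Cow::Owned(t.to_lowercase())
            } else {
                Cow::Borrowed(t)
            }
        })
        .collect()
}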
5 changes: 3 additions & 2 deletions benches/test_benchmark.rs
@@ -12,8 +12,9 @@ struct DocX {
 fn filter(s: &str) -> Cow<'_, str> {
     Cow::from(s)
 }
-fn tokenizer(s: &str) -> Vec<&str> {
-    s.split(' ').collect::<Vec<_>>()
+
+fn tokenizer(s: &str) -> Vec<Cow<'_, str>> {
+    s.split(' ').map(Cow::from).collect::<Vec<_>>()
 }
 
 pub fn test_speed(c: &mut Criterion) {
17 changes: 6 additions & 11 deletions src/index.rs
@@ -98,13 +98,14 @@ impl<T: Eq + Hash + Copy + Debug> Index<T> {
             // filter and count terms, ignore empty strings
             let mut filtered_terms_count = 0;
             for term in terms {
-                let filtered = filter(term);
+                let filtered = filter(term.as_ref());
                 let term = filtered.as_ref().to_owned();
                 if !term.is_empty() {
                     all_terms.push(term.clone());
                     filtered_terms_count += 1;
-                    let counts = term_counts.entry(term)
-                        .or_insert(vec![0; fields_len]);
+                    let counts = term_counts
+                        .entry(term)
+                        .or_insert_with(|| vec![0; fields_len]);
                     counts[i] += 1;
                 }
             }
@@ -448,7 +449,8 @@ fn create_inverted_index_nodes<T: Clone>(
 #[cfg(test)]
 mod tests {
     use super::*;
-    use std::borrow::Cow;
+
+    use crate::test_util::{filter, tokenizer};
 
     /// Count the amount of nodes of the index.
    ///
@@ -478,13 +480,6 @@ mod tests {
         text: String,
     }
 
-    fn tokenizer(s: &str) -> Vec<&str> {
-        s.split(' ').collect::<Vec<_>>()
-    }
-
-    fn filter(s: &str) -> Cow<'_, str> {
-        Cow::from(s)
-    }
     fn field_accessor(doc: &Doc) -> Option<&str> {
         Some(doc.text.as_str())
     }
6 changes: 3 additions & 3 deletions src/lib.rs
@@ -11,7 +11,7 @@ pub use query::QueryResult;
 pub type FieldAccessor<D> = fn(&D) -> Option<&str>;
 
 /// Function used to tokenize a field.
-pub type Tokenizer = fn(&str) -> Vec<&str>;
+pub type Tokenizer = fn(&str) -> Vec<Cow<'_, str>>;
 
 /// Function used to filter fields.
 pub type Filter = fn(&str) -> Cow<'_, str>;
@@ -42,8 +42,8 @@ pub mod test_util {
         Some(d.text.as_str())
     }
 
-    pub fn tokenizer(s: &str) -> Vec<&str> {
-        s.split(' ').collect::<Vec<_>>()
+    pub fn tokenizer(s: &str) -> Vec<Cow<'_, str>> {
+        s.split(' ').map(Cow::from).collect::<Vec<_>>()
     }
 
     pub fn filter(s: &str) -> Cow<'_, str> {
4 changes: 2 additions & 2 deletions src/score/calculator.rs
@@ -2,7 +2,7 @@ use crate::{
     index::{DocumentDetails, DocumentPointer, FieldDetails, InvertedIndexNode},
     QueryResult,
 };
-use std::{collections::HashMap, fmt::Debug};
+use std::{borrow::Cow, collections::HashMap, fmt::Debug};
 use typed_generational_arena::StandardIndex as ArenaIndex;
 
 pub struct TermData<'a> {
@@ -14,7 +14,7 @@ pub struct TermData<'a> {
     // from the current query term `query_term`
     pub query_term_expanded: &'a str,
     // All available query terms
-    pub all_query_terms: Vec<&'a str>,
+    pub all_query_terms: Vec<Cow<'a, str>>,
 }
 
 pub struct FieldData<'a> {
5 changes: 3 additions & 2 deletions tests/integrations_tests.rs
@@ -12,9 +12,10 @@ struct Doc {
     description: String,
 }
 
-fn tokenizer(s: &str) -> Vec<&str> {
-    s.split(' ').collect::<Vec<_>>()
+fn tokenizer(s: &str) -> Vec<Cow<'_, str>> {
+    s.split(' ').map(Cow::from).collect::<Vec<_>>()
 }
+
 fn title_extract(d: &Doc) -> Option<&str> {
     Some(d.title.as_str())
 }
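
Because `Cow<'_, str>` exposes the underlying `str`, downstream code such as the `all_query_terms` field in `TermData` can still compare tokens against plain string slices. An illustrative sketch (this helper is hypothetical, not part of the commit):

use std::borrow::Cow;

// Hypothetical helper: Cow<str> yields a &str via as_ref(), so
// equality checks work whether a token is borrowed or owned.
fn contains_term(all_query_terms: &[Cow<'_, str>], term: &str) -> bool {
    all_query_terms.iter().any(|t| t.as_ref() == term)
}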
