refactor
marcus-pousette committed Aug 20, 2022
1 parent a25f1b0 commit 81fc550
Showing 8 changed files with 55 additions and 134 deletions.
2 changes: 1 addition & 1 deletion benches/test_benchmark.rs
@@ -58,6 +58,6 @@ fn add_all_documents(
id: i,
title: s.to_owned(),
};
index.add_document(extractor, tokenizer, d.id, &d);
}
}
59 changes: 24 additions & 35 deletions src/index.rs
@@ -1,12 +1,12 @@
use std::{
borrow::Cow,
fmt::{Debug, Formatter},
hash::Hash,
usize, borrow::Cow

usize,
};

use hashbrown::{HashMap, HashSet};
use crate::{FieldAccessor, Tokenizer};
use hashbrown::{HashMap, HashSet};
extern crate typed_generational_arena;
use typed_generational_arena::StandardArena;
use typed_generational_arena::StandardIndex as ArenaIndex;
@@ -74,21 +74,20 @@ impl<T: Eq + Hash + Copy + Debug> Index<T> {
self.removed.as_ref()
}


/// Adds a document to the index.
pub fn add_document<D>(
&mut self,
field_accessors: &[FieldAccessor<D>],
tokenizer: Tokenizer,
key: T,
doc: & D,
doc: &D,
) {
let docs = &mut self.docs;
let fields = &mut self.fields;
let mut field_length = vec![0; fields.len()];
let mut term_counts: HashMap<Cow<str>, Vec<usize>> = HashMap::new();
let mut all_terms: Vec<Cow<str>> = Vec::new();

for i in 0..fields.len() {
if let Some(field_value) = field_accessors[i](doc) {
let fields_len = fields.len();
@@ -100,25 +99,15 @@ impl<T: Eq + Hash + Copy + Debug> Index<T> {
// filter and count terms, ignore empty strings
let mut filtered_terms_count = 0;
for term in terms {
if !term.is_empty() {
filtered_terms_count += 1;
/* let counts = term_counts
.entry(term.clone())
.or_insert_with(|| vec![0; fields_len]); */

let counts = match term_counts.get_mut(&term)
{
Some(counts) => {
counts},
None => {
term_counts.insert(term.clone(), vec![0; fields_len]);
term_counts.get_mut(&term).unwrap()
}
};
counts[i] += 1;
all_terms.push(term);

}
all_terms.push(term.clone());
let counts = term_counts
.entry(term)
.or_insert_with(|| vec![0; fields_len]);

counts[i] += 1;
}
}

field_details.sum += filtered_terms_count;
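
Aside (not part of the diff): the core of this hunk swaps the manual get_mut / insert / unwrap sequence for HashMap's entry API. A minimal, self-contained sketch of that pattern follows; it uses std's HashMap (the crate uses hashbrown, whose entry API has the same shape), and the function name and signature here are invented for illustration.

use std::borrow::Cow;
use std::collections::HashMap;

// Count how often each non-empty term occurs in one field, creating the
// per-field counter vector lazily the first time a term is seen.
fn count_terms<'a>(
    terms: Vec<Cow<'a, str>>,
    field: usize,
    fields_len: usize,
) -> HashMap<Cow<'a, str>, Vec<usize>> {
    let mut term_counts: HashMap<Cow<'a, str>, Vec<usize>> = HashMap::new();
    for term in terms {
        if !term.is_empty() {
            // `entry` + `or_insert_with` replaces the match on `get_mut` shown above.
            let counts = term_counts
                .entry(term)
                .or_insert_with(|| vec![0; fields_len]);
            counts[field] += 1;
        }
    }
    term_counts
}

fn main() {
    let terms = vec![Cow::from("a"), Cow::from("b"), Cow::from("a"), Cow::from("")];
    let counts = count_terms(terms, 0, 2);
    assert_eq!(counts[&Cow::from("a")], vec![2, 0]);
}
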
@@ -467,7 +456,7 @@ fn create_inverted_index_nodes<T: Clone>(
mod tests {
use super::*;

use crate::test_util::{ tokenizer};
use crate::test_util::tokenizer;

/// Count the amount of nodes of the index.
///
@@ -516,7 +505,7 @@ mod tests {
text: "a b c".to_string(),
};

index.add_document(&field_accessors, tokenizer, doc.id, &doc);

assert_eq!(index.docs.len(), 1);
let (_, added_doc) = index.docs.iter().next().unwrap();
@@ -572,9 +561,9 @@ mod tests {
text: "b c d".to_string(),
};

index.add_document(&field_accessors, tokenizer, doc_1.id, &doc_1);

index.add_document(&field_accessors, tokenizer, doc_2.id, &doc_2);

assert_eq!(index.docs.len(), 2);
assert_eq!(
@@ -598,7 +587,7 @@ mod tests {
assert_eq!(&root.char, &char::from_u32(0).unwrap());
assert_eq!(&root.next.is_none(), &true);
assert_eq!(&root.first_doc.is_none(), &true);

let first_child = index.arena_index.get(root.first_child.unwrap()).unwrap();
assert_eq!(&first_child.char, &char::from_u32(100).unwrap());
assert_eq!(&first_child.first_child.is_none(), &true);
@@ -628,7 +617,7 @@ mod tests {
text: "a b".to_string(), // double space could introduce empty tokens
};

index.add_document(&field_accessors, tokenizer, doc_1.id, &doc_1);
}
}

@@ -646,7 +635,7 @@ mod tests {
}];

for doc in docs {
index.add_document(&[field_accessor], tokenizer, doc.id, &doc)
}

index.remove_document(1);
@@ -765,8 +754,8 @@ mod tests {
text: "abe".to_string(),
};

index.add_document(&field_accessors, tokenizer, doc.id, &doc);
index.add_document(&field_accessors, tokenizer, doc_2.id, &doc_2);
assert_eq!(count_nodes(&index), 5); //
}

@@ -786,8 +775,8 @@ mod tests {
text: "ab ef".to_string(),
};

index.add_document(&field_accessors, tokenizer, doc.id, &doc);
index.add_document(&field_accessors, tokenizer, doc_2.id, &doc_2);
assert_eq!(count_nodes(&index), 7); //
}

19 changes: 7 additions & 12 deletions src/lib.rs
@@ -8,10 +8,10 @@ pub use index::*;
pub use query::QueryResult;

/// Function that extracts a field value from a document.
pub type FieldAccessor< D> = fn(& D) -> Option<&str>;
pub type FieldAccessor<D> = fn(&D) -> Option<&str>;

/// Function used to tokenize a field.
pub type Tokenizer = fn( &str) -> Vec<Cow<'_, str>>;
pub type Tokenizer = fn(&str) -> Vec<Cow<'_, str>>;

#[cfg(test)]
pub mod test_util {
@@ -31,15 +31,15 @@ pub mod test_util {
pub text: String,
}

pub fn title_extract<'a>(d: &'a Doc) -> Option<&'a str> {
pub fn title_extract(d: &Doc) -> Option<&str> {
Some(d.title.as_str())
}

pub fn text_extract<'a>(d: &'a Doc) -> Option<&'a str> {
pub fn text_extract(d: &Doc) -> Option<&str> {
Some(d.text.as_str())
}

pub fn tokenizer<'a>(s: &'a str) -> Vec<Cow<'a, str>> {
pub fn tokenizer(s: &str) -> Vec<Cow<str>> {
s.split(' ').map(Cow::from).collect::<Vec<_>>()
}

@@ -50,12 +50,7 @@ pub mod test_util {
expected: Vec<QueryResult<usize>>,
) {
let fields_len = idx.fields.len();
let mut results = idx.query(
q,
score_calculator,
tokenizer,
&vec![1.; fields_len],
);
let mut results = idx.query(q, score_calculator, tokenizer, &vec![1.; fields_len]);
results.sort_by(|a, b| {
let mut sort = b.score.partial_cmp(&a.score).unwrap();
sort = sort.then_with(|| a.key.partial_cmp(&b.key).unwrap());
@@ -82,7 +77,7 @@ pub mod test_util {
title: title.to_string(),
text: String::new(),
};
index.add_document(&[title_extract], tokenizer, doc.id, &doc);
}
index
}
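
Aside (not part of the diff): the lib.rs change is mostly lifetime-elision cleanup around the two public function-pointer aliases. A small sketch of how they fit together; the alias definitions and the extractor/tokenizer bodies mirror the diff, while Doc and main are invented for illustration.

use std::borrow::Cow;

// The two aliases, as they read after this commit.
pub type FieldAccessor<D> = fn(&D) -> Option<&str>;
pub type Tokenizer = fn(&str) -> Vec<Cow<'_, str>>;

struct Doc {
    title: String,
}

// Elided form of the old `fn title_extract<'a>(d: &'a Doc) -> Option<&'a str>`.
fn title_extract(d: &Doc) -> Option<&str> {
    Some(d.title.as_str())
}

// Whitespace tokenizer, as in test_util.
fn tokenizer(s: &str) -> Vec<Cow<str>> {
    s.split(' ').map(Cow::from).collect()
}

fn main() {
    // Function items coerce to the fn-pointer aliases.
    let accessor: FieldAccessor<Doc> = title_extract;
    let tokenize: Tokenizer = tokenizer;

    let doc = Doc { title: "a b c".to_string() };
    let terms = tokenize(accessor(&doc).unwrap_or(""));
    assert_eq!(terms, vec![Cow::from("a"), Cow::from("b"), Cow::from("c")]);
}
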
69 changes: 13 additions & 56 deletions src/query.rs
@@ -1,12 +1,9 @@
use std::{
fmt::Debug,
hash::Hash,
};
use hashbrown::{HashMap, HashSet};
use std::{fmt::Debug, hash::Hash};

use typed_generational_arena::StandardArena;

use crate::{score::*, Index, InvertedIndexNode, Tokenizer};

/// Result type for querying an index.
#[derive(Debug, PartialEq)]
@@ -23,17 +20,17 @@ impl<T: Eq + Hash + Copy + Debug> Index<T> {
/// All token separators work as a disjunction operator.
pub fn query<'a, M, S: ScoreCalculator<T, M>>(
&self,
query: &'a str,
score_calculator: &mut S,
tokenizer: Tokenizer,
fields_boost: &[f64],
) -> Vec<QueryResult<T>> {
let removed = self.removed_documents();
let query_terms = tokenizer(query);/* .iter().map(|term| term.to_string()).collect() */
let query_terms = tokenizer(query); /* .iter().map(|term| term.to_string()).collect() */

let mut scores = HashMap::new();
let query_terms_len = query_terms.len();

for (query_term_index, query_term) in query_terms.iter().enumerate() {
if !query_term.is_empty() {
let expanded_terms = self.expand_term(query_term.as_ref(), &self.arena_index);
@@ -94,7 +91,7 @@ impl<T: Eq + Hash + Copy + Debug> Index<T> {
}
}
}
}
}

let mut result = Vec::new();
@@ -171,7 +168,6 @@ pub(crate) mod tests {

use crate::test_util::*;
use crate::Index;
use std::borrow::Cow;

fn approx_equal(a: f64, b: f64, dp: u8) -> bool {
let p: f64 = 10f64.powf(-(dp as f64));
@@ -198,18 +194,12 @@ pub(crate) mod tests {
},
];
for doc in docs {
index.add_document(
&[title_extract, text_extract],
tokenizer,
doc.id,
&doc,
);
index.add_document(&[title_extract, text_extract], tokenizer, doc.id, &doc);
}
let result = index.query(
&"a".to_string(),
&mut crate::score::bm25::new(),
tokenizer,

&[1., 1.],
);
assert_eq!(result.len(), 1);
@@ -237,20 +227,13 @@ pub(crate) mod tests {
];

for doc in docs {
index.add_document(
&[title_extract, text_extract],
tokenizer,

doc.id,
&doc,
);
index.add_document(&[title_extract, text_extract], tokenizer, doc.id, &doc);
}

let result = index.query(
&"c".to_string(),
&mut crate::score::bm25::new(),
tokenizer,

&[1., 1.],
);

@@ -291,20 +274,13 @@ pub(crate) mod tests {
];

for doc in docs {
index.add_document(
&[title_extract, text_extract],
tokenizer,

doc.id,
&doc,
);
index.add_document(&[title_extract, text_extract], tokenizer, doc.id, &doc);
}

let result = index.query(
&"h".to_string(),
&mut crate::score::bm25::new(),
tokenizer,

&[1., 1.],
);
assert_eq!(result.len(), 1);
@@ -332,20 +308,13 @@ pub(crate) mod tests {
];

for doc in docs {
index.add_document(
&[title_extract, text_extract],
tokenizer,

doc.id,
&doc,
);
index.add_document(&[title_extract, text_extract], tokenizer, doc.id, &doc);
}

let result = index.query(
&"a d".to_string(),
&mut crate::score::bm25::new(),
tokenizer,

&[1., 1.],
);
assert_eq!(result.len(), 2);
@@ -388,13 +357,7 @@ pub(crate) mod tests {
];

for doc in docs {
index.add_document(
&[title_extract, text_extract],
tokenizer,

doc.id,
&doc,
);
index.add_document(&[title_extract, text_extract], tokenizer, doc.id, &doc);
}
let exp = index.expand_term(&"a".to_string(), &index.arena_index);
assert_eq!(exp, vec!["adef".to_string(), "abc".to_string()]);
@@ -417,13 +380,7 @@ pub(crate) mod tests {
];

for doc in docs {
index.add_document(
&[title_extract, text_extract],
tokenizer,

doc.id,
&doc,
);
index.add_document(&[title_extract, text_extract], tokenizer, doc.id, &doc);
}
let exp = index.expand_term(&"x".to_string(), &index.arena_index);
assert_eq!(exp, Vec::new() as Vec<String>);
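
Aside (not part of the diff): an end-to-end sketch of the API these tests exercise. Only the add_document and query call shapes and the QueryResult key/score fields come from the diff; the crate name probly_search, the public score::bm25 path, the Index::new(number_of_fields) constructor, and the visibility of QueryResult's fields outside the crate are assumptions that may not match this commit exactly.

use std::borrow::Cow;

use probly_search::Index; // assumed crate name and re-export

struct Doc {
    id: usize,
    title: String,
}

fn title_extract(d: &Doc) -> Option<&str> {
    Some(d.title.as_str())
}

fn tokenizer(s: &str) -> Vec<Cow<str>> {
    s.split(' ').map(Cow::from).collect()
}

fn main() {
    // Hypothetical constructor: an index keyed by usize with a single field.
    let mut index = Index::<usize>::new(1);

    for (id, title) in [(1usize, "a b c"), (2usize, "c d e")] {
        let doc = Doc { id, title: title.to_string() };
        // Same call shape as the tests above.
        index.add_document(&[title_extract], tokenizer, doc.id, &doc);
    }

    // Single-term query with bm25 scoring and a boost of 1.0 for the only field.
    // Each whitespace-separated token in the query string acts as a disjunction.
    let results = index.query("c", &mut probly_search::score::bm25::new(), tokenizer, &[1.]);
    for r in results {
        println!("doc {} scored {:.3}", r.key, r.score);
    }
}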