Skip to content

Commit

Permalink
Restore document frequency count in query.
Browse files Browse the repository at this point in the history
  • Loading branch information
tmpfs committed Aug 20, 2022
1 parent 9e6d176 commit 0a8037a
Show file tree
Hide file tree
Showing 3 changed files with 20 additions and 21 deletions.
12 changes: 12 additions & 0 deletions src/index.rs
Original file line number Diff line number Diff line change
Expand Up @@ -285,6 +285,18 @@ impl<T: Eq + Hash + Copy + Debug> Index<T> {
document_frequency
}

/// Count the number of documents attached to the inverted-index node at
/// `node_index` by walking its linked document list.
pub(crate) fn count_documents(&self, node_index: ArenaIndex<InvertedIndexNode<T>>) -> usize {
    // Resolve the node, then follow the `next` links from `first_doc`,
    // counting each entry; `successors` ends the chain at the first `None`.
    let start = self.arena_index.get(node_index).unwrap().first_doc;
    std::iter::successors(start, |&doc_index| {
        self.arena_doc.get(doc_index).unwrap().next
    })
    .count()
}

/// Finds inverted index node that matches the `term`.
pub(crate) fn find_inverted_index_node(
node: ArenaIndex<InvertedIndexNode<T>>,
Expand Down
25 changes: 6 additions & 19 deletions src/query.rs
Original file line number Diff line number Diff line change
Expand Up @@ -17,29 +17,16 @@ pub struct QueryResult<T> {
}

impl<T: Eq + Hash + Copy + Debug> Index<T> {
/**
Performs a search with a simple free text query.
All token separators work as a disjunction operator.
Arguments
* typeparam `T` Document key.
* `index`.
* `query` Query string.
* `score_calculator` A struct that implements the ScoreCalculator trait to provide score calculations.
* `tokenizer` Tokenizer is a function that breaks a text into words, phrases, symbols, or other meaningful elements called tokens.
* `filter` Filter is a function that processes tokens and returns terms, terms are used in Inverted Index to index documents.
* `fields_boost` Fields boost factors.
* `removed` Set of removed document keys.
returns Array of QueryResult structs
*/
/// Performs a search with a simple free text query.
///
/// All token separators work as a disjunction operator.
pub fn query<M, S: ScoreCalculator<T, M>>(
&self,
query: &str,
score_calculator: &mut S,
tokenizer: Tokenizer,
filter: Filter,
fields_boost: &[f64],
//removed: Option<&HashSet<T>>,
) -> Vec<QueryResult<T>> {
let removed = self.removed_documents();
let query_terms = tokenizer(query);
Expand All @@ -56,7 +43,7 @@ impl<T: Eq + Hash + Copy + Debug> Index<T> {
&self.arena_index,
);
if let Some(term_node_index) = term_node_option {
let document_frequency = 1;
let document_frequency = self.count_documents(term_node_index);
let term_node = self.arena_index.get(term_node_index).unwrap();
if let Some(term_node_option_first_doc) = term_node.first_doc {
let term_expansion_data = TermData {
Expand Down Expand Up @@ -265,15 +252,15 @@ pub(crate) mod tests {

assert_eq!(result.len(), 2);
assert_eq!(
approx_equal(result.get(0).unwrap().score, 0.6931471805599453, 8),
approx_equal(result.get(0).unwrap().score, 0.1823215567939546, 8),
true
);
assert_eq!(
result.get(0).unwrap().key == 1 || result.get(0).unwrap().key == 2,
true
);
assert_eq!(
approx_equal(result.get(1).unwrap().score, 0.6931471805599453, 8),
approx_equal(result.get(1).unwrap().score, 0.1823215567939546, 8),
true
);
assert_eq!(
Expand Down
4 changes: 2 additions & 2 deletions src/score/default/bm25.rs
Original file line number Diff line number Diff line change
Expand Up @@ -127,11 +127,11 @@ mod tests {
vec![
QueryResult {
key: 0,
score: 0.6931471805599453,
score: 0.1823215567939546,
},
QueryResult {
key: 1,
score: 0.6931471805599453,
score: 0.1823215567939546,
},
],
);
Expand Down

0 comments on commit 0a8037a

Please sign in to comment.