refactor
marcus-pousette committed Aug 20, 2022
1 parent a25f1b0 commit 81fc550
Showing 8 changed files with 55 additions and 134 deletions.
2 changes: 1 addition & 1 deletion benches/test_benchmark.rs
@@ -58,6 +58,6 @@ fn add_all_documents(
id: i,
title: s.to_owned(),
};
index.add_document(extractor, tokenizer, d.id, &d);
}
}
59 changes: 24 additions & 35 deletions src/index.rs
@@ -1,12 +1,12 @@
use std::{
borrow::Cow,
fmt::{Debug, Formatter},
hash::Hash,
usize, borrow::Cow

usize,
};

use hashbrown::{HashMap, HashSet};
use crate::{FieldAccessor, Tokenizer};
use hashbrown::{HashMap, HashSet};
extern crate typed_generational_arena;
use typed_generational_arena::StandardArena;
use typed_generational_arena::StandardIndex as ArenaIndex;
@@ -74,21 +74,20 @@ impl<T: Eq + Hash + Copy + Debug> Index<T> {
self.removed.as_ref()
}


/// Adds a document to the index.
pub fn add_document<D>(
&mut self,
field_accessors: &[FieldAccessor<D>],
tokenizer: Tokenizer,
key: T,
doc: & D,
doc: &D,
) {
let docs = &mut self.docs;
let fields = &mut self.fields;
let mut field_length = vec![0; fields.len()];
let mut term_counts: HashMap<Cow<str>, Vec<usize>> = HashMap::new();
let mut all_terms: Vec<Cow<str>> = Vec::new();

for i in 0..fields.len() {
if let Some(field_value) = field_accessors[i](doc) {
let fields_len = fields.len();
@@ -100,25 +99,15 @@ impl<T: Eq + Hash + Copy + Debug> Index<T> {
// filter and count terms, ignore empty strings
let mut filtered_terms_count = 0;
for term in terms {
if !term.is_empty() {
filtered_terms_count += 1;
/* let counts = term_counts
.entry(term.clone())
.or_insert_with(|| vec![0; fields_len]); */

let counts = match term_counts.get_mut(&term)
{
Some(counts) => {
counts},
None => {
term_counts.insert(term.clone(), vec![0; fields_len]);
term_counts.get_mut(&term).unwrap()
}
};
counts[i] += 1;
all_terms.push(term);

}
all_terms.push(term.clone());
let counts = term_counts
.entry(term)
.or_insert_with(|| vec![0; fields_len]);

counts[i] += 1;
}
}

field_details.sum += filtered_terms_count;
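
Aside (not part of the diff): the core of this hunk swaps the manual get_mut / insert / unwrap sequence for HashMap's entry API. A minimal, self-contained sketch of that pattern follows; it uses std's HashMap (the crate uses hashbrown, whose entry API has the same shape), and the function name and signature here are invented for illustration.

use std::borrow::Cow;
use std::collections::HashMap;

// Count how often each non-empty term occurs in one field, creating the
// per-field counter vector lazily the first time a term is seen.
fn count_terms<'a>(
    terms: Vec<Cow<'a, str>>,
    field: usize,
    fields_len: usize,
) -> HashMap<Cow<'a, str>, Vec<usize>> {
    let mut term_counts: HashMap<Cow<'a, str>, Vec<usize>> = HashMap::new();
    for term in terms {
        if !term.is_empty() {
            // `entry` + `or_insert_with` replaces the match on `get_mut` shown above.
            let counts = term_counts
                .entry(term)
                .or_insert_with(|| vec![0; fields_len]);
            counts[field] += 1;
        }
    }
    term_counts
}

fn main() {
    let terms = vec![Cow::from("a"), Cow::from("b"), Cow::from("a"), Cow::from("")];
    let counts = count_terms(terms, 0, 2);
    assert_eq!(counts[&Cow::from("a")], vec![2, 0]);
}
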
@@ -467,7 +456,7 @@ fn create_inverted_index_nodes<T: Clone>(
mod tests {
use super::*;

use crate::test_util::{ tokenizer};
use crate::test_util::tokenizer;

/// Count the amount of nodes of the index.
///
@@ -516,7 +505,7 @@ mod tests {
text: "a b c".to_string(),
};

index.add_document(&field_accessors, tokenizer, doc.id, &doc);

assert_eq!(index.docs.len(), 1);
let (_, added_doc) = index.docs.iter().next().unwrap();
@@ -572,9 +561,9 @@ mod tests {
text: "b c d".to_string(),
};

index.add_document(&field_accessors, tokenizer, doc_1.id, &doc_1);

index.add_document(&field_accessors, tokenizer, doc_2.id, &doc_2);

assert_eq!(index.docs.len(), 2);
assert_eq!(
@@ -598,7 +587,7 @@ mod tests {
assert_eq!(&root.char, &char::from_u32(0).unwrap());
assert_eq!(&root.next.is_none(), &true);
assert_eq!(&root.first_doc.is_none(), &true);

let first_child = index.arena_index.get(root.first_child.unwrap()).unwrap();
assert_eq!(&first_child.char, &char::from_u32(100).unwrap());
assert_eq!(&first_child.first_child.is_none(), &true);
@@ -628,7 +617,7 @@ mod tests {
text: "a b".to_string(), // double space could introduce empty tokens
};

index.add_document(&field_accessors, tokenizer, doc_1.id, &doc_1);
}
}

@@ -646,7 +635,7 @@ mod tests {
}];

for doc in docs {
index.add_document(&[field_accessor], tokenizer, doc.id, &doc)
}

index.remove_document(1);
@@ -765,8 +754,8 @@ mod tests {
text: "abe".to_string(),
};

index.add_document(&field_accessors, tokenizer, doc.id, &doc);
index.add_document(&field_accessors, tokenizer, doc_2.id, &doc_2);
assert_eq!(count_nodes(&index), 5); //
}

@@ -786,8 +775,8 @@ mod tests {
text: "ab ef".to_string(),
};

index.add_document(&field_accessors, tokenizer, doc.id, &doc);
index.add_document(&field_accessors, tokenizer, doc_2.id, &doc_2);
assert_eq!(count_nodes(&index), 7); //
}

19 changes: 7 additions & 12 deletions src/lib.rs
@@ -8,10 +8,10 @@ pub use index::*;
pub use query::QueryResult;

/// Function that extracts a field value from a document.
pub type FieldAccessor< D> = fn(& D) -> Option<&str>;
pub type FieldAccessor<D> = fn(&D) -> Option<&str>;

/// Function used to tokenize a field.
pub type Tokenizer = fn( &str) -> Vec<Cow<'_, str>>;
pub type Tokenizer = fn(&str) -> Vec<Cow<'_, str>>;

#[cfg(test)]
pub mod test_util {
@@ -31,15 +31,15 @@ pub mod test_util {
pub text: String,
}

pub fn title_extract<'a>(d: &'a Doc) -> Option<&'a str> {
pub fn title_extract(d: &Doc) -> Option<&str> {
Some(d.title.as_str())
}

pub fn text_extract<'a>(d: &'a Doc) -> Option<&'a str> {
pub fn text_extract(d: &Doc) -> Option<&str> {
Some(d.text.as_str())
}

pub fn tokenizer<'a>(s: &'a str) -> Vec<Cow<'a, str>> {
pub fn tokenizer(s: &str) -> Vec<Cow<str>> {
s.split(' ').map(Cow::from).collect::<Vec<_>>()
}

@@ -50,12 +50,7 @@ pub mod test_util {
expected: Vec<QueryResult<usize>>,
) {
let fields_len = idx.fields.len();
let mut results = idx.query(
q,
score_calculator,
tokenizer,
&vec![1.; fields_len],
);
let mut results = idx.query(q, score_calculator, tokenizer, &vec![1.; fields_len]);
results.sort_by(|a, b| {
let mut sort = b.score.partial_cmp(&a.score).unwrap();
sort = sort.then_with(|| a.key.partial_cmp(&b.key).unwrap());
@@ -82,7 +77,7 @@ pub mod test_util {
title: title.to_string(),
text: String::new(),
};
index.add_document(&[title_extract], tokenizer, doc.id, &doc);
}
index
}
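
Aside (not part of the diff): the lib.rs change is mostly lifetime-elision cleanup around the two public function-pointer aliases. A small sketch of how they fit together; the alias definitions and the extractor/tokenizer bodies mirror the diff, while Doc and main are invented for illustration.

use std::borrow::Cow;

// The two aliases, as they read after this commit.
pub type FieldAccessor<D> = fn(&D) -> Option<&str>;
pub type Tokenizer = fn(&str) -> Vec<Cow<'_, str>>;

struct Doc {
    title: String,
}

// Elided form of the old `fn title_extract<'a>(d: &'a Doc) -> Option<&'a str>`.
fn title_extract(d: &Doc) -> Option<&str> {
    Some(d.title.as_str())
}

// Whitespace tokenizer, as in test_util.
fn tokenizer(s: &str) -> Vec<Cow<str>> {
    s.split(' ').map(Cow::from).collect()
}

fn main() {
    // Function items coerce to the fn-pointer aliases.
    let accessor: FieldAccessor<Doc> = title_extract;
    let tokenize: Tokenizer = tokenizer;

    let doc = Doc { title: "a b c".to_string() };
    let terms = tokenize(accessor(&doc).unwrap_or(""));
    assert_eq!(terms, vec![Cow::from("a"), Cow::from("b"), Cow::from("c")]);
}
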
69 changes: 13 additions & 56 deletions src/query.rs
@@ -1,12 +1,9 @@
use std::{
fmt::Debug,
hash::Hash,
};
use hashbrown::{HashMap, HashSet};
use std::{fmt::Debug, hash::Hash};

use typed_generational_arena::StandardArena;

use crate::{score::*, Index, InvertedIndexNode, Tokenizer};

/// Result type for querying an index.
#[derive(Debug, PartialEq)]
@@ -23,17 +20,17 @@ impl<T: Eq + Hash + Copy + Debug> Index<T> {
/// All token separators work as a disjunction operator.
pub fn query<'a, M, S: ScoreCalculator<T, M>>(
&self,
query: &'a str,
score_calculator: &mut S,
tokenizer: Tokenizer,
fields_boost: &[f64],
) -> Vec<QueryResult<T>> {
let removed = self.removed_documents();
let query_terms = tokenizer(query);/* .iter().map(|term| term.to_string()).collect() */
let query_terms = tokenizer(query); /* .iter().map(|term| term.to_string()).collect() */

let mut scores = HashMap::new();
let query_terms_len = query_terms.len();

for (query_term_index, query_term) in query_terms.iter().enumerate() {
if !query_term.is_empty() {
let expanded_terms = self.expand_term(query_term.as_ref(), &self.arena_index);
@@ -94,7 +91,7 @@ impl<T: Eq + Hash + Copy + Debug> Index<T> {
}
}
}
}
}

let mut result = Vec::new();
@@ -171,7 +168,6 @@ pub(crate) mod tests {

use crate::test_util::*;
use crate::Index;
use std::borrow::Cow;

fn approx_equal(a: f64, b: f64, dp: u8) -> bool {
let p: f64 = 10f64.powf(-(dp as f64));
@@ -198,18 +194,12 @@ pub(crate) mod tests {
},
];
for doc in docs {
index.add_document(
&[title_extract, text_extract],
tokenizer,
doc.id,
&doc,
);
index.add_document(&[title_extract, text_extract], tokenizer, doc.id, &doc);
}
let result = index.query(
&"a".to_string(),
&mut crate::score::bm25::new(),
tokenizer,

&[1., 1.],
);
assert_eq!(result.len(), 1);
@@ -237,20 +227,13 @@ pub(crate) mod tests {
];

for doc in docs {
index.add_document(
&[title_extract, text_extract],
tokenizer,

doc.id,
&doc,
);
index.add_document(&[title_extract, text_extract], tokenizer, doc.id, &doc);
}

let result = index.query(
&"c".to_string(),
&mut crate::score::bm25::new(),
tokenizer,

&[1., 1.],
);

@@ -291,20 +274,13 @@ pub(crate) mod tests {
];

for doc in docs {
index.add_document(
&[title_extract, text_extract],
tokenizer,

doc.id,
&doc,
);
index.add_document(&[title_extract, text_extract], tokenizer, doc.id, &doc);
}

let result = index.query(
&"h".to_string(),
&mut crate::score::bm25::new(),
tokenizer,

&[1., 1.],
);
assert_eq!(result.len(), 1);
@@ -332,20 +308,13 @@ pub(crate) mod tests {
];

for doc in docs {
index.add_document(
&[title_extract, text_extract],
tokenizer,

doc.id,
&doc,
);
index.add_document(&[title_extract, text_extract], tokenizer, doc.id, &doc);
}

let result = index.query(
&"a d".to_string(),
&mut crate::score::bm25::new(),
tokenizer,

&[1., 1.],
);
assert_eq!(result.len(), 2);
@@ -388,13 +357,7 @@ pub(crate) mod tests {
];

for doc in docs {
index.add_document(
&[title_extract, text_extract],
tokenizer,

doc.id,
&doc,
);
index.add_document(&[title_extract, text_extract], tokenizer, doc.id, &doc);
}
let exp = index.expand_term(&"a".to_string(), &index.arena_index);
assert_eq!(exp, vec!["adef".to_string(), "abc".to_string()]);
@@ -417,13 +380,7 @@ pub(crate) mod tests {
];

for doc in docs {
index.add_document(
&[title_extract, text_extract],
tokenizer,

doc.id,
&doc,
);
index.add_document(&[title_extract, text_extract], tokenizer, doc.id, &doc);
}
let exp = index.expand_term(&"x".to_string(), &index.arena_index);
assert_eq!(exp, Vec::new() as Vec<String>);
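
Aside (not part of the diff): an end-to-end sketch of the API these tests exercise. Only the add_document and query call shapes and the QueryResult key/score fields come from the diff; the crate name probly_search, the public score::bm25 path, the Index::new(number_of_fields) constructor, and the visibility of QueryResult's fields outside the crate are assumptions that may not match this commit exactly.

use std::borrow::Cow;

use probly_search::Index; // assumed crate name and re-export

struct Doc {
    id: usize,
    title: String,
}

fn title_extract(d: &Doc) -> Option<&str> {
    Some(d.title.as_str())
}

fn tokenizer(s: &str) -> Vec<Cow<str>> {
    s.split(' ').map(Cow::from).collect()
}

fn main() {
    // Hypothetical constructor: an index keyed by usize with a single field.
    let mut index = Index::<usize>::new(1);

    for (id, title) in [(1usize, "a b c"), (2usize, "c d e")] {
        let doc = Doc { id, title: title.to_string() };
        // Same call shape as the tests above.
        index.add_document(&[title_extract], tokenizer, doc.id, &doc);
    }

    // Single-term query with bm25 scoring and a boost of 1.0 for the only field.
    // Each whitespace-separated token in the query string acts as a disjunction.
    let results = index.query("c", &mut probly_search::score::bm25::new(), tokenizer, &[1.]);
    for r in results {
        println!("doc {} scored {:.3}", r.key, r.score);
    }
}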