Skip to content

Commit

Permalink
Merge pull request #21 from tmpfs/multi-field
Browse files: browse the repository at this point in the history
Allow extraction from fields that may be collections.
  • Loading branch information
marcus-pousette committed Jan 5, 2023
2 parents 3767aa6 + ce881db commit 8d215e4
Show file tree
Hide file tree
Showing 7 changed files with 39 additions and 44 deletions.
8 changes: 4 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -55,13 +55,13 @@ fn tokenizer(s: &str) -> Vec<Cow<str>> {
// We have to provide extraction functions for the fields we want to index

// Title
fn title_extract(d: &Doc) -> Option<&str> {
Some(d.title.as_str())
fn title_extract(d: &Doc) -> Vec<&str> {
vec![d.title.as_str()]
}

// Description
fn description_extract(d: &Doc) -> Option<&str> {
Some(d.description.as_str())
fn description_extract(d: &Doc) -> Vec<&str> {
vec![d.description.as_str()]
}

// Create index with 2 fields
Expand Down
8 changes: 4 additions & 4 deletions benches/test_benchmark.rs
Original file line number Diff line number Diff line change
Expand Up @@ -30,8 +30,8 @@ pub fn test_speed(c: &mut Criterion) {
}
s
}
fn title_extract_x(d: &DocX) -> Option<&str> {
Some(d.title.as_str())
fn title_extract_x(d: &DocX) -> Vec<&str> {
vec![d.title.as_str()]
}

c.bench_function("add_100k_docs", |b| {
Expand All @@ -43,14 +43,14 @@ pub fn test_speed(c: &mut Criterion) {
new_rand.push_str(&generate_string(0, 4));
random_strings.push(new_rand);
}
let extractor = [title_extract_x as fn(&_) -> Option<&str>];
let extractor = [title_extract_x as fn(&DocX) -> Vec<&str>];
b.iter(|| add_all_documents(&mut index, &extractor, &random_strings));
});
}

fn add_all_documents(
index: &mut Index<usize>,
extractor: &[fn(&DocX) -> Option<&str>],
extractor: &[fn(&DocX) -> Vec<&str>],
random_strings: &[String],
) {
for (i, s) in random_strings.iter().enumerate() {
Expand Down
28 changes: 12 additions & 16 deletions src/index.rs
Original file line number Diff line number Diff line change
Expand Up @@ -89,10 +89,10 @@ impl<T: Eq + Hash + Copy + Debug> Index<T> {
let mut all_terms: Vec<Cow<str>> = Vec::new();

for i in 0..fields.len() {
if let Some(field_value) = field_accessors[i](doc) {
let fields_len = fields.len();
let mut field_details = fields.get_mut(i).unwrap();

let field_values = field_accessors[i](doc);
let fields_len = fields.len();
let mut field_details = fields.get_mut(i).unwrap();
for field_value in field_values {
// tokenize text
let terms = tokenizer(field_value);

Expand Down Expand Up @@ -170,7 +170,7 @@ impl<T: Eq + Hash + Copy + Debug> Index<T> {
let doc_details_option = self.docs.get(&key);
let mut remove_key = false;
if let Some(doc_details) = doc_details_option {
removed.insert((&key).to_owned());
removed.insert(key);
let details = doc_details;
remove_key = true;
let new_len = (self.docs.len() - 1) as f64;
Expand Down Expand Up @@ -486,8 +486,8 @@ mod tests {
text: String,
}

fn field_accessor(doc: &Doc) -> Option<&str> {
Some(doc.text.as_str())
fn field_accessor(doc: &Doc) -> Vec<&str> {
vec![doc.text.as_str()]
}

mod add {
Expand All @@ -496,8 +496,7 @@ mod tests {

#[test]
fn it_should_add_one_document_with_three_terms<'idn>() {
let field_accessors: Vec<FieldAccessor<Doc>> =
vec![field_accessor as fn(doc: &Doc) -> Option<&str>];
let field_accessors: Vec<FieldAccessor<Doc>> = vec![field_accessor];

let mut index = Index::<usize>::new(1);
let doc = Doc {
Expand Down Expand Up @@ -548,8 +547,7 @@ mod tests {

#[test]
fn it_should_add_shared_terms() {
let field_accessors: Vec<FieldAccessor<Doc>> =
vec![field_accessor as fn(doc: &Doc) -> Option<&str>];
let field_accessors: Vec<FieldAccessor<Doc>> = vec![field_accessor];

let mut index = Index::<usize>::new(1);
let doc_1 = Doc {
Expand Down Expand Up @@ -608,8 +606,7 @@ mod tests {

#[test]
fn it_should_ignore_empty_tokens() {
let field_accessors: Vec<FieldAccessor<Doc>> =
vec![field_accessor as fn(doc: &Doc) -> Option<&str>];
let field_accessors: Vec<FieldAccessor<Doc>> = vec![field_accessor];

let mut index = Index::<usize>::new(1);
let doc_1 = Doc {
Expand Down Expand Up @@ -742,7 +739,7 @@ mod tests {
#[test]
fn it_should_count_nodes() {
let field_accessors: Vec<FieldAccessor<Doc>> =
vec![field_accessor as fn(doc: &Doc) -> Option<&str>];
vec![field_accessor as fn(doc: &Doc) -> Vec<&str>];

let mut index = Index::<usize>::new(1);
let doc = Doc {
Expand All @@ -761,8 +758,7 @@ mod tests {

#[test]
fn it_should_count_nodes_2() {
let field_accessors: Vec<FieldAccessor<Doc>> =
vec![field_accessor as fn(doc: &Doc) -> Option<&str>];
let field_accessors: Vec<FieldAccessor<Doc>> = vec![field_accessor];

let mut index = Index::<usize>::new(1);

Expand Down
10 changes: 5 additions & 5 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ pub use index::*;
pub use query::QueryResult;

/// Function that extracts the values of a field from a document; a field may yield zero or more values.
pub type FieldAccessor<D> = fn(&D) -> Option<&str>;
pub type FieldAccessor<D> = fn(&D) -> Vec<&str>;

/// Function used to tokenize a field.
pub type Tokenizer = fn(&str) -> Vec<Cow<'_, str>>;
Expand All @@ -31,12 +31,12 @@ pub mod test_util {
pub text: String,
}

pub fn title_extract(d: &Doc) -> Option<&str> {
Some(d.title.as_str())
pub fn title_extract(d: &Doc) -> Vec<&str> {
vec![d.title.as_str()]
}

pub fn text_extract(d: &Doc) -> Option<&str> {
Some(d.text.as_str())
pub fn text_extract(d: &Doc) -> Vec<&str> {
vec![d.text.as_str()]
}

pub fn tokenizer(s: &str) -> Vec<Cow<str>> {
Expand Down
5 changes: 2 additions & 3 deletions src/score/default/bm25.rs
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,7 @@ impl<T: Debug> ScoreCalculator<T, BM25TermCalculations> for BM25 {
let pre_calculations = &before_output.unwrap(); // it will exist as we need BM25 parameters
let mut score: f64 = 0_f64;
for x in 0..document_details.field_length.len() {
let mut tf = (&document_pointer.term_frequency[x]).to_owned() as f64;
let mut tf = document_pointer.term_frequency[x] as f64;
if tf > 0_f64 {
// calculating BM25 tf
let field_length = &document_details.field_length[x];
Expand All @@ -80,8 +80,7 @@ impl<T: Debug> ScoreCalculator<T, BM25TermCalculations> for BM25 {
tf = ((self.bm25k1 + 1_f64) * tf)
/ (self.bm25k1
* ((1_f64 - self.bm25b)
+ self.bm25b
* (field_length.to_owned() as f64 / avg_field_length as f64))
+ self.bm25b * (field_length.to_owned() as f64 / avg_field_length))
+ tf);
score += tf
* pre_calculations.idf
Expand Down
16 changes: 8 additions & 8 deletions src/score/default/zero_to_one.rs
Original file line number Diff line number Diff line change
Expand Up @@ -315,11 +315,11 @@ mod tests {
title: String,
description: String,
}
fn title_extract(doc: &DocTitleDescription) -> Option<&str> {
Some(doc.title.as_str())
fn title_extract(doc: &DocTitleDescription) -> Vec<&str> {
vec![doc.title.as_str()]
}
fn description_extract(doc: &DocTitleDescription) -> Option<&str> {
Some(doc.description.as_str())
fn description_extract(doc: &DocTitleDescription) -> Vec<&str> {
vec![doc.description.as_str()]
}

for (i, (title, description)) in titles.iter().zip(descriptions.iter()).enumerate() {
Expand Down Expand Up @@ -364,11 +364,11 @@ mod tests {
title: String,
description: String,
}
fn title_extract(doc: &DocTitleDescription) -> Option<&str> {
Some(doc.title.as_str())
fn title_extract(doc: &DocTitleDescription) -> Vec<&str> {
vec![doc.title.as_str()]
}
fn description_extract(doc: &DocTitleDescription) -> Option<&str> {
Some(doc.description.as_str())
fn description_extract(doc: &DocTitleDescription) -> Vec<&str> {
vec![doc.description.as_str()]
}

for (i, (title, description)) in titles.iter().zip(descriptions.iter()).enumerate() {
Expand Down
8 changes: 4 additions & 4 deletions tests/integrations_tests.rs
Original file line number Diff line number Diff line change
Expand Up @@ -16,12 +16,12 @@ fn tokenizer(s: &str) -> Vec<Cow<'_, str>> {
s.split(' ').map(Cow::from).collect::<Vec<_>>()
}

fn title_extract(d: &Doc) -> Option<&str> {
Some(d.title.as_str())
fn title_extract(d: &Doc) -> Vec<&str> {
vec![d.title.as_str()]
}

fn description_extract(d: &Doc) -> Option<&str> {
Some(d.description.as_str())
fn description_extract(d: &Doc) -> Vec<&str> {
vec![d.description.as_str()]
}

#[test]
Expand Down

0 comments on commit 8d215e4

Please sign in to comment.