diff --git a/README.md b/README.md
index c395815..8e536f6 100644
--- a/README.md
+++ b/README.md
@@ -55,13 +55,13 @@ fn tokenizer(s: &str) -> Vec<Cow<str>> {
 // We have to provide extraction functions for the fields we want to index
 
 // Title
-fn title_extract(d: &Doc) -> Option<&str> {
-    Some(d.title.as_str())
+fn title_extract(d: &Doc) -> Vec<&str> {
+    vec![d.title.as_str()]
 }
 
 // Description
-fn description_extract(d: &Doc) -> Option<&str> {
-    Some(d.description.as_str())
+fn description_extract(d: &Doc) -> Vec<&str> {
+    vec![d.description.as_str()]
 }
 
 // Create index with 2 fields
diff --git a/benches/test_benchmark.rs b/benches/test_benchmark.rs
index ddf2e4d..9946496 100644
--- a/benches/test_benchmark.rs
+++ b/benches/test_benchmark.rs
@@ -30,8 +30,8 @@ pub fn test_speed(c: &mut Criterion) {
         }
         s
     }
-    fn title_extract_x(d: &DocX) -> Option<&str> {
-        Some(d.title.as_str())
+    fn title_extract_x(d: &DocX) -> Vec<&str> {
+        vec![d.title.as_str()]
     }
 
     c.bench_function("add_100k_docs", |b| {
@@ -43,14 +43,14 @@ pub fn test_speed(c: &mut Criterion) {
             new_rand.push_str(&generate_string(0, 4));
             random_strings.push(new_rand);
         }
-        let extractor = [title_extract_x as fn(&_) -> Option<&str>];
+        let extractor = [title_extract_x as fn(&DocX) -> Vec<&str>];
         b.iter(|| add_all_documents(&mut index, &extractor, &random_strings));
     });
 }
 
 fn add_all_documents(
     index: &mut Index<usize>,
-    extractor: &[fn(&DocX) -> Option<&str>],
+    extractor: &[fn(&DocX) -> Vec<&str>],
     random_strings: &[String],
 ) {
     for (i, s) in random_strings.iter().enumerate() {
diff --git a/src/index.rs b/src/index.rs
index c235097..996c104 100644
--- a/src/index.rs
+++ b/src/index.rs
@@ -89,10 +89,10 @@ impl Index {
         let mut all_terms: Vec<Cow<str>> = Vec::new();
 
         for i in 0..fields.len() {
-            if let Some(field_value) = field_accessors[i](doc) {
-                let fields_len = fields.len();
-                let mut field_details = fields.get_mut(i).unwrap();
-
+            let field_values = field_accessors[i](doc);
+            let fields_len = fields.len();
+            let mut field_details = fields.get_mut(i).unwrap();
+            for field_value in field_values {
                 // tokenize text
                 let terms = tokenizer(field_value);
 
@@ -170,7 +170,7 @@ impl Index {
         let doc_details_option = self.docs.get(&key);
         let mut remove_key = false;
         if let Some(doc_details) = doc_details_option {
-            removed.insert((&key).to_owned());
+            removed.insert(key);
             let details = doc_details;
             remove_key = true;
             let new_len = (self.docs.len() - 1) as f64;
@@ -486,8 +486,8 @@ mod tests {
         text: String,
     }
 
-    fn field_accessor(doc: &Doc) -> Option<&str> {
-        Some(doc.text.as_str())
+    fn field_accessor(doc: &Doc) -> Vec<&str> {
+        vec![doc.text.as_str()]
     }
 
     mod add {
@@ -496,8 +496,7 @@
 
         #[test]
        fn it_should_add_one_document_with_three_terms<'idn>() {
-            let field_accessors: Vec<FieldAccessor<Doc>> =
-                vec![field_accessor as fn(doc: &Doc) -> Option<&str>];
+            let field_accessors: Vec<FieldAccessor<Doc>> = vec![field_accessor];
 
             let mut index = Index::<usize>::new(1);
             let doc = Doc {
@@ -548,8 +547,7 @@
 
         #[test]
         fn it_should_add_shared_terms() {
-            let field_accessors: Vec<FieldAccessor<Doc>> =
-                vec![field_accessor as fn(doc: &Doc) -> Option<&str>];
+            let field_accessors: Vec<FieldAccessor<Doc>> = vec![field_accessor];
 
             let mut index = Index::<usize>::new(1);
             let doc_1 = Doc {
@@ -608,8 +606,7 @@
 
         #[test]
         fn it_should_ignore_empty_tokens() {
-            let field_accessors: Vec<FieldAccessor<Doc>> =
-                vec![field_accessor as fn(doc: &Doc) -> Option<&str>];
+            let field_accessors: Vec<FieldAccessor<Doc>> = vec![field_accessor];
 
             let mut index = Index::<usize>::new(1);
             let doc_1 = Doc {
@@ -742,7 +739,7 @@
         #[test]
         fn it_should_count_nodes() {
             let field_accessors: Vec<FieldAccessor<Doc>> =
-                vec![field_accessor as fn(doc: &Doc) -> Option<&str>];
+                vec![field_accessor as fn(doc: &Doc) -> Vec<&str>];
 
             let mut index = Index::<usize>::new(1);
             let doc = Doc {
@@ -761,8 +758,7 @@
 
         #[test]
         fn it_should_count_nodes_2() {
-            let field_accessors: Vec<FieldAccessor<Doc>> =
-                vec![field_accessor as fn(doc: &Doc) -> Option<&str>];
+            let field_accessors: Vec<FieldAccessor<Doc>> = vec![field_accessor];
 
             let mut index = Index::<usize>::new(1);
 
diff --git a/src/lib.rs b/src/lib.rs
index f0db0d3..14f26ef 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -8,7 +8,7 @@ pub use index::*;
 pub use query::QueryResult;
 
 /// Function that extracts a field value from a document.
-pub type FieldAccessor<D> = fn(&D) -> Option<&str>;
+pub type FieldAccessor<D> = fn(&D) -> Vec<&str>;
 
 /// Function used to tokenize a field.
 pub type Tokenizer = fn(&str) -> Vec<Cow<str>>;
@@ -31,12 +31,12 @@ pub mod test_util {
         pub text: String,
     }
 
-    pub fn title_extract(d: &Doc) -> Option<&str> {
-        Some(d.title.as_str())
+    pub fn title_extract(d: &Doc) -> Vec<&str> {
+        vec![d.title.as_str()]
     }
 
-    pub fn text_extract(d: &Doc) -> Option<&str> {
-        Some(d.text.as_str())
+    pub fn text_extract(d: &Doc) -> Vec<&str> {
+        vec![d.text.as_str()]
     }
 
     pub fn tokenizer(s: &str) -> Vec<Cow<str>> {
diff --git a/src/score/default/bm25.rs b/src/score/default/bm25.rs
index 4882af3..80ed794 100644
--- a/src/score/default/bm25.rs
+++ b/src/score/default/bm25.rs
@@ -71,7 +71,7 @@ impl ScoreCalculator for BM25 {
        let pre_calculations = &before_output.unwrap(); // it will exist as we need BM25 parameters
        let mut score: f64 = 0_f64;
        for x in 0..document_details.field_length.len() {
-            let mut tf = (&document_pointer.term_frequency[x]).to_owned() as f64;
+            let mut tf = document_pointer.term_frequency[x] as f64;
            if tf > 0_f64 {
                // calculating BM25 tf
                let field_length = &document_details.field_length[x];
@@ -80,8 +80,7 @@
 
                tf = ((self.bm25k1 + 1_f64) * tf)
                    / (self.bm25k1
                        * ((1_f64 - self.bm25b)
-                            + self.bm25b
-                                * (field_length.to_owned() as f64 / avg_field_length as f64))
+                            + self.bm25b * (field_length.to_owned() as f64 / avg_field_length))
                        + tf);
                score += tf * pre_calculations.idf
diff --git a/src/score/default/zero_to_one.rs b/src/score/default/zero_to_one.rs
index 4a70020..1234bac 100644
--- a/src/score/default/zero_to_one.rs
+++ b/src/score/default/zero_to_one.rs
@@ -315,11 +315,11 @@ mod tests {
            title: String,
            description: String,
        }
-        fn title_extract(doc: &DocTitleDescription) -> Option<&str> {
-            Some(doc.title.as_str())
+        fn title_extract(doc: &DocTitleDescription) -> Vec<&str> {
+            vec![doc.title.as_str()]
        }
-        fn description_extract(doc: &DocTitleDescription) -> Option<&str> {
-            Some(doc.description.as_str())
+        fn description_extract(doc: &DocTitleDescription) -> Vec<&str> {
+            vec![doc.description.as_str()]
        }
 
        for (i, (title, description)) in titles.iter().zip(descriptions.iter()).enumerate() {
@@ -364,11 +364,11 @@
            title: String,
            description: String,
        }
-        fn title_extract(doc: &DocTitleDescription) -> Option<&str> {
-            Some(doc.title.as_str())
+        fn title_extract(doc: &DocTitleDescription) -> Vec<&str> {
+            vec![doc.title.as_str()]
        }
-        fn description_extract(doc: &DocTitleDescription) -> Option<&str> {
-            Some(doc.description.as_str())
+        fn description_extract(doc: &DocTitleDescription) -> Vec<&str> {
+            vec![doc.description.as_str()]
        }
 
        for (i, (title, description)) in titles.iter().zip(descriptions.iter()).enumerate() {
diff --git a/tests/integrations_tests.rs b/tests/integrations_tests.rs
index 7068fd5..debefd7 100644
--- a/tests/integrations_tests.rs
+++ b/tests/integrations_tests.rs
@@ -16,12 +16,12 @@ fn tokenizer(s: &str) -> Vec<Cow<str>> {
     s.split(' ').map(Cow::from).collect::<Vec<_>>()
 }
 
-fn title_extract(d: &Doc) -> Option<&str> {
-    Some(d.title.as_str())
+fn title_extract(d: &Doc) -> Vec<&str> {
+    vec![d.title.as_str()]
 }
 
-fn description_extract(d: &Doc) -> Option<&str> {
-    Some(d.description.as_str())
+fn description_extract(d: &Doc) -> Vec<&str> {
+    vec![d.description.as_str()]
 }
 
 #[test]
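Taken together, the patch changes the `FieldAccessor` contract from `fn(&D) -> Option<&str>` to `fn(&D) -> Vec<&str>`: a field may now contribute any number of values, each of which is tokenized by the new `for field_value in field_values` loop in `src/index.rs`. A minimal sketch of what this enables, assuming a hypothetical `Doc` with a multi-valued `tags` field (the struct and the inline tokenizer here are illustrative, not part of this patch):

```rust
use std::borrow::Cow;

// Same shape as the alias this patch introduces in src/lib.rs.
type FieldAccessor<D> = fn(&D) -> Vec<&str>;

struct Doc {
    title: String,
    tags: Vec<String>,
}

// Single-valued field: Some(..) becomes a one-element Vec.
fn title_extract(d: &Doc) -> Vec<&str> {
    vec![d.title.as_str()]
}

// Multi-valued field: every tag is exposed, which the old
// Option<&str> signature could not express.
fn tags_extract(d: &Doc) -> Vec<&str> {
    d.tags.iter().map(String::as_str).collect()
}

fn main() {
    let doc = Doc {
        title: "a b c".to_string(),
        tags: vec!["rust".to_string(), "search".to_string()],
    };
    let accessors: Vec<FieldAccessor<Doc>> = vec![title_extract, tags_extract];
    for accessor in &accessors {
        // Mirrors the loop added in the src/index.rs hunk above:
        // each returned value is tokenized separately.
        for field_value in accessor(&doc) {
            let terms: Vec<Cow<str>> = field_value.split(' ').map(Cow::from).collect();
            println!("{:?}", terms);
        }
    }
}
```

Single-valued accessors port mechanically (`Some(x)` becomes `vec![x]`, as every hunk above does), and the test-suite casts simplify because `vec![field_accessor]` now coerces without an explicit `as fn(...)` annotation.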