Commit 2e35ea6
Use Cow for tokenizer.
tmpfs committed Aug 20, 2022
1 parent 0af49e8 commit 2e35ea6
Showing 5 changed files with 17 additions and 20 deletions.
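
The point of the change: a `Vec<&str>` tokenizer can only ever borrow from its input, while `Vec<Cow<'_, str>>` lets a tokenizer allocate only when a token actually needs transforming. A minimal sketch of what the new signature enables (the lowercasing tokenizer below is illustrative, not code from this commit):

use std::borrow::Cow;

// Hypothetical tokenizer (not part of this commit): borrows tokens
// untouched, but allocates an owned, lowercased copy only when a
// token actually contains uppercase characters.
fn normalizing_tokenizer(s: &str) -> Vec<Cow<'_, str>> {
    s.split(' ')
        .map(|t| {
            if t.chars().any(char::is_uppercase) {
                Cow::Owned(t.to_lowercase())
            } else {
                Cow::Borrowed(t)
            }
        })
        .collect()
}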
5 changes: 3 additions & 2 deletions benches/test_benchmark.rs
@@ -12,8 +12,9 @@ struct DocX {
 fn filter(s: &str) -> Cow<'_, str> {
     Cow::from(s)
 }
-fn tokenizer(s: &str) -> Vec<&str> {
-    s.split(' ').collect::<Vec<_>>()
+
+fn tokenizer(s: &str) -> Vec<Cow<'_, str>> {
+    s.split(' ').map(Cow::from).collect::<Vec<_>>()
 }
 
 pub fn test_speed(c: &mut Criterion) {
17 changes: 6 additions & 11 deletions src/index.rs
@@ -98,13 +98,14 @@ impl<T: Eq + Hash + Copy + Debug> Index<T> {
             // filter and count terms, ignore empty strings
             let mut filtered_terms_count = 0;
             for term in terms {
-                let filtered = filter(term);
+                let filtered = filter(term.as_ref());
                 let term = filtered.as_ref().to_owned();
                 if !term.is_empty() {
                     all_terms.push(term.clone());
                     filtered_terms_count += 1;
-                    let counts = term_counts.entry(term)
-                        .or_insert(vec![0; fields_len]);
+                    let counts = term_counts
+                        .entry(term)
+                        .or_insert_with(|| vec![0; fields_len]);
                     counts[i] += 1;
                 }
             }
@@ -448,7 +449,8 @@ fn create_inverted_index_nodes<T: Clone>(
 #[cfg(test)]
 mod tests {
     use super::*;
-    use std::borrow::Cow;
+
+    use crate::test_util::{filter, tokenizer};
 
     /// Count the amount of nodes of the index.
    ///
@@ -478,13 +480,6 @@ mod tests {
         text: String,
     }
 
-    fn tokenizer(s: &str) -> Vec<&str> {
-        s.split(' ').collect::<Vec<_>>()
-    }
-
-    fn filter(s: &str) -> Cow<'_, str> {
-        Cow::from(s)
-    }
     fn field_accessor(doc: &Doc) -> Option<&str> {
         Some(doc.text.as_str())
     }
6 changes: 3 additions & 3 deletions src/lib.rs
@@ -11,7 +11,7 @@ pub use query::QueryResult;
 pub type FieldAccessor<D> = fn(&D) -> Option<&str>;
 
 /// Function used to tokenize a field.
-pub type Tokenizer = fn(&str) -> Vec<&str>;
+pub type Tokenizer = fn(&str) -> Vec<Cow<'_, str>>;
 
 /// Function used to filter fields.
 pub type Filter = fn(&str) -> Cow<'_, str>;
@@ -42,8 +42,8 @@ pub mod test_util {
         Some(d.text.as_str())
     }
 
-    pub fn tokenizer(s: &str) -> Vec<&str> {
-        s.split(' ').collect::<Vec<_>>()
+    pub fn tokenizer(s: &str) -> Vec<Cow<'_, str>> {
+        s.split(' ').map(Cow::from).collect::<Vec<_>>()
     }
 
     pub fn filter(s: &str) -> Cow<'_, str> {
4 changes: 2 additions & 2 deletions src/score/calculator.rs
@@ -2,7 +2,7 @@ use crate::{
     index::{DocumentDetails, DocumentPointer, FieldDetails, InvertedIndexNode},
     QueryResult,
 };
-use std::{collections::HashMap, fmt::Debug};
+use std::{borrow::Cow, collections::HashMap, fmt::Debug};
 use typed_generational_arena::StandardIndex as ArenaIndex;
 
 pub struct TermData<'a> {
@@ -14,7 +14,7 @@ pub struct TermData<'a> {
     // from the current query term `query_term`
     pub query_term_expanded: &'a str,
     // All available query terms
-    pub all_query_terms: Vec<&'a str>,
+    pub all_query_terms: Vec<Cow<'a, str>>,
 }
 
 pub struct FieldData<'a> {
5 changes: 3 additions & 2 deletions tests/integrations_tests.rs
@@ -12,9 +12,10 @@ struct Doc {
     description: String,
 }
 
-fn tokenizer(s: &str) -> Vec<&str> {
-    s.split(' ').collect::<Vec<_>>()
+fn tokenizer(s: &str) -> Vec<Cow<'_, str>> {
+    s.split(' ').map(Cow::from).collect::<Vec<_>>()
 }
+
 fn title_extract(d: &Doc) -> Option<&str> {
     Some(d.title.as_str())
 }
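
Because `Cow<'_, str>` exposes the underlying `str`, downstream code such as the `all_query_terms` field in `TermData` can still compare tokens against plain string slices. An illustrative sketch (this helper is hypothetical, not part of the commit):

use std::borrow::Cow;

// Hypothetical helper: Cow<str> yields a &str via as_ref(), so
// equality checks work whether a token is borrowed or owned.
fn contains_term(all_query_terms: &[Cow<'_, str>], term: &str) -> bool {
    all_query_terms.iter().any(|t| t.as_ref() == term)
}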
