Merge 81fc550 into 67e52f9
marcus-pousette committed Aug 20, 2022
2 parents 67e52f9 + 81fc550 commit 348ecc7
Showing 10 changed files with 55 additions and 183 deletions.
1 change: 1 addition & 0 deletions Cargo.toml
@@ -15,6 +15,7 @@ exclude = [".github/**", ".gitignore", ".rustfmt.toml"]
 
 [dependencies]
 typed-generational-arena = "0.2"
+hashbrown = "0.12.3"
 
 [dev-dependencies]
 lazy_static = "1.4.0"
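
The manifest change is just the new hashbrown dependency. As a minimal sketch (not part of this commit) of why the import swap in src/index.rs below is drop-in, hashbrown's HashMap and HashSet mirror the std::collections API:

    use hashbrown::{HashMap, HashSet};

    fn main() {
        // Same entry()/contains() API as std::collections, so only the
        // `use` line in src/index.rs needs to change.
        let mut counts: HashMap<&str, usize> = HashMap::new();
        *counts.entry("term").or_insert(0) += 1;

        let mut seen: HashSet<&str> = HashSet::new();
        seen.insert("term");

        assert_eq!(counts["term"], 1);
        assert!(seen.contains("term"));
    }

hashbrown is the SwissTable implementation that std's HashMap is itself built on; depending on it directly gets its faster default (ahash) hasher instead of SipHash.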
5 changes: 0 additions & 5 deletions README.md
@@ -64,11 +64,6 @@ fn description_extract(d: &Doc) -> Option<&str> {
     Some(d.description.as_str())
 }
 
-// A no-op filter
-fn filter(s: &str) -> &str {
-    s
-}
-
 // Create index with 2 fields
 let mut index = Index::<usize>::new(2);
 
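
With the no-op filter gone from the README example, `add_document` takes only the field accessors and the tokenizer. Any per-term cleanup now belongs in the tokenizer itself; a hedged sketch (illustrative, not from this repository) matching the `Tokenizer` alias from src/lib.rs:

    use std::borrow::Cow;

    // Matches `pub type Tokenizer = fn(&str) -> Vec<Cow<'_, str>>` from
    // src/lib.rs: split, normalize, and drop empty tokens in one pass.
    fn tokenizer(s: &str) -> Vec<Cow<'_, str>> {
        s.split(' ')
            .filter(|t| !t.is_empty())
            .map(|t| {
                if t.chars().any(|c| c.is_uppercase()) {
                    Cow::Owned(t.to_lowercase()) // allocate only when needed
                } else {
                    Cow::Borrowed(t)
                }
            })
            .collect()
    }

    fn main() {
        assert_eq!(tokenizer("a  B c"), vec!["a", "b", "c"]);
    }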
6 changes: 1 addition & 5 deletions benches/test_benchmark.rs
@@ -9,10 +9,6 @@ struct DocX {
     title: String,
 }
 
-fn filter(s: &str) -> Cow<'_, str> {
-    Cow::from(s)
-}
-
 fn tokenizer(s: &str) -> Vec<Cow<'_, str>> {
     s.split(' ').map(Cow::from).collect::<Vec<_>>()
 }
@@ -62,6 +58,6 @@ fn add_all_documents(
             id: i,
             title: s.to_owned(),
         };
-        index.add_document(extractor, tokenizer, filter, d.id, &d);
+        index.add_document(extractor, tokenizer, d.id, &d);
     }
 }
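
For reference, a sketch of a complete call site after this change, built only from signatures visible in this diff. The `probly_search` import is a placeholder; this page does not show the crate's actual name or module layout:

    use std::borrow::Cow;
    // Placeholder import: the crate name and module layout are assumed,
    // not shown on this page.
    use probly_search::Index;

    struct DocX {
        id: usize,
        title: String,
    }

    fn title_extract(d: &DocX) -> Option<&str> {
        Some(d.title.as_str())
    }

    fn tokenizer(s: &str) -> Vec<Cow<'_, str>> {
        s.split(' ').map(Cow::from).collect()
    }

    fn main() {
        // One indexed field, as in the benchmark above.
        let mut index = Index::<usize>::new(1);
        let d = DocX {
            id: 0,
            title: "hello world".to_owned(),
        };
        // No `filter` argument anymore; tokenizer output is used as-is.
        index.add_document(&[title_extract], tokenizer, d.id, &d);
    }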
36 changes: 18 additions & 18 deletions src/index.rs
@@ -1,11 +1,12 @@
 use std::{
-    collections::{HashMap, HashSet},
+    borrow::Cow,
     fmt::{Debug, Formatter},
     hash::Hash,
     usize,
 };
 
-use crate::{FieldAccessor, Filter, Tokenizer};
+use crate::{FieldAccessor, Tokenizer};
+use hashbrown::{HashMap, HashSet};
 extern crate typed_generational_arena;
 use typed_generational_arena::StandardArena;
 use typed_generational_arena::StandardIndex as ArenaIndex;
@@ -78,15 +79,15 @@ impl<T: Eq + Hash + Copy + Debug> Index<T> {
         &mut self,
         field_accessors: &[FieldAccessor<D>],
         tokenizer: Tokenizer,
-        filter: Filter,
         key: T,
         doc: &D,
     ) {
         let docs = &mut self.docs;
         let fields = &mut self.fields;
         let mut field_length = vec![0; fields.len()];
-        let mut term_counts: HashMap<String, Vec<usize>> = HashMap::new();
-        let mut all_terms: Vec<String> = Vec::new();
+        let mut term_counts: HashMap<Cow<str>, Vec<usize>> = HashMap::new();
+        let mut all_terms: Vec<Cow<str>> = Vec::new();
+
         for i in 0..fields.len() {
             if let Some(field_value) = field_accessors[i](doc) {
                 let fields_len = fields.len();
@@ -98,14 +99,13 @@ impl<T: Eq + Hash + Copy + Debug> Index<T> {
                 // filter and count terms, ignore empty strings
                 let mut filtered_terms_count = 0;
                 for term in terms {
-                    let filtered = filter(term.as_ref());
-                    let term = filtered.as_ref().to_owned();
                     if !term.is_empty() {
-                        all_terms.push(term.clone());
                         filtered_terms_count += 1;
+                        all_terms.push(term.clone());
                         let counts = term_counts
                             .entry(term)
                             .or_insert_with(|| vec![0; fields_len]);
+
                         counts[i] += 1;
                     }
                 }
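
One subtlety in the reordered loop above: `entry(term)` consumes the `Cow` key, so the push into `all_terms` has to clone the term before the map takes ownership. A standalone sketch of that ordering constraint:

    use std::borrow::Cow;
    use std::collections::HashMap;

    fn main() {
        let text = String::from("a");
        let term: Cow<str> = Cow::Borrowed(&text);
        let mut all_terms: Vec<Cow<str>> = Vec::new();
        let mut term_counts: HashMap<Cow<str>, Vec<usize>> = HashMap::new();

        // `entry(term)` moves the Cow into the map, so the copy kept in
        // `all_terms` must be cloned off first.
        all_terms.push(term.clone());
        let counts = term_counts.entry(term).or_insert_with(|| vec![0; 1]);
        counts[0] += 1;

        assert_eq!(all_terms.len(), 1);
        assert_eq!(term_counts[&Cow::Borrowed("a")], vec![1]);
    }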
@@ -456,7 +456,7 @@ fn create_inverted_index_nodes<T: Clone>(
 mod tests {
     use super::*;
 
-    use crate::test_util::{filter, tokenizer};
+    use crate::test_util::tokenizer;
 
     /// Count the amount of nodes of the index.
     ///
@@ -505,7 +505,7 @@ mod tests {
             text: "a b c".to_string(),
         };
 
-        index.add_document(&field_accessors, tokenizer, filter, doc.id, &doc);
+        index.add_document(&field_accessors, tokenizer, doc.id, &doc);
 
         assert_eq!(index.docs.len(), 1);
         let (_, added_doc) = index.docs.iter().next().unwrap();
@@ -561,9 +561,9 @@ mod tests {
             text: "b c d".to_string(),
         };
 
-        index.add_document(&field_accessors, tokenizer, filter, doc_1.id, &doc_1);
+        index.add_document(&field_accessors, tokenizer, doc_1.id, &doc_1);
 
-        index.add_document(&field_accessors, tokenizer, filter, doc_2.id, &doc_2);
+        index.add_document(&field_accessors, tokenizer, doc_2.id, &doc_2);
 
         assert_eq!(index.docs.len(), 2);
         assert_eq!(
@@ -617,7 +617,7 @@ mod tests {
             text: "a  b".to_string(), // double space could introduce empty tokens
         };
 
-        index.add_document(&field_accessors, tokenizer, filter, doc_1.id, &doc_1);
+        index.add_document(&field_accessors, tokenizer, doc_1.id, &doc_1);
     }
 }
 
@@ -635,7 +635,7 @@ mod tests {
         }];
 
         for doc in docs {
-            index.add_document(&[field_accessor], tokenizer, filter, doc.id, &doc)
+            index.add_document(&[field_accessor], tokenizer, doc.id, &doc)
         }
 
         index.remove_document(1);
@@ -754,8 +754,8 @@ mod tests {
             text: "abe".to_string(),
         };
 
-        index.add_document(&field_accessors, tokenizer, filter, doc.id, &doc);
-        index.add_document(&field_accessors, tokenizer, filter, doc_2.id, &doc_2);
+        index.add_document(&field_accessors, tokenizer, doc.id, &doc);
+        index.add_document(&field_accessors, tokenizer, doc_2.id, &doc_2);
         assert_eq!(count_nodes(&index), 5);
     }
 
@@ -775,8 +775,8 @@ mod tests {
             text: "ab ef".to_string(),
         };
 
-        index.add_document(&field_accessors, tokenizer, filter, doc.id, &doc);
-        index.add_document(&field_accessors, tokenizer, filter, doc_2.id, &doc_2);
+        index.add_document(&field_accessors, tokenizer, doc.id, &doc);
+        index.add_document(&field_accessors, tokenizer, doc_2.id, &doc_2);
         assert_eq!(count_nodes(&index), 7);
     }
 
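Taken together, the changes to add_document mean terms are now counted as `Cow<str>` keys instead of freshly allocated `String`s. A self-contained sketch of that counting scheme, using std's HashMap for a dependency-free example (the commit itself uses hashbrown's, which exposes the same API):

    use std::borrow::Cow;
    use std::collections::HashMap;

    // Per-field term counts keyed by Cow<str>: borrowed from the field
    // text, so no String allocation per term.
    fn count_terms<'a>(fields: &[&'a str]) -> HashMap<Cow<'a, str>, Vec<usize>> {
        let fields_len = fields.len();
        let mut term_counts: HashMap<Cow<'a, str>, Vec<usize>> = HashMap::new();
        for (i, field) in fields.iter().enumerate() {
            for term in field.split(' ') {
                if !term.is_empty() {
                    let counts = term_counts
                        .entry(Cow::Borrowed(term))
                        .or_insert_with(|| vec![0; fields_len]);
                    counts[i] += 1;
                }
            }
        }
        term_counts
    }

    fn main() {
        let counts = count_terms(&["a b a", "b c"]);
        assert_eq!(counts[&Cow::Borrowed("a")], vec![2, 0]);
        assert_eq!(counts[&Cow::Borrowed("b")], vec![1, 1]);
    }
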
19 changes: 3 additions & 16 deletions src/lib.rs
@@ -13,9 +13,6 @@ pub type FieldAccessor<D> = fn(&D) -> Option<&str>;
 /// Function used to tokenize a field.
 pub type Tokenizer = fn(&str) -> Vec<Cow<'_, str>>;
 
-/// Function used to filter fields.
-pub type Filter = fn(&str) -> Cow<'_, str>;
-
 #[cfg(test)]
 pub mod test_util {
 
@@ -42,28 +39,18 @@ pub mod test_util {
         Some(d.text.as_str())
     }
 
-    pub fn tokenizer(s: &str) -> Vec<Cow<'_, str>> {
+    pub fn tokenizer(s: &str) -> Vec<Cow<str>> {
         s.split(' ').map(Cow::from).collect::<Vec<_>>()
     }
 
-    pub fn filter(s: &str) -> Cow<'_, str> {
-        Cow::from(s)
-    }
-
     pub fn test_score<'arena, M, S: ScoreCalculator<usize, M>>(
         idx: &mut Index<usize>,
         score_calculator: &mut S,
         q: &str,
         expected: Vec<QueryResult<usize>>,
     ) {
        let fields_len = idx.fields.len();
-        let mut results = idx.query(
-            q,
-            score_calculator,
-            tokenizer,
-            filter,
-            &vec![1.; fields_len],
-        );
+        let mut results = idx.query(q, score_calculator, tokenizer, &vec![1.; fields_len]);
        results.sort_by(|a, b| {
            let mut sort = b.score.partial_cmp(&a.score).unwrap();
            sort = sort.then_with(|| a.key.partial_cmp(&b.key).unwrap());
@@ -90,7 +77,7 @@ pub mod test_util {
             title: title.to_string(),
             text: String::new(),
         };
-        index.add_document(&[title_extract], tokenizer, filter, doc.id, &doc);
+        index.add_document(&[title_extract], tokenizer, doc.id, &doc);
     }
     index
 }
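
After this commit the public callback surface in src/lib.rs is down to two fn-pointer aliases. A small self-contained sketch of how they compose; the aliases are copied from the diff above, the rest is illustrative:

    use std::borrow::Cow;

    // The two aliases that remain public (the `Filter` alias is gone).
    type FieldAccessor<D> = fn(&D) -> Option<&str>;
    type Tokenizer = fn(&str) -> Vec<Cow<'_, str>>;

    struct Doc {
        text: String,
    }

    fn text_extract(d: &Doc) -> Option<&str> {
        Some(d.text.as_str())
    }

    fn tokenize(s: &str) -> Vec<Cow<'_, str>> {
        s.split(' ').map(Cow::from).collect()
    }

    fn main() {
        let accessor: FieldAccessor<Doc> = text_extract;
        let tokenizer: Tokenizer = tokenize;
        let doc = Doc { text: "a b".to_owned() };
        let terms = tokenizer(accessor(&doc).unwrap());
        assert_eq!(terms, vec!["a", "b"]);
    }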