Commit

Merge 68fd90a into a41b743
tmpfs committed Aug 12, 2022
2 parents (a41b743 + 68fd90a), commit e161b54
Showing 10 changed files with 99 additions and 83 deletions.
Cargo.toml: 2 changes (1 addition, 1 deletion)
@@ -1,7 +1,7 @@
 [package]
 name = "probly-search"
 description = "A lightweight full-text search engine with a fully customizable scoring function"
-version = "1.2.4"
+version = "2.0.0-alpha-1"
 authors = ["marcus-pousette <marcus.pousette@quantleaf.com>"]
 edition = "2018"
 license = "MIT"
README.md: 4 changes (2 additions, 2 deletions)
@@ -95,7 +95,7 @@ add_document_to_index(
     tokenizer,
     filter,
     doc_1.id,
-    doc_1.clone(),
+    &doc_1,
 );
 
 add_document_to_index(
@@ -104,7 +104,7 @@ add_document_to_index(
     tokenizer,
     filter,
     doc_2.id,
-    doc_2,
+    &doc_2,
 );
 
 // Search, expected 2 results
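
Note on the README change above: add_document_to_index now borrows the document, so callers no longer need doc_1.clone() and no longer give up ownership of doc_2. A minimal standalone sketch of that borrow-instead-of-move pattern, using a stand-in index_doc function rather than the probly-search API itself:

#[derive(Debug)]
struct Doc {
    id: usize,
    text: String,
}

// Stand-in for add_document_to_index: takes &Doc, mirroring the new signature.
fn index_doc(doc: &Doc) {
    println!("indexing doc {}: {}", doc.id, doc.text);
}

fn main() {
    let doc_1 = Doc { id: 1, text: "a b c".to_string() };
    index_doc(&doc_1); // borrowed, not moved
    index_doc(&doc_1); // caller still owns the document: no clone() needed
    println!("caller still owns {:?}", doc_1);
}
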
benches/test_benchmark.rs: 12 changes (5 additions, 7 deletions)
@@ -8,13 +8,11 @@ struct DocX {
     title: String,
 }
 
-fn filter(s: &str) -> String {
-    s.to_owned()
+fn filter(s: &str) -> &str {
+    s
 }
-fn tokenizer(s: &str) -> Vec<String> {
-    s.split(' ')
-        .map(|slice| slice.to_owned())
-        .collect::<Vec<String>>()
+fn tokenizer(s: &str) -> Vec<&str> {
+    s.split(' ').collect::<Vec<_>>()
 }
 
 pub fn test_speed(c: &mut Criterion) {
@@ -62,6 +60,6 @@ fn add_all_documents(
            id: i,
            title: s.to_owned(),
        };
-        add_document_to_index(&mut index, extractor, tokenizer, filter, d.id, d);
+        add_document_to_index(&mut index, extractor, tokenizer, filter, d.id, &d);
    }
 }
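
The benchmark's tokenizer and filter illustrate the other recurring change in this commit: both now take and return borrowed &str instead of allocating Strings. A standalone comparison of the two styles (plain std, not tied to the crate):

// Old style: one heap allocation per token.
fn tokenizer_owned(s: &str) -> Vec<String> {
    s.split(' ').map(|slice| slice.to_owned()).collect()
}

// New style: tokens borrow from the input, no per-token allocation.
fn tokenizer_borrowed(s: &str) -> Vec<&str> {
    s.split(' ').collect()
}

fn main() {
    let text = "a b c";
    assert_eq!(tokenizer_owned(text), vec!["a", "b", "c"]);
    assert_eq!(tokenizer_borrowed(text), vec!["a", "b", "c"]);
}
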
src/index.rs: 61 changes (40 additions, 21 deletions)
@@ -81,7 +81,7 @@ Document Details object stores additional information about documents.
 * typeparam `T` Document key.
 */
 
-#[derive(Debug, PartialEq)]
+#[derive(Debug, PartialEq, Eq)]
 pub struct DocumentDetails<T> {
     /**
     Document key. It can be a simple unique ID or a direct reference to original document.
@@ -290,15 +290,15 @@ pub fn add_document_to_index<T: Eq + Hash + Copy, D>(
     tokenizer: Tokenizer,
     filter: Filter,
     key: T,
-    doc: D,
+    doc: &D,
 ) {
     let docs = &mut index.docs;
     let fields = &mut index.fields;
     let mut field_length = vec![0; fields.len()];
     let mut term_counts: HashMap<String, Vec<usize>> = HashMap::new();
     let mut all_terms: Vec<String> = Vec::new();
     for i in 0..fields.len() {
-        if let Some(field_value) = field_accessors[i](&doc) {
+        if let Some(field_value) = field_accessors[i](doc) {
             let fields_len = fields.len();
             let mut field_details = fields.get_mut(i).unwrap();
 
@@ -308,16 +308,16 @@
             // filter and count terms, ignore empty strings
             let mut filtered_terms_count = 0;
             for mut term in terms {
-                term = filter(&term);
+                term = filter(term);
                 if !term.is_empty() {
                     all_terms.push(term.to_owned());
                     filtered_terms_count += 1;
-                    let counts = term_counts.get_mut(&term);
+                    let counts = term_counts.get_mut(term);
                     match counts {
                         None => {
                             let mut new_count = vec![0; fields_len];
                             new_count[i] += 1;
-                            term_counts.insert(term, new_count);
+                            term_counts.insert(term.to_owned(), new_count);
                         }
                         Some(c) => {
                             c[i] += 1;
@@ -565,14 +565,12 @@ mod tests {
         text: String,
     }
 
-    fn tokenizer(s: &str) -> Vec<String> {
-        s.split(' ')
-            .map(|slice| slice.to_owned())
-            .collect::<Vec<String>>()
+    fn tokenizer(s: &str) -> Vec<&str> {
+        s.split(' ').collect::<Vec<_>>()
     }
 
-    fn filter(s: &str) -> String {
-        s.to_owned()
+    fn filter(s: &str) -> &str {
+        s
     }
     fn field_accessor(doc: &Doc) -> Option<&str> {
         Some(doc.text.as_str())
@@ -593,7 +591,14 @@
            text: "a b c".to_string(),
        };
 
-        add_document_to_index(&mut index, &field_accessors, tokenizer, filter, doc.id, doc);
+        add_document_to_index(
+            &mut index,
+            &field_accessors,
+            tokenizer,
+            filter,
+            doc.id,
+            &doc,
+        );
 
        assert_eq!(index.docs.len(), 1);
        let (_, added_doc) = index.docs.iter().next().unwrap();
@@ -655,7 +660,7 @@
            tokenizer,
            filter,
            doc_1.id,
-            doc_1.clone(),
+            &doc_1,
        );
 
        add_document_to_index(
@@ -664,7 +669,7 @@
            tokenizer,
            filter,
            doc_2.id,
-            doc_2.clone(),
+            &doc_2,
        );
 
        assert_eq!(index.docs.len(), 2);
@@ -725,7 +730,7 @@
                tokenizer,
                filter,
                doc_1.id,
-                doc_1,
+                &doc_1,
            );
        }
    }
@@ -751,7 +756,7 @@
            tokenizer,
            filter,
            doc.id,
-            doc,
+            &doc,
        )
    }
 
@@ -871,14 +876,21 @@
            text: "abe".to_string(),
        };
 
-        add_document_to_index(&mut index, &field_accessors, tokenizer, filter, doc.id, doc);
+        add_document_to_index(
+            &mut index,
+            &field_accessors,
+            tokenizer,
+            filter,
+            doc.id,
+            &doc,
+        );
        add_document_to_index(
            &mut index,
            &field_accessors,
            tokenizer,
            filter,
            doc_2.id,
-            doc_2,
+            &doc_2,
        );
        assert_eq!(count_nodes(&index), 5); //
    }
@@ -899,14 +911,21 @@
            text: "ab ef".to_string(),
        };
 
-        add_document_to_index(&mut index, &field_accessors, tokenizer, filter, doc.id, doc);
+        add_document_to_index(
+            &mut index,
+            &field_accessors,
+            tokenizer,
+            filter,
+            doc.id,
+            &doc,
+        );
        add_document_to_index(
            &mut index,
            &field_accessors,
            tokenizer,
            filter,
            doc_2.id,
-            doc_2,
+            &doc_2,
        );
        assert_eq!(count_nodes(&index), 7); //
    }
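
One detail worth calling out in the index changes above: term_counts stays a HashMap<String, Vec<usize>>, but with &str terms the code can probe it with get_mut(term) directly (String implements Borrow<str>) and only pays the to_owned() allocation the first time a distinct term is inserted. A small self-contained sketch of that pattern, mirroring the match used in add_document_to_index:

use std::collections::HashMap;

fn main() {
    let fields_len = 1;
    let mut term_counts: HashMap<String, Vec<usize>> = HashMap::new();
    for term in "a b a".split(' ') {
        // `term` is a &str, yet a HashMap keyed by String can be probed with it,
        // so lookups allocate nothing.
        let counts = term_counts.get_mut(term);
        match counts {
            None => {
                let mut new_count = vec![0; fields_len];
                new_count[0] += 1;
                // Allocate an owned String only the first time a term is seen.
                term_counts.insert(term.to_owned(), new_count);
            }
            Some(c) => {
                c[0] += 1;
            }
        }
    }
    assert_eq!(term_counts["a"], vec![2]);
    assert_eq!(term_counts["b"], vec![1]);
}
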
src/lib.rs: 19 changes (12 additions, 7 deletions)
@@ -19,17 +19,15 @@ pub mod test_util {
        id: usize,
        title: String,
    }
-    fn tokenizer(s: &str) -> Vec<String> {
-        s.split(' ')
-            .map(|slice| slice.to_owned().to_lowercase())
-            .collect::<Vec<String>>()
+    fn tokenizer(s: &str) -> Vec<&str> {
+        s.split(' ').collect::<Vec<_>>()
    }
    fn title_extract(d: &Doc) -> Option<&str> {
        Some(d.title.as_str())
    }
 
-    fn filter(s: &str) -> String {
-        s.to_owned()
+    fn filter(s: &str) -> &str {
+        s
    }
 
    pub fn test_score<'arena, M, S: ScoreCalculator<usize, M>>(
@@ -73,7 +71,14 @@
                id: i,
                title: title.to_string(),
            };
-            add_document_to_index(&mut index, &[title_extract], tokenizer, filter, doc.id, doc);
+            add_document_to_index(
+                &mut index,
+                &[title_extract],
+                tokenizer,
+                filter,
+                doc.id,
+                &doc,
+            );
        }
        index
    }
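
Worth noting: the old test_util tokenizer also lowercased each token, which the borrowed Vec<&str> version above cannot do, since a lowercased token no longer points into the input. If lowercasing were still wanted, one option (an assumption for illustration, not something this commit does) is Cow, allocating only for tokens that actually change:

use std::borrow::Cow;

// Hypothetical tokenizer: borrows unchanged tokens, allocates only when lowercasing.
fn tokenize_lower(s: &str) -> Vec<Cow<'_, str>> {
    s.split(' ')
        .map(|t| {
            if t.chars().any(|c| c.is_uppercase()) {
                Cow::Owned(t.to_lowercase())
            } else {
                Cow::Borrowed(t)
            }
        })
        .collect()
}

fn main() {
    let tokens = tokenize_lower("Hello wOrld abc");
    assert_eq!(tokens, vec!["hello", "world", "abc"]);
}
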
src/query.rs: 35 changes (17 additions, 18 deletions)
@@ -79,7 +79,7 @@ pub fn query<T: Eq + Hash + Clone + Debug, M, S: ScoreCalculator<T, M>>(
    for (query_term_index, query_term_pre_filter) in query_terms.iter().enumerate() {
        let query_term = filter(query_term_pre_filter);
        if !query_term.is_empty() {
-            let expanded_terms = expand_term(index, &query_term, &index.arena_index);
+            let expanded_terms = expand_term(index, query_term, &index.arena_index);
            let mut visited_documents_for_term: HashSet<T> = HashSet::new();
            for query_term_expanded in expanded_terms {
                let term_node_option =
@@ -92,8 +92,8 @@
                    if document_frequency > 0 {
                        let term_expansion_data = TermData {
                            query_term_index,
-                            all_query_terms: &query_terms,
-                            query_term: &query_term,
+                            all_query_terms: query_terms.clone(),
+                            query_term,
                            query_term_expanded: &query_term_expanded,
                        };
                        let pre_calculations = &score_calculator.before_each(
@@ -224,13 +224,12 @@ mod tests {
        Some(d.text.as_str())
    }
 
-    pub fn tokenizer(s: &str) -> Vec<String> {
-        s.split(' ')
-            .map(|slice| slice.to_owned())
-            .collect::<Vec<String>>()
+    pub fn tokenizer(s: &str) -> Vec<&str> {
+        s.split(' ').collect::<Vec<_>>()
    }
-    pub fn filter(s: &str) -> String {
-        s.to_owned()
+
+    pub fn filter(s: &str) -> &str {
+        s
    }
 
    pub mod query {
@@ -258,7 +257,7 @@
                tokenizer,
                filter,
                doc.id,
-                doc,
+                &doc,
            );
        }
        let result = query(
@@ -301,7 +300,7 @@
                tokenizer,
                filter,
                doc.id,
-                doc,
+                &doc,
            );
        }
 
@@ -357,7 +356,7 @@
                tokenizer,
                filter,
                doc.id,
-                doc,
+                &doc,
            );
        }
 
@@ -401,13 +400,13 @@
                tokenizer,
                filter,
                doc.id,
-                doc,
+                &doc,
            );
        }
 
-        fn custom_filter(s: &str) -> String {
+        fn custom_filter(s: &str) -> &str {
            if s == "a" {
-                return "".to_string();
+                return "";
            }
            filter(s)
        }
@@ -446,7 +445,7 @@
                tokenizer,
                filter,
                doc.id,
-                doc,
+                &doc,
            );
        }
 
@@ -505,7 +504,7 @@
                tokenizer,
                filter,
                doc.id,
-                doc,
+                &doc,
            );
        }
        let exp = expand_term(&index, &"a".to_string(), &index.arena_index);
@@ -535,7 +534,7 @@
                tokenizer,
                filter,
                doc.id,
-                doc,
+                &doc,
            );
        }
        let exp = expand_term(&index, &"x".to_string(), &index.arena_index);
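
On the all_query_terms: query_terms.clone() change above: since query_terms is now a Vec<&str>, the clone copies only pointer-sized slice references, not the text they point at. A quick standalone check:

fn main() {
    let text = String::from("abc abd");
    let query_terms: Vec<&str> = text.split(' ').collect();
    let cloned = query_terms.clone();
    // Both vectors point at the same bytes inside `text`; no string data is copied.
    assert_eq!(query_terms[0].as_ptr(), cloned[0].as_ptr());
    assert_eq!(cloned, vec!["abc", "abd"]);
}
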
src/query/score/calculator.rs: 2 changes (1 addition, 1 deletion)
@@ -15,7 +15,7 @@ pub struct TermData<'a> {
    pub query_term_expanded: &'a str,
 
    // All available query terms
-    pub all_query_terms: &'a Vec<String>,
+    pub all_query_terms: Vec<&'a str>,
 }
 
 pub struct FieldData<'a> {
(The remaining 2 changed files are not shown here.)
