add_document cow keys
marcus-pousette committed Aug 20, 2022
1 parent 67e52f9 commit cb63e36
Showing 9 changed files with 88 additions and 138 deletions.
5 changes: 0 additions & 5 deletions README.md
@@ -64,11 +64,6 @@ fn description_extract(d: &Doc) -> Option<&str> {
Some(d.description.as_str())
}

// A no-op filter
fn filter(s: &str) -> &str {
s
}

// Create index with 2 fields
let mut index = Index::<usize>::new(2);

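For orientation, here is a minimal sketch of the README example after this change, with the no-op filter dropped from the add_document call. Only description_extract and Index::<usize>::new(2) appear verbatim above; the crate path, the Doc struct, and title_extract are assumptions based on the rest of this diff (see src/lib.rs below), not part of the commit:

use std::borrow::Cow;
// Crate path assumed; the README's use statements are not shown in this diff.
use probly_search::Index;

// Hypothetical document type; the README's actual struct may differ.
struct Doc {
    id: usize,
    title: String,
    description: String,
}

fn title_extract(d: &Doc) -> Option<&str> {
    Some(d.title.as_str())
}

fn description_extract(d: &Doc) -> Option<&str> {
    Some(d.description.as_str())
}

fn tokenizer(s: &str) -> Vec<Cow<'_, str>> {
    s.split(' ').map(Cow::from).collect()
}

fn main() {
    // Create index with 2 fields
    let mut index = Index::<usize>::new(2);

    let doc = Doc {
        id: 0,
        title: "abc".to_string(),
        description: "dfg".to_string(),
    };

    // The filter argument is gone; normalization now belongs in the tokenizer.
    index.add_document(&[title_extract, description_extract], tokenizer, doc.id, &doc);
}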
6 changes: 1 addition & 5 deletions benches/test_benchmark.rs
@@ -9,10 +9,6 @@ struct DocX {
title: String,
}

fn filter(s: &str) -> Cow<'_, str> {
Cow::from(s)
}

fn tokenizer(s: &str) -> Vec<Cow<'_, str>> {
s.split(' ').map(Cow::from).collect::<Vec<_>>()
}
@@ -62,6 +58,6 @@ fn add_all_documents(
id: i,
title: s.to_owned(),
};
index.add_document(extractor, tokenizer, filter, d.id, &d);
index.add_document(extractor, tokenizer, d.id, &d);
}
}
65 changes: 38 additions & 27 deletions src/index.rs
@@ -1,11 +1,12 @@
use std::{
collections::{HashMap, HashSet},
fmt::{Debug, Formatter},
hash::Hash,
usize,
usize, borrow::Cow

};

use crate::{FieldAccessor, Filter, Tokenizer};
use std::collections::{HashMap, HashSet};
use crate::{FieldAccessor, Tokenizer};
extern crate typed_generational_arena;
use typed_generational_arena::StandardArena;
use typed_generational_arena::StandardIndex as ArenaIndex;
@@ -73,20 +74,21 @@ impl<T: Eq + Hash + Copy + Debug> Index<T> {
self.removed.as_ref()
}


/// Adds a document to the index.
pub fn add_document<D>(
&mut self,
field_accessors: &[FieldAccessor<D>],
tokenizer: Tokenizer,
filter: Filter,
key: T,
doc: &D,
doc: & D,
) {
let docs = &mut self.docs;
let fields = &mut self.fields;
let mut field_length = vec![0; fields.len()];
let mut term_counts: HashMap<String, Vec<usize>> = HashMap::new();
let mut all_terms: Vec<String> = Vec::new();
let mut term_counts: HashMap<Cow<str>, Vec<usize>> = HashMap::new();
let mut all_terms: Vec<Cow<str>> = Vec::new();

for i in 0..fields.len() {
if let Some(field_value) = field_accessors[i](doc) {
let fields_len = fields.len();
@@ -98,16 +100,25 @@ impl<T: Eq + Hash + Copy + Debug> Index<T> {
// filter and count terms, ignore empty strings
let mut filtered_terms_count = 0;
for term in terms {
let filtered = filter(term.as_ref());
let term = filtered.as_ref().to_owned();
if !term.is_empty() {
all_terms.push(term.clone());
if !term.is_empty() {
filtered_terms_count += 1;
let counts = term_counts
.entry(term)
.or_insert_with(|| vec![0; fields_len]);
counts[i] += 1;
}
/* let counts = term_counts
.entry(term.clone())
.or_insert_with(|| vec![0; fields_len]); */

let counts = match term_counts.get_mut(&term)
{
Some(counts) => {
counts},
None => {
term_counts.insert(term.clone(), vec![0; fields_len]);
term_counts.get_mut(&term).unwrap()
}
};
counts[i] += 1;
all_terms.push(term);

}
}

field_details.sum += filtered_terms_count;
@@ -456,7 +467,7 @@ fn create_inverted_index_nodes<T: Clone>(
mod tests {
use super::*;

use crate::test_util::{filter, tokenizer};
use crate::test_util::{ tokenizer};

/// Count the amount of nodes of the index.
///
@@ -505,7 +516,7 @@ mod tests {
text: "a b c".to_string(),
};

index.add_document(&field_accessors, tokenizer, filter, doc.id, &doc);
index.add_document(&field_accessors, tokenizer, doc.id, &doc);

assert_eq!(index.docs.len(), 1);
let (_, added_doc) = index.docs.iter().next().unwrap();
@@ -561,9 +572,9 @@ mod tests {
text: "b c d".to_string(),
};

index.add_document(&field_accessors, tokenizer, filter, doc_1.id, &doc_1);
index.add_document(&field_accessors, tokenizer, doc_1.id, &doc_1);

index.add_document(&field_accessors, tokenizer, filter, doc_2.id, &doc_2);
index.add_document(&field_accessors, tokenizer, doc_2.id, &doc_2);

assert_eq!(index.docs.len(), 2);
assert_eq!(
@@ -587,7 +598,7 @@ mod tests {
assert_eq!(&root.char, &char::from_u32(0).unwrap());
assert_eq!(&root.next.is_none(), &true);
assert_eq!(&root.first_doc.is_none(), &true);

let first_child = index.arena_index.get(root.first_child.unwrap()).unwrap();
assert_eq!(&first_child.char, &char::from_u32(100).unwrap());
assert_eq!(&first_child.first_child.is_none(), &true);
@@ -617,7 +628,7 @@ mod tests {
text: "a b".to_string(), // double space could introduce empty tokens
};

index.add_document(&field_accessors, tokenizer, filter, doc_1.id, &doc_1);
index.add_document(&field_accessors, tokenizer, doc_1.id, &doc_1);
}
}

@@ -635,7 +646,7 @@ mod tests {
}];

for doc in docs {
index.add_document(&[field_accessor], tokenizer, filter, doc.id, &doc)
index.add_document(&[field_accessor], tokenizer, doc.id, &doc)
}

index.remove_document(1);
@@ -754,8 +765,8 @@ mod tests {
text: "abe".to_string(),
};

index.add_document(&field_accessors, tokenizer, filter, doc.id, &doc);
index.add_document(&field_accessors, tokenizer, filter, doc_2.id, &doc_2);
index.add_document(&field_accessors, tokenizer, doc.id, &doc);
index.add_document(&field_accessors, tokenizer, doc_2.id, &doc_2);
assert_eq!(count_nodes(&index), 5); //
}

@@ -775,8 +786,8 @@ mod tests {
text: "ab ef".to_string(),
};

index.add_document(&field_accessors, tokenizer, filter, doc.id, &doc);
index.add_document(&field_accessors, tokenizer, filter, doc_2.id, &doc_2);
index.add_document(&field_accessors, tokenizer, doc.id, &doc);
index.add_document(&field_accessors, tokenizer, doc_2.id, &doc_2);
assert_eq!(count_nodes(&index), 7); //
}

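The heart of this commit is the add_document hunk above: term_counts and all_terms are now keyed by Cow<str> instead of String, and the entry()-based update (left commented out in the diff) is replaced by a match on get_mut. Presumably the reason is that entry() takes ownership of its key, which would force term.clone() on every token, while the match clones the Cow only when a term is seen for the first time. A standalone sketch of the same pattern, with hypothetical helper names, not code from the crate:

use std::borrow::Cow;
use std::collections::HashMap;

// Count per-field occurrences of each term, cloning a term's Cow key
// only the first time that term is inserted into the map.
fn count_terms<'a>(
    terms: &[Cow<'a, str>],
    field: usize,
    fields_len: usize,
) -> HashMap<Cow<'a, str>, Vec<usize>> {
    let mut term_counts: HashMap<Cow<'a, str>, Vec<usize>> = HashMap::new();
    for term in terms {
        if term.is_empty() {
            continue;
        }
        // term_counts.entry(term.clone()) would clone on every iteration;
        // probing with get_mut first defers the clone to the miss path.
        let counts = match term_counts.get_mut(term) {
            Some(counts) => counts,
            None => {
                term_counts.insert(term.clone(), vec![0; fields_len]);
                term_counts.get_mut(term).unwrap()
            }
        };
        counts[field] += 1;
    }
    term_counts
}

// Example: two fields, counting terms seen in field 0.
// let counts = count_terms(&[Cow::from("a"), Cow::from("b"), Cow::from("a")], 0, 2);
// assert_eq!(counts[&Cow::from("a")], vec![2, 0]);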
20 changes: 6 additions & 14 deletions src/lib.rs
@@ -8,13 +8,10 @@ pub use index::*;
pub use query::QueryResult;

/// Function that extracts a field value from a document.
pub type FieldAccessor<D> = fn(&D) -> Option<&str>;
pub type FieldAccessor< D> = fn(& D) -> Option<&str>;

/// Function used to tokenize a field.
pub type Tokenizer = fn(&str) -> Vec<Cow<'_, str>>;

/// Function used to filter fields.
pub type Filter = fn(&str) -> Cow<'_, str>;
pub type Tokenizer = fn( &str) -> Vec<Cow<'_, str>>;

#[cfg(test)]
pub mod test_util {
@@ -34,22 +31,18 @@ pub mod test_util {
pub text: String,
}

pub fn title_extract(d: &Doc) -> Option<&str> {
pub fn title_extract<'a>(d: &'a Doc) -> Option<&'a str> {
Some(d.title.as_str())
}

pub fn text_extract(d: &Doc) -> Option<&str> {
pub fn text_extract<'a>(d: &'a Doc) -> Option<&'a str> {
Some(d.text.as_str())
}

pub fn tokenizer(s: &str) -> Vec<Cow<'_, str>> {
pub fn tokenizer<'a>(s: &'a str) -> Vec<Cow<'a, str>> {
s.split(' ').map(Cow::from).collect::<Vec<_>>()
}

pub fn filter(s: &str) -> Cow<'_, str> {
Cow::from(s)
}

pub fn test_score<'arena, M, S: ScoreCalculator<usize, M>>(
idx: &mut Index<usize>,
score_calculator: &mut S,
@@ -61,7 +54,6 @@ pub mod test_util {
q,
score_calculator,
tokenizer,
filter,
&vec![1.; fields_len],
);
results.sort_by(|a, b| {
@@ -90,7 +82,7 @@ pub mod test_util {
title: title.to_string(),
text: String::new(),
};
index.add_document(&[title_extract], tokenizer, filter, doc.id, &doc);
index.add_document(&[title_extract], tokenizer, doc.id, &doc);
}
index
}
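With the Filter alias removed from src/lib.rs, per-token normalization presumably has to live inside the tokenizer, which the Cow-based Tokenizer signature already allows: borrow tokens that need no change and allocate only for the ones that do. A hypothetical example, not part of this commit:

use std::borrow::Cow;

// A tokenizer that also lowercases, covering what a separate filter used to do.
// Tokens that are already lowercase are returned borrowed; only tokens that
// actually change allocate a new String.
fn lowercasing_tokenizer(s: &str) -> Vec<Cow<'_, str>> {
    s.split(' ')
        .map(|token| {
            if token.chars().any(|c| c.is_uppercase()) {
                Cow::Owned(token.to_lowercase())
            } else {
                Cow::Borrowed(token)
            }
        })
        .collect()
}

Because the signature matches the crate's Tokenizer alias, fn(&str) -> Vec<Cow<'_, str>>, such a function can be passed to add_document or query wherever the plain whitespace tokenizer is used above.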