
Commit

Merge pull request #17 from tmpfs/query
Query method now takes an immutable reference
marcus-pousette committed Aug 20, 2022
2 parents a2cef1d + 326c5bf commit 67e52f9
Showing 8 changed files with 109 additions and 126 deletions.
7 changes: 2 additions & 5 deletions Cargo.toml
@@ -1,9 +1,9 @@
 [package]
 name = "probly-search"
 description = "A lightweight full-text search engine with a fully customizable scoring function"
-version = "2.0.0-alpha-1"
+version = "2.0.0-alpha-2"
 authors = ["marcus-pousette <marcus.pousette@quantleaf.com>"]
-edition = "2018"
+edition = "2021"
 license = "MIT"
 homepage = "https://github.com/quantleaf/probly-search"
 repository = "https://github.com/quantleaf/probly-search"
@@ -28,9 +28,6 @@ crate-type = ["cdylib", "rlib"]
name = "test_benchmark"
harness = false


[profile.dev]
opt-level = 0
debug = true


7 changes: 2 additions & 5 deletions README.md
@@ -109,7 +109,6 @@ let mut result = index.query(
     tokenizer,
     filter,
     &[1., 1.],
-    None,
 );
 assert_eq!(result.len(), 2);
 assert_eq!(
@@ -128,11 +127,10 @@ assert_eq!(
 );

 // Remove documents from index
-let mut removed_docs = HashSet::new();
-index.remove_document(&mut removed_docs, doc_1.id);
+index.remove_document(doc_1.id);

 // Vacuum to remove completely
-index.vacuum(&mut removed_docs);
+index.vacuum();

 // Search, expect 1 result
 result = index.query(
@@ -141,7 +139,6 @@ result = index.query(
     tokenizer,
     filter,
     &[1., 1.],
-    Some(&removed_docs),
 );
 assert_eq!(result.len(), 1);
 assert_eq!(
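Taken together, the README changes mean callers no longer allocate or thread a HashSet of removed keys through removal and querying. A minimal end-to-end sketch against this commit, assembled from pieces shown in this diff (the Doc struct, field accessor, tokenizer, and filter are lifted from the tests below; the query call is omitted here because its full argument list is not visible in this diff):

use std::borrow::Cow;
use probly_search::Index;

struct Doc {
    id: usize,
    text: String,
}

fn field_accessor(doc: &Doc) -> Option<&str> {
    Some(doc.text.as_str())
}

fn tokenizer(s: &str) -> Vec<Cow<'_, str>> {
    s.split(' ').map(Cow::from).collect::<Vec<_>>()
}

fn filter(s: &str) -> Cow<'_, str> {
    Cow::from(s)
}

fn main() {
    let mut index = Index::<usize>::new(1);
    let doc = Doc { id: 1, text: "a b c".to_string() };
    index.add_document(&[field_accessor], tokenizer, filter, doc.id, &doc);

    // The removal set now lives inside the index:
    index.remove_document(doc.id); // mark as removed, not yet purged
    index.vacuum();                // purge removed documents for good
}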
10 changes: 6 additions & 4 deletions benches/test_benchmark.rs
@@ -1,5 +1,6 @@
 use criterion::{criterion_group, criterion_main, Criterion};
 use probly_search::Index;
+use std::borrow::Cow;

 criterion_group!(benches, test_speed);
 criterion_main!(benches);
@@ -8,11 +9,12 @@ struct DocX {
     title: String,
 }

-fn filter(s: &str) -> &str {
-    s
+fn filter(s: &str) -> Cow<'_, str> {
+    Cow::from(s)
 }
-fn tokenizer(s: &str) -> Vec<&str> {
-    s.split(' ').collect::<Vec<_>>()
+
+fn tokenizer(s: &str) -> Vec<Cow<'_, str>> {
+    s.split(' ').map(Cow::from).collect::<Vec<_>>()
 }

 pub fn test_speed(c: &mut Criterion) {
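The switch from plain &str to Cow<'_, str> in the tokenizer and filter signatures lets a term pass through borrowed (zero-copy) when it is unchanged, while a transforming implementation can hand back an owned String. A hypothetical lowercasing filter, not part of this diff, showing both paths:

use std::borrow::Cow;

// Borrows when the term is already lowercase; allocates a new
// String only when it actually has to rewrite the term.
fn lowercase_filter(s: &str) -> Cow<'_, str> {
    if s.chars().any(char::is_uppercase) {
        Cow::Owned(s.to_lowercase()) // owned: a new String was produced
    } else {
        Cow::Borrowed(s)             // borrowed: zero-copy passthrough
    }
}

fn main() {
    assert!(matches!(lowercase_filter("rust"), Cow::Borrowed(_)));
    assert!(matches!(lowercase_filter("Rust"), Cow::Owned(_)));
}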
106 changes: 59 additions & 47 deletions src/index.rs
@@ -26,27 +26,19 @@ pub struct Index<T> {

     pub(crate) arena_index: StandardArena<InvertedIndexNode<T>>,
     pub(crate) arena_doc: StandardArena<DocumentPointer<T>>,
+
+    /// Documents that have been removed from the index but
+    /// need to be purged.
+    removed: Option<HashSet<T>>,
 }

 impl<T: Eq + Hash + Copy + Debug> Index<T> {
-    /**
-    Creates an Index.
-    * typeparam `T` Document key.
-    * `fieldsNum` Number of fields.
-    * returns `Index`
-    */
+    /// Creates an index.
     pub fn new(fields_num: usize) -> Self {
         Self::new_with_capacity(fields_num, 1000, 10000)
     }

-    /**
-    Creates an Index.
-    * typeparam `T` Document key.
-    * `fieldsNum` Number of fields.
-    * `expected_index_size` Expected node count of index tree.
-    * `expected_documents_count` Expected amount of documents added
-    * returns `Index`
-    */
+    /// Creates an index with the expected capacity.
     pub fn new_with_capacity(
         fields_num: usize,
         expected_index_size: usize,
@@ -63,6 +55,7 @@ impl<T: Eq + Hash + Copy + Debug> Index<T> {
             fields,
             arena_doc,
             arena_index,
+            removed: None,
         }
     }

@@ -74,6 +67,12 @@ impl<T: Eq + Hash + Copy + Debug> Index<T> {
         self.arena_index.get_mut(self.root).unwrap()
     }

+    /// Collection of documents that have been removed from the index
+    /// but not yet purged.
+    pub(crate) fn removed_documents(&self) -> Option<&HashSet<T>> {
+        self.removed.as_ref()
+    }
+
     /// Adds a document to the index.
     pub fn add_document<D>(
         &mut self,
@@ -86,8 +85,8 @@
         let docs = &mut self.docs;
         let fields = &mut self.fields;
         let mut field_length = vec![0; fields.len()];
-        let mut term_counts: HashMap<&str, Vec<usize>> = HashMap::new();
-        let mut all_terms: Vec<&str> = Vec::new();
+        let mut term_counts: HashMap<String, Vec<usize>> = HashMap::new();
+        let mut all_terms: Vec<String> = Vec::new();
         for i in 0..fields.len() {
             if let Some(field_value) = field_accessors[i](doc) {
                 let fields_len = fields.len();
@@ -98,22 +97,16 @@

                 // filter and count terms, ignore empty strings
                 let mut filtered_terms_count = 0;
-                for mut term in terms {
-                    term = filter(term);
+                for term in terms {
+                    let filtered = filter(term.as_ref());
+                    let term = filtered.as_ref().to_owned();
                     if !term.is_empty() {
-                        all_terms.push(term);
+                        all_terms.push(term.clone());
                         filtered_terms_count += 1;
-                        let counts = term_counts.get_mut(term);
-                        match counts {
-                            None => {
-                                let mut new_count = vec![0; fields_len];
-                                new_count[i] += 1;
-                                term_counts.insert(term, new_count);
-                            }
-                            Some(c) => {
-                                c[i] += 1;
-                            }
-                        }
+                        let counts = term_counts
+                            .entry(term)
+                            .or_insert_with(|| vec![0; fields_len]);
+                        counts[i] += 1;
                     }
                 }

@@ -130,7 +123,7 @@ impl<T: Eq + Hash + Copy + Debug> Index<T> {
             let node = self.arena_index.get(node_index).unwrap();
             if node.first_child.is_none() {
                 node_index =
-                    create_inverted_index_nodes(&mut self.arena_index, node_index, term, &i);
+                    create_inverted_index_nodes(&mut self.arena_index, node_index, &term, &i);
                 break;
             }
             let next_node = Index::<T>::find_inverted_index_node_child_nodes_by_char(
@@ -143,7 +136,7 @@
                 node_index = create_inverted_index_nodes(
                     &mut self.arena_index,
                     node_index,
-                    term,
+                    &term,
                     &i,
                 );
                 break;
@@ -158,15 +151,21 @@
                 DocumentPointer {
                     next: None,
                     details_key: key.to_owned(),
-                    term_frequency: term_counts[term].to_owned(),
+                    term_frequency: term_counts[&term].to_owned(),
                 },
                 &mut self.arena_doc,
             )
         }
     }

     /// Remove document from the index.
-    pub fn remove_document(&mut self, removed: &mut HashSet<T>, key: T) {
+    pub fn remove_document(&mut self, key: T) {
+        if self.removed.is_none() {
+            self.removed = Some(Default::default());
+        }
+        let removed = self.removed.as_mut().unwrap();
+
+        //let mut removed = HashSet::new();
         let fields = &mut self.fields;
         let doc_details_option = self.docs.get(&key);
         let mut remove_key = false;
@@ -193,9 +192,11 @@
     }

     /// Cleans up removed documents from the index.
-    pub fn vacuum(&mut self, removed: &mut HashSet<T>) {
-        self.vacuum_node(self.root, removed);
+    pub fn vacuum(&mut self) {
+        let mut removed = self.removed.take().unwrap_or_default();
+        self.vacuum_node(self.root, &removed);
         removed.clear();
+        self.removed = None;
     }

     /// Recursively cleans up removed documents from the index.
@@ -278,6 +279,24 @@
         document_frequency
     }

+    /// Count the document frequency.
+    pub(crate) fn count_documents(&self, node_index: ArenaIndex<InvertedIndexNode<T>>) -> usize {
+        let node = self.arena_index.get(node_index).unwrap();
+        let mut pointer_option = node.first_doc;
+        let mut document_frequency = 0;
+        while let Some(pointer) = pointer_option {
+            let is_removed = match &self.removed {
+                Some(set) => set.contains(&self.arena_doc.get(pointer).unwrap().details_key),
+                None => false,
+            };
+            if !is_removed {
+                document_frequency += 1;
+            }
+            pointer_option = self.arena_doc.get(pointer).unwrap().next;
+        }
+        document_frequency
+    }
+
     /// Finds inverted index node that matches the `term`.
     pub(crate) fn find_inverted_index_node(
         node: ArenaIndex<InvertedIndexNode<T>>,
@@ -435,9 +454,10 @@ fn create_inverted_index_nodes<T: Clone>(

 #[cfg(test)]
 mod tests {
-
     use super::*;

+    use crate::test_util::{filter, tokenizer};
+
     /// Count the amount of nodes of the index.
     ///
     /// Returns the amount, including root node
@@ -466,13 +486,6 @@ mod tests {
         text: String,
     }

-    fn tokenizer(s: &str) -> Vec<&str> {
-        s.split(' ').collect::<Vec<_>>()
-    }
-
-    fn filter(s: &str) -> &str {
-        s
-    }
     fn field_accessor(doc: &Doc) -> Option<&str> {
         Some(doc.text.as_str())
     }
@@ -616,7 +629,6 @@
         let mut index = Index::<usize>::new(1);
         assert_eq!(index.arena_doc.is_empty(), true);

-        let mut removed = HashSet::new();
         let docs = vec![Doc {
             id: 1,
             text: "a".to_string(),
@@ -626,8 +638,8 @@
             index.add_document(&[field_accessor], tokenizer, filter, doc.id, &doc)
         }

-        index.remove_document(&mut removed, 1);
-        index.vacuum(&mut removed);
+        index.remove_document(1);
+        index.vacuum();

         assert_eq!(index.docs.len(), 0);
         assert_eq!(index.fields.len(), 1);
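Inside add_document, the per-term counting now uses the HashMap entry API in place of the old get_mut/match/insert dance, and the map is keyed by owned Strings rather than borrowed &strs. A standalone sketch of the same idiom (the two-field setup and the terms are made up for illustration):

use std::collections::HashMap;

fn main() {
    let fields_len = 2;  // illustrative: two indexed fields
    let field_index = 0; // counting occurrences in field 0

    let mut term_counts: HashMap<String, Vec<usize>> = HashMap::new();
    for term in ["a", "b", "a"] {
        // Single lookup: insert a zeroed per-field vector on first
        // sight, then bump the current field's count either way.
        let counts = term_counts
            .entry(term.to_owned())
            .or_insert_with(|| vec![0; fields_len]);
        counts[field_index] += 1;
    }

    assert_eq!(term_counts["a"], vec![2, 0]);
    assert_eq!(term_counts["b"], vec![1, 0]);
}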
16 changes: 9 additions & 7 deletions src/lib.rs
@@ -1,3 +1,5 @@
+use std::borrow::Cow;
+
 mod index;
 mod query;
 pub mod score;
@@ -9,15 +11,16 @@ pub use query::QueryResult;
 pub type FieldAccessor<D> = fn(&D) -> Option<&str>;

 /// Function used to tokenize a field.
-pub type Tokenizer = fn(&str) -> Vec<&str>;
+pub type Tokenizer = fn(&str) -> Vec<Cow<'_, str>>;

 /// Function used to filter fields.
-pub type Filter = fn(&str) -> &str;
+pub type Filter = fn(&str) -> Cow<'_, str>;

 #[cfg(test)]
 pub mod test_util {

     use crate::{score::ScoreCalculator, Index, QueryResult};
+    use std::borrow::Cow;

     fn approx_equal(a: f64, b: f64, dp: u8) -> bool {
         let p: f64 = 10f64.powf(-(dp as f64));
@@ -39,12 +42,12 @@ pub mod test_util {
         Some(d.text.as_str())
     }

-    pub fn tokenizer(s: &str) -> Vec<&str> {
-        s.split(' ').collect::<Vec<_>>()
+    pub fn tokenizer(s: &str) -> Vec<Cow<'_, str>> {
+        s.split(' ').map(Cow::from).collect::<Vec<_>>()
     }

-    pub fn filter(s: &str) -> &str {
-        s
+    pub fn filter(s: &str) -> Cow<'_, str> {
+        Cow::from(s)
     }

     pub fn test_score<'arena, M, S: ScoreCalculator<usize, M>>(
pub fn test_score<'arena, M, S: ScoreCalculator<usize, M>>(
Expand All @@ -60,7 +63,6 @@ pub mod test_util {
tokenizer,
filter,
&vec![1.; fields_len],
None,
);
results.sort_by(|a, b| {
let mut sort = b.score.partial_cmp(&a.score).unwrap();
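Since add_document discards terms the filter maps to an empty string (the "filter and count terms, ignore empty strings" branch in src/index.rs above), a Filter matching the new alias can double as a stop-word remover. A hypothetical example, not part of this commit:

use std::borrow::Cow;

// Hypothetical stop-word filter: returning an empty Cow makes
// add_document skip the term entirely.
fn stop_word_filter(s: &str) -> Cow<'_, str> {
    const STOP_WORDS: [&str; 3] = ["a", "an", "the"];
    if STOP_WORDS.contains(&s) {
        Cow::from("") // filtered out: empty terms are ignored by the index
    } else {
        Cow::from(s)
    }
}

fn main() {
    assert_eq!(stop_word_filter("the"), "");
    assert_eq!(stop_word_filter("search"), "search");
}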
(3 more changed files not shown.)
