Skip to content

Commit

Permalink
bump v
Browse files Browse the repository at this point in the history
  • Loading branch information
marcus-pousette committed Aug 7, 2021
2 parents 632ad78 + 6990a3a commit b5c2fda
Show file tree
Hide file tree
Showing 8 changed files with 311 additions and 129 deletions.
4 changes: 3 additions & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -30,4 +30,6 @@ harness = false

[profile.dev]
opt-level = 0
debug = true
debug = true


68 changes: 29 additions & 39 deletions src/index.rs
Original file line number Diff line number Diff line change
Expand Up @@ -274,7 +274,6 @@ fn add_inverted_index_doc<T: Clone>(
doc.next = Some(first);
}
let doc_index = arena_doc.insert(doc);

node_value.first_doc = Some(doc_index);
}

Expand All @@ -295,46 +294,41 @@ pub fn add_document_to_index<T: Eq + Hash + Copy, D>(
) {
let docs = &mut index.docs;
let fields = &mut index.fields;
let mut field_length = Vec::new();
let mut field_length = vec![0; fields.len()];
let mut term_counts: HashMap<String, Vec<usize>> = HashMap::new();
let mut all_terms: Vec<String> = Vec::new();
for i in 0..fields.len() {
match field_accessors[i](&doc) {
None => {
field_length.push(0);
}
Some(field_value) => {
let fields_len = fields.len();
let mut field_details = fields.get_mut(i).unwrap();

// tokenize text
let terms = tokenizer(field_value);

// filter and count terms, ignore empty strings
let mut filtered_terms_count = 0;
for mut term in terms {
term = filter(&term);
if !term.is_empty() {
all_terms.push(term.to_owned());
filtered_terms_count += 1;
let counts = term_counts.get_mut(&term);
match counts {
None => {
let mut new_count = vec![0; fields_len];
new_count[i] += 1;
term_counts.insert(term, new_count);
}
Some(c) => {
c[i] += 1;
}
if let Some(field_value) = field_accessors[i](&doc) {
let fields_len = fields.len();
let mut field_details = fields.get_mut(i).unwrap();

// tokenize text
let terms = tokenizer(field_value);

// filter and count terms, ignore empty strings
let mut filtered_terms_count = 0;
for mut term in terms {
term = filter(&term);
if !term.is_empty() {
all_terms.push(term.to_owned());
filtered_terms_count += 1;
let counts = term_counts.get_mut(&term);
match counts {
None => {
let mut new_count = vec![0; fields_len];
new_count[i] += 1;
term_counts.insert(term, new_count);
}
Some(c) => {
c[i] += 1;
}
}
}

field_details.sum += filtered_terms_count;
field_details.avg = field_details.sum as f64 / (docs.len() as f64 + 1_f64);
field_length.push(filtered_terms_count);
}

field_details.sum += filtered_terms_count;
field_details.avg = field_details.sum as f64 / (docs.len() as f64 + 1_f64);
field_length[i] = filtered_terms_count;
}
}

Expand Down Expand Up @@ -387,10 +381,7 @@ fn create_inverted_index_nodes<T: Clone>(
term: &str,
start: &usize,
) -> ArenaIndex<InvertedIndexNode<T>> {
for (i, char) in term.chars().enumerate() {
if &i < start {
continue;
}
for char in term.chars().skip(start.to_owned()) {
let new_node = arena_index.insert(create_inverted_index_node(&char));
let new_parent = {
add_inverted_index_child_node(parent, new_node, arena_index); // unsafe { .get().as_mut().unwrap() }
Expand All @@ -414,7 +405,6 @@ pub fn remove_document_from_index<T: Hash + Eq + Copy>(
removed: &mut HashSet<T>,
key: T,
) {
//
let fields = &mut index.fields;
let doc_details_option = index.docs.get(&key);
let mut remove_key = false;
Expand Down
4 changes: 3 additions & 1 deletion src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ pub mod test_util {
}
fn tokenizer(s: &str) -> Vec<String> {
s.split(' ')
.map(|slice| slice.to_owned())
.map(|slice| slice.to_owned().to_lowercase())
.collect::<Vec<String>>()
}
fn title_extract(d: &Doc) -> Option<&str> {
Expand Down Expand Up @@ -54,6 +54,8 @@ pub mod test_util {
sort
});

assert_eq!(expected.len(), results.len());

for (index, result) in results.iter().enumerate() {
assert_eq!(expected[index], *result);
assert_eq!(approx_equal(expected[index].score, result.score, 8), true)
Expand Down
23 changes: 12 additions & 11 deletions src/query.rs
Original file line number Diff line number Diff line change
Expand Up @@ -76,9 +76,8 @@ pub fn query<T: Eq + Hash + Clone + Debug, M, S: ScoreCalculator<T, M>>(
) -> Vec<QueryResult<T>> {
let query_terms = tokenizer(query);
let mut scores: HashMap<T, f64> = HashMap::new();
for query_term_pre_filter in &query_terms {
for (query_term_index, query_term_pre_filter) in query_terms.iter().enumerate() {
let query_term = filter(query_term_pre_filter);
print!("{}", query_term);
if !query_term.is_empty() {
let expanded_terms = expand_term(index, &query_term, &index.arena_index);
let mut visited_documents_for_term: HashSet<T> = HashSet::new();
Expand All @@ -92,6 +91,7 @@ pub fn query<T: Eq + Hash + Clone + Debug, M, S: ScoreCalculator<T, M>>(
if let Some(term_node_option_first_doc) = term_node.first_doc {
if document_frequency > 0 {
let term_expansion_data = TermData {
query_term_index,
all_query_terms: &query_terms,
query_term: &query_term,
query_term_expanded: &query_term_expanded,
Expand All @@ -112,6 +112,7 @@ pub fn query<T: Eq + Hash + Clone + Debug, M, S: ScoreCalculator<T, M>>(
pre_calculations.as_ref(),
pointer_borrowed,
index.docs.get(key).unwrap(),
&term_node_index,
&FieldData {
fields_boost,
fields,
Expand All @@ -125,9 +126,9 @@ pub fn query<T: Eq + Hash + Clone + Debug, M, S: ScoreCalculator<T, M>>(
visited_documents_for_term.contains(key),
);
scores.insert(key.to_owned(), new_score);
visited_documents_for_term.insert(key.to_owned());
}
}
visited_documents_for_term.insert(key.to_owned());
pointer = pointer_borrowed.next;
}
}
Expand All @@ -141,11 +142,10 @@ pub fn query<T: Eq + Hash + Clone + Debug, M, S: ScoreCalculator<T, M>>(
for (key, score) in scores {
result.push(QueryResult { key, score });
}
score_calculator.finalize(&mut result);

result.sort_by(|a, b| b.score.partial_cmp(&a.score).unwrap());

score_calculator.finalize(&mut result);

result
}

Expand Down Expand Up @@ -216,19 +216,20 @@ mod tests {
title: String,
text: String,
}
fn tokenizer(s: &str) -> Vec<String> {
s.split(' ')
.map(|slice| slice.to_owned())
.collect::<Vec<String>>()
}

/// Field accessor for the `title` field of a test document.
///
/// Always yields `Some`, borrowing the title text from `d`.
fn title_extract(d: &Doc) -> Option<&str> {
    let title: &str = &d.title;
    Some(title)
}
/// Field accessor for the `text` field of a test document.
///
/// Always yields `Some`, borrowing the body text from `d`.
fn text_extract(d: &Doc) -> Option<&str> {
    let body: &str = &d.text;
    Some(body)
}

fn filter(s: &str) -> String {
/// Splits the input on single space characters, producing one owned
/// `String` per segment (empty segments included, matching `str::split`).
pub fn tokenizer(s: &str) -> Vec<String> {
    let mut tokens: Vec<String> = Vec::new();
    for piece in s.split(' ') {
        tokens.push(String::from(piece));
    }
    tokens
}
/// Identity term filter: returns the term unchanged as an owned `String`.
pub fn filter(s: &str) -> String {
    String::from(s)
}

Expand Down
6 changes: 5 additions & 1 deletion src/query/score/calculator.rs
Original file line number Diff line number Diff line change
@@ -1,11 +1,14 @@
use crate::{
index::{DocumentDetails, DocumentPointer, FieldDetails},
index::{DocumentDetails, DocumentPointer, FieldDetails, InvertedIndexNode},
query::QueryResult,
};
use std::{collections::HashMap, fmt::Debug};
use typed_generational_arena::StandardIndex as ArenaIndex;

pub struct TermData<'a> {
// The current query term
pub query_term_index: usize,

pub query_term: &'a str,

// The current expanded term from the expanded terms generated from the current query term `query_term`
Expand Down Expand Up @@ -58,6 +61,7 @@ pub trait ScoreCalculator<T: Debug, M> {
before_output: Option<&M>,
document_pointer: &DocumentPointer<T>,
document_details: &DocumentDetails<T>,
index_node: &ArenaIndex<InvertedIndexNode<T>>,
field_data: &FieldData,
term_expansion: &TermData,
) -> Option<f64>;
Expand Down
4 changes: 3 additions & 1 deletion src/query/score/default/bm25.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,10 @@
use std::{collections::HashMap, fmt::Debug};

use crate::{
index::{DocumentDetails, DocumentPointer},
index::{DocumentDetails, DocumentPointer, InvertedIndexNode},
query::score::calculator::{FieldData, ScoreCalculator, TermData},
};
use typed_generational_arena::StandardIndex as ArenaIndex;

pub struct BM25 {
/// `bm25k1` BM25 ranking function constant `k1`, controls non-linear term frequency normalization (saturation).
Expand Down Expand Up @@ -62,6 +63,7 @@ impl<T: Debug> ScoreCalculator<T, BM25TermCalculations> for BM25 {
before_output: Option<&BM25TermCalculations>,
document_pointer: &DocumentPointer<T>,
document_details: &DocumentDetails<T>,
_: &ArenaIndex<InvertedIndexNode<T>>,
field_data: &FieldData,
_: &TermData,
) -> Option<f64> {
Expand Down
Loading

0 comments on commit b5c2fda

Please sign in to comment.