Skip to content

Commit

Permalink
Move split_full_path to Schema (#1692)
Browse files Browse the repository at this point in the history
  • Loading branch information
boraarslan committed Nov 29, 2022
1 parent 1119e59 commit 4958243
Show file tree
Hide file tree
Showing 2 changed files with 97 additions and 49 deletions.
50 changes: 1 addition & 49 deletions src/query/query_parser/query_parser.rs
Expand Up @@ -183,7 +183,6 @@ pub struct QueryParser {
conjunction_by_default: bool,
tokenizer_manager: TokenizerManager,
boost: HashMap<Field, Score>,
field_names: HashMap<String, Field>,
}

fn all_negative(ast: &LogicalAst) -> bool {
Expand All @@ -196,31 +195,6 @@ fn all_negative(ast: &LogicalAst) -> bool {
}
}

// Returns the position (in byte offsets) of the unescaped '.' in the `field_path`.
//
// This function operates directly on bytes (as opposed to codepoint), relying
// on a encoding property of utf-8 for its correctness.
fn locate_splitting_dots(field_path: &str) -> Vec<usize> {
let mut splitting_dots_pos = Vec::new();
let mut escape_state = false;
for (pos, b) in field_path.bytes().enumerate() {
if escape_state {
escape_state = false;
continue;
}
match b {
b'\\' => {
escape_state = true;
}
b'.' => {
splitting_dots_pos.push(pos);
}
_ => {}
}
}
splitting_dots_pos
}

impl QueryParser {
/// Creates a `QueryParser`, given
/// * schema - index Schema
Expand All @@ -230,34 +204,19 @@ impl QueryParser {
default_fields: Vec<Field>,
tokenizer_manager: TokenizerManager,
) -> QueryParser {
let field_names = schema
.fields()
.map(|(field, field_entry)| (field_entry.name().to_string(), field))
.collect();
QueryParser {
schema,
default_fields,
tokenizer_manager,
conjunction_by_default: false,
boost: Default::default(),
field_names,
}
}

// Splits a full_path as written in a query, into a field name and a
// json path.
pub(crate) fn split_full_path<'a>(&self, full_path: &'a str) -> Option<(Field, &'a str)> {
if let Some(field) = self.field_names.get(full_path) {
return Some((*field, ""));
}
let mut splitting_period_pos: Vec<usize> = locate_splitting_dots(full_path);
while let Some(pos) = splitting_period_pos.pop() {
let (prefix, suffix) = full_path.split_at(pos);
if let Some(field) = self.field_names.get(prefix) {
return Some((*field, &suffix[1..]));
}
}
None
self.schema.find_field(full_path)
}

/// Creates a `QueryParser`, given
Expand Down Expand Up @@ -1566,13 +1525,6 @@ mod test {
assert_eq!(query_parser.split_full_path("firsty"), None);
}

#[test]
fn test_locate_splitting_dots() {
assert_eq!(&super::locate_splitting_dots("a.b.c"), &[1, 3]);
assert_eq!(&super::locate_splitting_dots(r#"a\.b.c"#), &[4]);
assert_eq!(&super::locate_splitting_dots(r#"a\..b.c"#), &[3, 5]);
}

#[test]
pub fn test_phrase_slop() {
test_parse_query_to_logical_ast_helper(
Expand Down
96 changes: 96 additions & 0 deletions src/schema/schema.rs
Expand Up @@ -252,6 +252,31 @@ impl Eq for InnerSchema {}
#[derive(Clone, Eq, PartialEq, Debug)]
pub struct Schema(Arc<InnerSchema>);

// Returns the position (in byte offsets) of the unescaped '.' in the `field_path`.
//
// This function operates directly on bytes (as opposed to codepoint), relying
// on a encoding property of utf-8 for its correctness.
fn locate_splitting_dots(field_path: &str) -> Vec<usize> {
let mut splitting_dots_pos = Vec::new();
let mut escape_state = false;
for (pos, b) in field_path.bytes().enumerate() {
if escape_state {
escape_state = false;
continue;
}
match b {
b'\\' => {
escape_state = true;
}
b'.' => {
splitting_dots_pos.push(pos);
}
_ => {}
}
}
splitting_dots_pos
}

impl Schema {
/// Return the `FieldEntry` associated with a `Field`.
pub fn get_field_entry(&self, field: Field) -> &FieldEntry {
Expand Down Expand Up @@ -358,6 +383,28 @@ impl Schema {
}
Ok(doc)
}

/// Searches for a full_path in the schema, returning the field name and a JSON path.
///
/// This function works by checking if the field exists for the exact given full_path.
/// If it's not, it splits the full_path at non-escaped '.' chars and tries to match the
/// prefix with the field names, favoring the longest field names.
///
/// This does not check if field is a JSON field. It is possible for this functions to
/// return a non-empty JSON path with a non-JSON field.
pub fn find_field<'a>(&self, full_path: &'a str) -> Option<(Field, &'a str)> {
if let Some(field) = self.0.fields_map.get(full_path) {
return Some((*field, ""));
}
let mut splitting_period_pos: Vec<usize> = locate_splitting_dots(full_path);
while let Some(pos) = splitting_period_pos.pop() {
let (prefix, suffix) = full_path.split_at(pos);
if let Some(field) = self.0.fields_map.get(prefix) {
return Some((*field, &suffix[1..]));
}
}
None
}
}

impl Serialize for Schema {
Expand Down Expand Up @@ -436,6 +483,13 @@ mod tests {
use crate::schema::schema::DocParsingError::InvalidJson;
use crate::schema::*;

#[test]
fn test_locate_splitting_dots() {
assert_eq!(&super::locate_splitting_dots("a.b.c"), &[1, 3]);
assert_eq!(&super::locate_splitting_dots(r#"a\.b.c"#), &[4]);
assert_eq!(&super::locate_splitting_dots(r#"a\..b.c"#), &[3, 5]);
}

#[test]
pub fn is_indexed_test() {
let mut schema_builder = Schema::builder();
Expand Down Expand Up @@ -936,4 +990,46 @@ mod tests {
]"#;
assert_eq!(schema_json, expected);
}

#[test]
fn test_find_field() {
let mut schema_builder = Schema::builder();
schema_builder.add_json_field("foo", STRING);

schema_builder.add_text_field("bar", STRING);
schema_builder.add_text_field("foo.bar", STRING);
schema_builder.add_text_field("foo.bar.baz", STRING);
schema_builder.add_text_field("bar.a.b.c", STRING);
let schema = schema_builder.build();

assert_eq!(
schema.find_field("foo.bar"),
Some((schema.get_field("foo.bar").unwrap(), ""))
);
assert_eq!(
schema.find_field("foo.bar.bar"),
Some((schema.get_field("foo.bar").unwrap(), "bar"))
);
assert_eq!(
schema.find_field("foo.bar.baz"),
Some((schema.get_field("foo.bar.baz").unwrap(), ""))
);
assert_eq!(
schema.find_field("foo.toto"),
Some((schema.get_field("foo").unwrap(), "toto"))
);
assert_eq!(
schema.find_field("foo.bar"),
Some((schema.get_field("foo.bar").unwrap(), ""))
);
assert_eq!(
schema.find_field("bar.toto.titi"),
Some((schema.get_field("bar").unwrap(), "toto.titi"))
);

assert_eq!(schema.find_field("hello"), None);
assert_eq!(schema.find_field(""), None);
assert_eq!(schema.find_field("thiswouldbeareallylongfieldname"), None);
assert_eq!(schema.find_field("baz.bar.foo"), None);
}
}

0 comments on commit 4958243

Please sign in to comment.