Skip to content

Commit

Permalink
feat: fixed empty query on atomic roots
Browse files Browse the repository at this point in the history
- Previously only object and array roots were supported.

Ref: #160
  • Loading branch information
V0ldek committed Sep 22, 2023
1 parent 4b9400c commit 81b955e
Show file tree
Hide file tree
Showing 15 changed files with 263 additions and 104 deletions.
1 change: 1 addition & 0 deletions .vscode/settings.json
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@
"memmap",
"memmem",
"Mmap",
"mmaps",
"movemask",
"ndash",
"nondescendant",
Expand Down
18 changes: 17 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,25 @@ All notable changes to this project will be documented in this file.

## [unreleased]

### Performance

- Improved handling of the root-only query `$`. ([#160](https://github.com/V0ldek/rsonpath/issues/160))
- Full nodes result when asking for root: 2 times throughput increase.
- Indices/count result when asking for root: basically unboundedly faster,
no longer looks at the entire document.

### Documentation

- Clarified that the `approximate_spans` guarantees.
- Now documentation mentions that the returned `MatchSpan`s can potentially
have their end indices farther than one would expect the input to logically end,
due to internal padding.

### Bug fixes

- Fixed a bug when head-skipping to a single-byte key would panic.
- Fixed handling of the root-only query `$` on atomic documents. ([#160](https://github.com/V0ldek/rsonpath/issues/160))
- Previously only object and array roots were supported.
- Fixed a bug when head-skipping to a single-byte key would panic. ([#281](https://github.com/V0ldek/rsonpath/issues/281))
- This was detected by fuzzing!
- The queries `$..["{"]` and `$..["["]` would panic
on inputs starting with the bytes `{"` or `["`, respectively.
Expand Down
6 changes: 5 additions & 1 deletion crates/rsonpath-lib/src/engine.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ mod head_skipping;
pub mod main;
mod tail_skipping;
pub use main::MainEngine as RsonpathEngine;
mod empty_query;

use self::error::EngineError;
use crate::{
Expand Down Expand Up @@ -57,7 +58,10 @@ pub trait Engine {
/// Find the approximate spans of matches on the given [`Input`] and write them to the [`Sink`].
///
/// "Approximate" means that the ends of spans are not guaranteed to be located exactly at the end of a match,
/// but may include trailing whitespace. It is guaranteed that:
/// but may include trailing whitespace. **Importantly**, it may be beyond the rigidly-defined length of the
/// input, as the engine is allowed to pad the input with whitespace to help with processing. For the purposes
/// of this API, it is assumed that every character after the logical end of the input is a whitespace character.
/// With that in mind, it is guaranteed that:
/// 1. the span start is exact;
/// 2. the span encompasses the entire matched value;
/// 3. the only characters included after the value are JSON whitespace characters:
Expand Down
118 changes: 118 additions & 0 deletions crates/rsonpath-lib/src/engine/empty_query.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,118 @@
//! Special case handlers for the empty query $.
//!
//! The main engine is built with a path automaton in mind, and for simplicity
//! we assume the root opening was already read. This makes it incompatible with
//! an empty query. Instead of rewriting the engine we provide fast-path implementations
//! here.
use crate::{
engine::{error::EngineError, Input},
is_json_whitespace,
result::{empty::EmptyRecorder, Match, MatchCount, MatchIndex, MatchSpan, Sink},
FallibleIterator, BLOCK_SIZE,
};

/// Count for an empty query – determine if the root exists.
pub(super) fn count<I>(input: &I) -> Result<MatchCount, EngineError>
where
I: Input,
{
// Assuming a correct JSON, there is either one root if any non-whitespace character
// occurs in the document, or the document is empty.
if input.seek_non_whitespace_forward(0)?.is_some() {
Ok(1)
} else {
Ok(0)
}
}

/// Index for an empty query &ndash; determine the first index of the root.
pub(super) fn index<I, S>(input: &I, sink: &mut S) -> Result<(), EngineError>
where
I: Input,
S: Sink<MatchIndex>,
{
// Assuming a correct JSON, the root starts at the first non-whitespace character, if any.
if let Some((first_idx, _)) = input.seek_non_whitespace_forward(0)? {
sink.add_match(first_idx)
.map_err(|err| EngineError::SinkError(Box::new(err)))?;
}

Ok(())
}

/// Approximate span for an empty query &ndash; determine the first index and the length of the root.
pub(super) fn approx_span<I, S>(input: &I, sink: &mut S) -> Result<(), EngineError>
where
I: Input,
S: Sink<MatchSpan>,
{
// The root spans the entire document, by definition, with the exception of whitespace.
// We need to find the start index exactly, and then can return the length of the rest as the approximate
// length of the root.
//
// Some input know their lengths: bytes already in memory, file mmaps, etc.
// A BufferedInput over an arbitrary Read stream cannot know its length, so we actually
// need to iterate until the end and count the bytes.
if let Some((first_idx, _)) = input.seek_non_whitespace_forward(0)? {
let end_idx = match input.len_hint() {
Some(end_idx) => end_idx, // Known length, just take it.
None => {
// Unknown length, iterate and count.
let mut iter = input.iter_blocks::<_, BLOCK_SIZE>(&EmptyRecorder);
let mut end_idx = 0;

while (iter.next()?).is_some() {
end_idx += BLOCK_SIZE;
}

end_idx
}
};

sink.add_match(MatchSpan::from_indices(first_idx, end_idx))
.map_err(|err| EngineError::SinkError(Box::new(err)))?;
}

Ok(())
}

/// Match for an empty query &ndash; copy the entire document, trimming whitespace.
pub(super) fn match_<I, S>(input: &I, sink: &mut S) -> Result<(), EngineError>
where
I: Input,
S: Sink<Match>,
{
// For a full match we need to copy the entire input starting from first non-whitespace,
// and then trim the whitespace from the end. This might be slow if the document is excessively
// padded with whitespace at start and/or end, but that's a pathological case.
let mut iter = input.iter_blocks::<_, BLOCK_SIZE>(&EmptyRecorder);
let mut res: Vec<u8> = vec![];
let mut first_significant_idx = None;

while let Some(block) = iter.next()? {
if first_significant_idx.is_none() {
// Start of the root not found yet, look for it.
first_significant_idx = block.iter().position(|&x| !is_json_whitespace(x));

if let Some(first_idx) = first_significant_idx {
// Start of the root found in this block, copy the relevant part.
res.extend(&block[first_idx..]);
}
} else {
// Start of the root was already found, now we are copying everything.
res.extend(&*block);
}
}

if let Some(start) = first_significant_idx {
// Trim whitespace if we have a result.
while !res.is_empty() && is_json_whitespace(res[res.len() - 1]) {
res.pop();
}

sink.add_match(Match::from_start_and_bytes(start, res))
.map_err(|err| EngineError::SinkError(Box::new(err)))?;
}

Ok(())
}
44 changes: 5 additions & 39 deletions crates/rsonpath-lib/src/engine/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ use crate::{
debug,
depth::Depth,
engine::{
empty_query,
error::EngineError,
head_skipping::{CanHeadSkip, HeadSkip, ResumeState},
tail_skipping::TailSkip,
Expand Down Expand Up @@ -70,8 +71,7 @@ impl Engine for MainEngine<'_> {
let recorder = CountRecorder::new();

if self.automaton.is_empty_query() {
empty_query(input, &recorder, self.simd)?;
return Ok(recorder.into());
return empty_query::count(input);
}

simd_dispatch!(self.simd => |simd| {
Expand All @@ -91,7 +91,7 @@ impl Engine for MainEngine<'_> {
let recorder = IndexRecorder::new(sink);

if self.automaton.is_empty_query() {
return empty_query(input, &recorder, self.simd);
return empty_query::index(input, sink);
}

simd_dispatch!(self.simd => |simd| {
Expand All @@ -111,7 +111,7 @@ impl Engine for MainEngine<'_> {
let recorder = ApproxSpanRecorder::new(sink);

if self.automaton.is_empty_query() {
return empty_query(input, &recorder, self.simd);
return empty_query::approx_span(input, sink);
}

simd_dispatch!(self.simd => |simd| {
Expand All @@ -131,7 +131,7 @@ impl Engine for MainEngine<'_> {
let recorder = NodesRecorder::build_recorder(sink);

if self.automaton.is_empty_query() {
return empty_query(input, &recorder, self.simd);
return empty_query::match_(input, sink);
}

simd_dispatch!(self.simd => |simd| {
Expand All @@ -143,40 +143,6 @@ impl Engine for MainEngine<'_> {
}
}

fn empty_query<'i, I, R>(input: &'i I, recorder: &R, simd: SimdConfiguration) -> Result<(), EngineError>
where
I: Input + 'i,
R: Recorder<I::Block<'i, BLOCK_SIZE>>,
{
let mut iter = input.iter_blocks(recorder);
let mut idx = 0;

loop {
match iter.next()? {
Some(block) => {
let pos = block.iter().position(|&x| x != b' ' && x != b'\n' && x != b'\t' && x != b'\r');
match pos {
Some(in_block_idx) => {
recorder.record_match(idx + in_block_idx, Depth::ONE, MatchedNodeType::Atomic)?;
idx += BLOCK_SIZE;
break;
},
None => idx += BLOCK_SIZE,
}
},
None => return Ok(()),
}
}

while (iter.next()?).is_some() {
idx += BLOCK_SIZE;
}

recorder.record_value_terminator(idx - 1, Depth::ONE)?;

Ok(())
}

macro_rules! Classifier {
() => {
TailSkip<
Expand Down
23 changes: 17 additions & 6 deletions crates/rsonpath-lib/src/input.rs
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,17 @@ pub trait Input: Sized {
where
Self: 'i;

/// Return the length of the entire input, if known.
///
/// This is meant to be used merely as a hint.
/// There are [`Input`] implementations that may not be able to know the entire
/// length a priori, and they should return [`None`].
#[inline(always)]
#[must_use]
fn len_hint(&self) -> Option<usize> {
None
}

/// Iterate over blocks of size `N` of the input.
/// `N` has to be a power of two larger than 1.
#[must_use]
Expand Down Expand Up @@ -293,11 +304,11 @@ mod tests {
use pretty_assertions::assert_eq;

#[test]
fn on_empty_bytes_is_all_zero() {
fn on_empty_bytes_is_all_whitespace() {
let result = in_slice::pad_last_block(&[]);

assert_eq!(result.absolute_start, 0);
assert_eq!(result.bytes, [0; MAX_BLOCK_SIZE]);
assert_eq!(result.bytes, [b' '; MAX_BLOCK_SIZE]);
}

#[test]
Expand All @@ -308,17 +319,17 @@ mod tests {

assert_eq!(result.absolute_start, 0);
assert_eq!(&result.bytes[0..11], bytes);
assert_eq!(&result.bytes[11..], [0; MAX_BLOCK_SIZE - 11]);
assert_eq!(&result.bytes[11..], [b' '; MAX_BLOCK_SIZE - 11]);
}

#[test]
fn on_bytes_equal_to_full_block_gives_all_zero() {
fn on_bytes_equal_to_full_block_gives_all_whitespace() {
let bytes = [42; MAX_BLOCK_SIZE];

let result = in_slice::pad_last_block(&bytes);

assert_eq!(result.absolute_start, MAX_BLOCK_SIZE);
assert_eq!(result.bytes, [0; MAX_BLOCK_SIZE]);
assert_eq!(result.bytes, [b' '; MAX_BLOCK_SIZE]);
}

#[test]
Expand All @@ -330,7 +341,7 @@ mod tests {

assert_eq!(result.absolute_start, 2 * MAX_BLOCK_SIZE);
assert_eq!(result.bytes[0..77], [69; 77]);
assert_eq!(result.bytes[77..], [0; MAX_BLOCK_SIZE - 77]);
assert_eq!(result.bytes[77..], [b' '; MAX_BLOCK_SIZE - 77]);
}
}

Expand Down
5 changes: 5 additions & 0 deletions crates/rsonpath-lib/src/input/borrowed.rs
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,11 @@ impl<'a> Input for BorrowedBytes<'a> {

type Block<'b, const N: usize> = &'b [u8] where Self: 'b;

#[inline(always)]
fn len_hint(&self) -> Option<usize> {
Some((self.bytes.len() / MAX_BLOCK_SIZE + 1) * MAX_BLOCK_SIZE)
}

#[inline(always)]
fn iter_blocks<'b, 'r, R, const N: usize>(&'b self, recorder: &'r R) -> Self::BlockIterator<'b, 'r, N, R>
where
Expand Down
7 changes: 6 additions & 1 deletion crates/rsonpath-lib/src/input/mmap.rs
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
//! by an order of magnitude to execute the query on a memory map than it is to simply read the
//! file into main memory.

use super::{borrowed::BorrowedBytesBlockIterator, error::InputError, in_slice, Input, LastBlock};
use super::{borrowed::BorrowedBytesBlockIterator, error::InputError, in_slice, Input, LastBlock, MAX_BLOCK_SIZE};
use crate::{query::JsonString, result::InputRecorder};
use memmap2::{Mmap, MmapAsRawDesc};

Expand Down Expand Up @@ -55,6 +55,11 @@ impl Input for MmapInput {

type Block<'a, const N: usize> = &'a [u8];

#[inline(always)]
fn len_hint(&self) -> Option<usize> {
Some((self.mmap.len() / MAX_BLOCK_SIZE + 1) * MAX_BLOCK_SIZE)
}

#[inline(always)]
fn iter_blocks<'a, 'r, R, const N: usize>(&'a self, recorder: &'r R) -> Self::BlockIterator<'a, 'r, N, R>
where
Expand Down
5 changes: 5 additions & 0 deletions crates/rsonpath-lib/src/input/owned.rs
Original file line number Diff line number Diff line change
Expand Up @@ -225,6 +225,11 @@ impl Input for OwnedBytes {

type Block<'a, const N: usize> = &'a [u8];

#[inline(always)]
fn len_hint(&self) -> Option<usize> {
Some((self.len / MAX_BLOCK_SIZE + 1) * MAX_BLOCK_SIZE)
}

#[inline(always)]
fn iter_blocks<'a, 'r, R, const N: usize>(&'a self, recorder: &'r R) -> Self::BlockIterator<'a, 'r, N, R>
where
Expand Down
6 changes: 6 additions & 0 deletions crates/rsonpath-lib/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -346,3 +346,9 @@ impl<F: FallibleIterator> Iterator for FallibleIntoIter<F> {
}
}
}

#[inline(always)]
#[must_use]
pub(crate) fn is_json_whitespace(x: u8) -> bool {
x == b' ' || x == b'\t' || x == b'\n' || x == b'\r'
}
Loading

0 comments on commit 81b955e

Please sign in to comment.