From 4cc38b2c66a3dfbb08afc374e896d672461deae8 Mon Sep 17 00:00:00 2001 From: Tyson Nottingham Date: Sat, 1 Apr 2023 13:34:03 -0700 Subject: [PATCH 1/7] Buffer internally in `IoRead` --- src/iter.rs | 70 ----------------------------------------------------- src/lib.rs | 2 -- src/read.rs | 58 +++++++++++++++++++++++++++++++++----------- 3 files changed, 44 insertions(+), 86 deletions(-) delete mode 100644 src/iter.rs diff --git a/src/iter.rs b/src/iter.rs deleted file mode 100644 index 9792916dc..000000000 --- a/src/iter.rs +++ /dev/null @@ -1,70 +0,0 @@ -use crate::io; - -pub struct LineColIterator { - iter: I, - - /// Index of the current line. Characters in the first line of the input - /// (before the first newline character) are in line 1. - line: usize, - - /// Index of the current column. The first character in the input and any - /// characters immediately following a newline character are in column 1. - /// The column is 0 immediately after a newline character has been read. - col: usize, - - /// Byte offset of the start of the current line. This is the sum of lengths - /// of all previous lines. Keeping track of things this way allows efficient - /// computation of the current line, column, and byte offset while only - /// updating one of the counters in `next()` in the common case. - start_of_line: usize, -} - -impl LineColIterator -where - I: Iterator>, -{ - pub fn new(iter: I) -> LineColIterator { - LineColIterator { - iter, - line: 1, - col: 0, - start_of_line: 0, - } - } - - pub fn line(&self) -> usize { - self.line - } - - pub fn col(&self) -> usize { - self.col - } - - pub fn byte_offset(&self) -> usize { - self.start_of_line + self.col - } -} - -impl Iterator for LineColIterator -where - I: Iterator>, -{ - type Item = io::Result; - - fn next(&mut self) -> Option> { - match self.iter.next() { - None => None, - Some(Ok(b'\n')) => { - self.start_of_line += self.col + 1; - self.line += 1; - self.col = 0; - Some(Ok(b'\n')) - } - Some(Ok(c)) => { - self.col += 1; - Some(Ok(c)) - } - Some(Err(e)) => Some(Err(e)), - } - } -} diff --git a/src/lib.rs b/src/lib.rs index 95242d40b..b36783dec 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -411,8 +411,6 @@ pub mod value; mod features_check; mod io; -#[cfg(feature = "std")] -mod iter; #[cfg(feature = "float_roundtrip")] mod lexical; mod number; diff --git a/src/read.rs b/src/read.rs index fc3a3ca74..1f5cfd6d9 100644 --- a/src/read.rs +++ b/src/read.rs @@ -7,8 +7,6 @@ use core::str; #[cfg(feature = "std")] use crate::io; -#[cfg(feature = "std")] -use crate::iter::LineColIterator; #[cfg(feature = "raw_value")] use crate::raw::BorrowedRawDeserializer; @@ -114,11 +112,25 @@ pub trait Read<'de>: private::Sealed { fn set_failed(&mut self, failed: &mut bool); } +#[derive(Clone, Copy)] pub struct Position { pub line: usize, pub column: usize, } +impl Position { + pub fn update(&mut self, slice: &[u8]) { + for b in slice { + if *b == b'\n' { + self.line += 1; + self.column = 0; + } else { + self.column += 1; + } + } + } +} + pub enum Reference<'b, 'c, T> where T: ?Sized + 'static, @@ -148,7 +160,9 @@ pub struct IoRead where R: io::Read, { - iter: LineColIterator>, + reader: std::io::BufReader, + pos: Position, + byte_off: usize, /// Temporary storage of peeked byte. ch: Option, #[cfg(feature = "raw_value")] @@ -191,7 +205,9 @@ where /// Create a JSON input source to read from a std::io input stream. pub fn new(reader: R) -> Self { IoRead { - iter: LineColIterator::new(reader.bytes()), + reader: std::io::BufReader::new(reader), + pos: Position { line: 1, column: 0 }, + byte_off: 0, ch: None, #[cfg(feature = "raw_value")] raw_buffer: None, @@ -239,6 +255,24 @@ where } } } + + fn next_byte(&mut self) -> Option> { + use io::Read; + let mut byte = [0]; + + loop { + return match self.reader.read_exact(&mut byte) { + Ok(()) => { + self.pos.update(&byte); + self.byte_off += 1; + Some(Ok(byte[0])) + } + Err(ref e) if e.kind() == io::ErrorKind::UnexpectedEof => None, + Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue, + Err(e) => Some(Err(e)), + }; + } + } } #[cfg(feature = "std")] @@ -258,7 +292,7 @@ where } Ok(Some(ch)) } - None => match self.iter.next() { + None => match self.next_byte() { Some(Err(err)) => Err(Error::io(err)), Some(Ok(ch)) => { #[cfg(feature = "raw_value")] @@ -278,7 +312,7 @@ where fn peek(&mut self) -> Result> { match self.ch { Some(ch) => Ok(Some(ch)), - None => match self.iter.next() { + None => match self.next_byte() { Some(Err(err)) => Err(Error::io(err)), Some(Ok(ch)) => { self.ch = Some(ch); @@ -305,22 +339,18 @@ where } fn position(&self) -> Position { - Position { - line: self.iter.line(), - column: self.iter.col(), - } + self.pos } fn peek_position(&self) -> Position { - // The LineColIterator updates its position during peek() so it has the - // right one here. + // We update the position during peek(), so we have the right one here. self.position() } fn byte_offset(&self) -> usize { match self.ch { - Some(_) => self.iter.byte_offset() - 1, - None => self.iter.byte_offset(), + Some(_) => self.byte_off - 1, + None => self.byte_off, } } From 652c97c8d66c2269037c81e30514310476042464 Mon Sep 17 00:00:00 2001 From: Tyson Nottingham Date: Sat, 1 Apr 2023 15:09:59 -0700 Subject: [PATCH 2/7] Replace `BufReader` with our own buffer in `IoRead` --- src/read.rs | 60 ++++++++++++++++++++++++++++++++++++++++------------- 1 file changed, 46 insertions(+), 14 deletions(-) diff --git a/src/read.rs b/src/read.rs index 1f5cfd6d9..f723cdb5d 100644 --- a/src/read.rs +++ b/src/read.rs @@ -160,11 +160,26 @@ pub struct IoRead where R: io::Read, { - reader: std::io::BufReader, + reader: R, + + /// Internal buffer. + buffer: Box<[u8; 128]>, + + /// Total number of bytes ever loaded into buffer. + buffered_total: usize, + + /// Current number of bytes loaded into buffer. + buffered_current: usize, + + /// Current number of bytes consumed from buffer. + consumed_current: usize, + + /// Position of last byte read (e.g. by `next` or `peek`). pos: Position, - byte_off: usize, + /// Temporary storage of peeked byte. ch: Option, + #[cfg(feature = "raw_value")] raw_buffer: Option>, } @@ -205,9 +220,12 @@ where /// Create a JSON input source to read from a std::io input stream. pub fn new(reader: R) -> Self { IoRead { - reader: std::io::BufReader::new(reader), + reader, + buffer: Box::new([0; 128]), + buffered_total: 0, + buffered_current: 0, + consumed_current: 0, pos: Position { line: 1, column: 0 }, - byte_off: 0, ch: None, #[cfg(feature = "raw_value")] raw_buffer: None, @@ -256,18 +274,30 @@ where } } + #[inline] fn next_byte(&mut self) -> Option> { - use io::Read; - let mut byte = [0]; + if self.consumed_current < self.buffered_current { + let byte = self.buffer[self.consumed_current]; + self.consumed_current += 1; + self.pos.update(&[byte]); + return Some(Ok(byte)); + } + self.next_byte_cold() + } + + #[inline(never)] + fn next_byte_cold(&mut self) -> Option> { loop { - return match self.reader.read_exact(&mut byte) { - Ok(()) => { - self.pos.update(&byte); - self.byte_off += 1; - Some(Ok(byte[0])) + return match self.reader.read(&mut self.buffer[..]) { + Ok(0) => None, + Ok(n) => { + self.buffered_total += n; + self.buffered_current = n; + self.consumed_current = 1; + self.pos.update(&[self.buffer[0]]); + Some(Ok(self.buffer[0])) } - Err(ref e) if e.kind() == io::ErrorKind::UnexpectedEof => None, Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue, Err(e) => Some(Err(e)), }; @@ -348,9 +378,11 @@ where } fn byte_offset(&self) -> usize { + let offset = self.buffered_total - self.buffered_current + self.consumed_current; + match self.ch { - Some(_) => self.byte_off - 1, - None => self.byte_off, + Some(_) => offset - 1, + None => offset, } } From 5b7385f0960e0cbf51c1271662cff26e9deab445 Mon Sep 17 00:00:00 2001 From: Tyson Nottingham Date: Sun, 2 Apr 2023 08:56:09 -0700 Subject: [PATCH 3/7] Avoid updating position on each byte read in `IoRead` --- src/read.rs | 29 ++++++++++++++++++++++------- tests/stream.rs | 31 +++++++++++++++++++++++++++++++ 2 files changed, 53 insertions(+), 7 deletions(-) diff --git a/src/read.rs b/src/read.rs index f723cdb5d..2ef89391f 100644 --- a/src/read.rs +++ b/src/read.rs @@ -165,6 +165,9 @@ where /// Internal buffer. buffer: Box<[u8; 128]>, + /// Position of start of current buffer / end of previous buffer contents. + buffer_position: Position, + /// Total number of bytes ever loaded into buffer. buffered_total: usize, @@ -174,9 +177,6 @@ where /// Current number of bytes consumed from buffer. consumed_current: usize, - /// Position of last byte read (e.g. by `next` or `peek`). - pos: Position, - /// Temporary storage of peeked byte. ch: Option, @@ -222,10 +222,10 @@ where IoRead { reader, buffer: Box::new([0; 128]), + buffer_position: Position { line: 1, column: 0 }, buffered_total: 0, buffered_current: 0, consumed_current: 0, - pos: Position { line: 1, column: 0 }, ch: None, #[cfg(feature = "raw_value")] raw_buffer: None, @@ -279,7 +279,6 @@ where if self.consumed_current < self.buffered_current { let byte = self.buffer[self.consumed_current]; self.consumed_current += 1; - self.pos.update(&[byte]); return Some(Ok(byte)); } @@ -288,14 +287,18 @@ where #[inline(never)] fn next_byte_cold(&mut self) -> Option> { + // This must be done before read call, since it modifies the buffer. + let mut buffer_position = self.buffer_position; + buffer_position.update(&self.buffer[..self.buffered_current]); + loop { return match self.reader.read(&mut self.buffer[..]) { Ok(0) => None, Ok(n) => { + self.buffer_position = buffer_position; self.buffered_total += n; self.buffered_current = n; self.consumed_current = 1; - self.pos.update(&[self.buffer[0]]); Some(Ok(self.buffer[0])) } Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue, @@ -369,7 +372,19 @@ where } fn position(&self) -> Position { - self.pos + let mut position = self.buffer_position; + for ch in &self.buffer[..self.consumed_current] { + match *ch { + b'\n' => { + position.line += 1; + position.column = 0; + } + _ => { + position.column += 1; + } + } + } + position } fn peek_position(&self) -> Position { diff --git a/tests/stream.rs b/tests/stream.rs index ec6b9e3d0..a7df997f8 100644 --- a/tests/stream.rs +++ b/tests/stream.rs @@ -171,6 +171,37 @@ fn test_json_stream_invalid_number() { }); } +#[test] +fn test_json_stream_expected_value() { + let data = r#"{ + "a": 10, + "b": 20, + "c": , + }"#; + + test_stream!(data, Value, |stream| { + let second = stream.next().unwrap().unwrap_err(); + assert_eq!(second.to_string(), "expected value at line 4 column 14"); + }); +} + +#[test] +fn test_json_stream_late_error_position() { + // Test that the position of an error is computed correctly when it's later in the stream. + // This is specifically intended to test the current implementation of the stream-based reader, + // which uses a 128 byte buffer. We want the error to occur after the first 128 bytes. + let data = r#"{ + "a": "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + "b": "bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb", + "c": , + }"#; + + test_stream!(data, Value, |stream| { + let second = stream.next().unwrap().unwrap_err(); + assert_eq!(second.to_string(), "expected value at line 4 column 14"); + }); +} + #[test] fn test_error() { let data = "true wrong false"; From c201ba5eaae88dbabe9573ea84837cb183a809d6 Mon Sep 17 00:00:00 2001 From: Tyson Nottingham Date: Sun, 2 Apr 2023 09:23:14 -0700 Subject: [PATCH 4/7] Have `read::error` take `Position` instead of `Read` --- src/read.rs | 60 ++++++++++++++++++++++++++++++----------------------- 1 file changed, 34 insertions(+), 26 deletions(-) diff --git a/src/read.rs b/src/read.rs index 2ef89391f..905b02c54 100644 --- a/src/read.rs +++ b/src/read.rs @@ -266,7 +266,10 @@ where } _ => { if validate { - return error(self, ErrorCode::ControlCharacterWhileParsingString); + return error( + self.position(), + ErrorCode::ControlCharacterWhileParsingString, + ); } scratch.push(ch); } @@ -428,7 +431,10 @@ where tri!(ignore_escape(self)); } _ => { - return error(self, ErrorCode::ControlCharacterWhileParsingString); + return error( + self.position(), + ErrorCode::ControlCharacterWhileParsingString, + ); } } } @@ -438,7 +444,7 @@ where let mut n = 0; for _ in 0..4 { match decode_hex_val(tri!(next_or_eof(self))) { - None => return error(self, ErrorCode::InvalidEscape), + None => return error(self.position(), ErrorCode::InvalidEscape), Some(val) => { n = (n << 4) + val; } @@ -460,7 +466,7 @@ where let raw = self.raw_buffer.take().unwrap(); let raw = match String::from_utf8(raw) { Ok(raw) => raw, - Err(_) => return error(self, ErrorCode::InvalidUnicodeCodePoint), + Err(_) => return error(self.position(), ErrorCode::InvalidUnicodeCodePoint), }; visitor.visit_map(OwnedRawDeserializer { raw_value: Some(raw), @@ -526,7 +532,7 @@ impl<'a> SliceRead<'a> { self.index += 1; } if self.index == self.slice.len() { - return error(self, ErrorCode::EofWhileParsingString); + return error(self.position(), ErrorCode::EofWhileParsingString); } match self.slice[self.index] { b'"' => { @@ -551,7 +557,10 @@ impl<'a> SliceRead<'a> { _ => { self.index += 1; if validate { - return error(self, ErrorCode::ControlCharacterWhileParsingString); + return error( + self.position(), + ErrorCode::ControlCharacterWhileParsingString, + ); } } } @@ -622,7 +631,7 @@ impl<'a> Read<'a> for SliceRead<'a> { self.index += 1; } if self.index == self.slice.len() { - return error(self, ErrorCode::EofWhileParsingString); + return error(self.position(), ErrorCode::EofWhileParsingString); } match self.slice[self.index] { b'"' => { @@ -634,7 +643,10 @@ impl<'a> Read<'a> for SliceRead<'a> { tri!(ignore_escape(self)); } _ => { - return error(self, ErrorCode::ControlCharacterWhileParsingString); + return error( + self.position(), + ErrorCode::ControlCharacterWhileParsingString, + ); } } } @@ -643,7 +655,7 @@ impl<'a> Read<'a> for SliceRead<'a> { fn decode_hex_escape(&mut self) -> Result { if self.index + 4 > self.slice.len() { self.index = self.slice.len(); - return error(self, ErrorCode::EofWhileParsingString); + return error(self.position(), ErrorCode::EofWhileParsingString); } let mut n = 0; @@ -651,7 +663,7 @@ impl<'a> Read<'a> for SliceRead<'a> { let ch = decode_hex_val(self.slice[self.index]); self.index += 1; match ch { - None => return error(self, ErrorCode::InvalidEscape), + None => return error(self.position(), ErrorCode::InvalidEscape), Some(val) => { n = (n << 4) + val; } @@ -673,7 +685,7 @@ impl<'a> Read<'a> for SliceRead<'a> { let raw = &self.slice[self.raw_buffering_start_index..self.index]; let raw = match str::from_utf8(raw) { Ok(raw) => raw, - Err(_) => return error(self, ErrorCode::InvalidUnicodeCodePoint), + Err(_) => return error(self.position(), ErrorCode::InvalidUnicodeCodePoint), }; visitor.visit_map(BorrowedRawDeserializer { raw_value: Some(raw), @@ -893,7 +905,7 @@ where { match tri!(read.next()) { Some(b) => Ok(b), - None => error(read, ErrorCode::EofWhileParsingString), + None => error(read.position(), ErrorCode::EofWhileParsingString), } } @@ -903,20 +915,16 @@ where { match tri!(read.peek()) { Some(b) => Ok(b), - None => error(read, ErrorCode::EofWhileParsingString), + None => error(read.position(), ErrorCode::EofWhileParsingString), } } -fn error<'de, R, T>(read: &R, reason: ErrorCode) -> Result -where - R: ?Sized + Read<'de>, -{ - let position = read.position(); +fn error<'de, T>(position: Position, reason: ErrorCode) -> Result { Err(Error::syntax(reason, position.line, position.column)) } fn as_str<'de, 's, R: Read<'de>>(read: &R, slice: &'s [u8]) -> Result<&'s str> { - str::from_utf8(slice).or_else(|_| error(read, ErrorCode::InvalidUnicodeCodePoint)) + str::from_utf8(slice).or_else(|_| error(read.position(), ErrorCode::InvalidUnicodeCodePoint)) } /// Parses a JSON escape sequence and appends it into the scratch space. Assumes @@ -949,7 +957,7 @@ fn parse_escape<'de, R: Read<'de>>( let c = match tri!(read.decode_hex_escape()) { n @ 0xDC00..=0xDFFF => { return if validate { - error(read, ErrorCode::LoneLeadingSurrogateInHexEscape) + error(read.position(), ErrorCode::LoneLeadingSurrogateInHexEscape) } else { encode_surrogate(scratch, n); Ok(()) @@ -966,7 +974,7 @@ fn parse_escape<'de, R: Read<'de>>( } else { return if validate { read.discard(); - error(read, ErrorCode::UnexpectedEndOfHexEscape) + error(read.position(), ErrorCode::UnexpectedEndOfHexEscape) } else { encode_surrogate(scratch, n1); Ok(()) @@ -978,7 +986,7 @@ fn parse_escape<'de, R: Read<'de>>( } else { return if validate { read.discard(); - error(read, ErrorCode::UnexpectedEndOfHexEscape) + error(read.position(), ErrorCode::UnexpectedEndOfHexEscape) } else { encode_surrogate(scratch, n1); // The \ prior to this byte started an escape sequence, @@ -993,7 +1001,7 @@ fn parse_escape<'de, R: Read<'de>>( let n2 = tri!(read.decode_hex_escape()); if n2 < 0xDC00 || n2 > 0xDFFF { - return error(read, ErrorCode::LoneLeadingSurrogateInHexEscape); + return error(read.position(), ErrorCode::LoneLeadingSurrogateInHexEscape); } let n = (((n1 - 0xD800) as u32) << 10 | (n2 - 0xDC00) as u32) + 0x1_0000; @@ -1001,7 +1009,7 @@ fn parse_escape<'de, R: Read<'de>>( match char::from_u32(n) { Some(c) => c, None => { - return error(read, ErrorCode::InvalidUnicodeCodePoint); + return error(read.position(), ErrorCode::InvalidUnicodeCodePoint); } } } @@ -1014,7 +1022,7 @@ fn parse_escape<'de, R: Read<'de>>( scratch.extend_from_slice(c.encode_utf8(&mut [0_u8; 4]).as_bytes()); } _ => { - return error(read, ErrorCode::InvalidEscape); + return error(read.position(), ErrorCode::InvalidEscape); } } @@ -1041,7 +1049,7 @@ where tri!(read.decode_hex_escape()); } _ => { - return error(read, ErrorCode::InvalidEscape); + return error(read.position(), ErrorCode::InvalidEscape); } } From a16902cc55988f4455b3a715b79efe2617731184 Mon Sep 17 00:00:00 2001 From: Tyson Nottingham Date: Sun, 2 Apr 2023 12:45:47 -0700 Subject: [PATCH 5/7] Avoid copying into scratch space when possible --- src/read.rs | 118 ++++++++++++++++++++++++++++++++++++++++++++---- tests/stream.rs | 76 +++++++++++++++++++++++++++++++ 2 files changed, 185 insertions(+), 9 deletions(-) diff --git a/src/read.rs b/src/read.rs index 905b02c54..4eaf6bef8 100644 --- a/src/read.rs +++ b/src/read.rs @@ -251,20 +251,59 @@ where T: 's, F: FnOnce(&'s Self, &'s [u8]) -> Result, { - loop { - let ch = tri!(next_or_eof(self)); - if !ESCAPE[ch as usize] { - scratch.push(ch); - continue; + // Fill buffer if necessary. + if self.consumed_current == self.buffered_current { + match self.fill_buffer() { + Some(Ok(())) => {} + Some(Err(e)) => return Err(Error::io(e)), + None => return error(self.position(), ErrorCode::EofWhileParsingString), } - match ch { - b'"' => { + } + + let mut start = self.consumed_current; + + // Find next escape character in buffer if one exists. + let mut escape = match self.buffer[start..self.buffered_current] + .iter() + .position(|&ch| ESCAPE[ch as usize]) + { + Some(idx) => { + // Compensate index for slicing. + let idx = start + idx; + let ch = self.buffer[idx]; + + // We've consumed up through this character. + // Bytes consumed is index of character plus one. + self.consumed_current = idx + 1; + + if ch == b'"' { + // Found end of string before end of buffer or any character that would + // necessitate using the scratch buffer. Can return directly from our buffer. + return result(self, &self.buffer[start..self.consumed_current - 1]); + } + + // Found an escape character. Must copy string into scratch and apply escaping. + scratch.extend_from_slice(&self.buffer[start..self.consumed_current - 1]); + Some(ch) + } + None => { + // Found end of buffer before finding end of string or a character requiring + // escaping. Can't return string from our buffer. Must copy string into scratch. + self.consumed_current = self.buffered_current; + scratch.extend_from_slice(&self.buffer[start..self.consumed_current]); + None + } + }; + + loop { + match escape.take() { + Some(b'"') => { return result(self, scratch); } - b'\\' => { + Some(b'\\') => { tri!(parse_escape(self, validate, scratch)); } - _ => { + Some(ch) => { if validate { return error( self.position(), @@ -273,6 +312,46 @@ where } scratch.push(ch); } + None => { + // Fill buffer if necessary. + if self.consumed_current == self.buffered_current { + match self.fill_buffer() { + Some(Ok(())) => {} + Some(Err(e)) => return Err(Error::io(e)), + None => { + return error(self.position(), ErrorCode::EofWhileParsingString) + } + } + } + + start = self.consumed_current; + + // Find next escape character in buffer if one exists. + escape = match self.buffer[start..self.buffered_current] + .iter() + .position(|&ch| ESCAPE[ch as usize]) + { + Some(idx) => { + // Compensate index for slicing. + let idx = start + idx; + let ch = self.buffer[idx]; + + // We've consumed up through this character. + // Bytes consumed is index of character plus one. + self.consumed_current = idx + 1; + scratch + .extend_from_slice(&self.buffer[start..self.consumed_current - 1]); + Some(ch) + } + None => { + // Found end of buffer before finding end of string or a character + // requiring escaping. + self.consumed_current = self.buffered_current; + scratch.extend_from_slice(&self.buffer[start..self.consumed_current]); + None + } + }; + } } } } @@ -309,6 +388,27 @@ where }; } } + + fn fill_buffer(&mut self) -> Option> { + // This must be done before read call, since it modifies the buffer. + let mut buffer_position = self.buffer_position; + buffer_position.update(&self.buffer[..self.buffered_current]); + + loop { + return match self.reader.read(&mut self.buffer[..]) { + Ok(0) => None, + Ok(n) => { + self.buffer_position = buffer_position; + self.buffered_total += n; + self.buffered_current = n; + self.consumed_current = 0; + Some(Ok(())) + } + Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue, + Err(e) => Some(Err(e)), + }; + } + } } #[cfg(feature = "std")] diff --git a/tests/stream.rs b/tests/stream.rs index a7df997f8..371f4f6f6 100644 --- a/tests/stream.rs +++ b/tests/stream.rs @@ -202,6 +202,82 @@ fn test_json_stream_late_error_position() { }); } +#[test] +fn test_json_stream_long_string() { + // This is specifically intended to test the current implementation of the stream-based reader, + // which uses a 128 byte buffer. We want to test parsing long strings with escapes at various + // points before and after the buffer length. + let data = "{\"x\":\"aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\ + aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\",\"y\":\"b\"}"; + + test_stream!(data, Value, |stream| { + let obj = stream.next().unwrap().unwrap(); + assert_eq!( + obj["x"], + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\ + aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa" + ); + assert_eq!(obj["y"], "b"); + assert!(stream.next().is_none()); + }); + + let data = "{\"x\":\"\\raaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\ + aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\",\"y\":\"b\"}"; + + test_stream!(data, Value, |stream| { + let obj = stream.next().unwrap().unwrap(); + assert_eq!( + obj["x"], + "\raaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\ + aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa" + ); + assert_eq!(obj["y"], "b"); + assert!(stream.next().is_none()); + }); + + let data = "{\"x\":\"aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\\raaaaaa\ + aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\",\"y\":\"b\"}"; + + test_stream!(data, Value, |stream| { + let obj = stream.next().unwrap().unwrap(); + assert_eq!( + obj["x"], + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\raaaaaa\ + aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa" + ); + assert_eq!(obj["y"], "b"); + assert!(stream.next().is_none()); + }); + + let data = "{\"x\":\"aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\ + aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\\raaaaaa\",\"y\":\"b\"}"; + + test_stream!(data, Value, |stream| { + let obj = stream.next().unwrap().unwrap(); + assert_eq!( + obj["x"], + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\ + aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\raaaaaa" + ); + assert_eq!(obj["y"], "b"); + assert!(stream.next().is_none()); + }); + + let data = "{\"x\":\"aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\ + aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\\r\",\"y\":\"b\"}"; + + test_stream!(data, Value, |stream| { + let obj = stream.next().unwrap().unwrap(); + assert_eq!( + obj["x"], + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\ + aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\r" + ); + assert_eq!(obj["y"], "b"); + assert!(stream.next().is_none()); + }); +} + #[test] fn test_error() { let data = "true wrong false"; From 77c1a790bd63e31cf1e6fc771c82914aea2fae6f Mon Sep 17 00:00:00 2001 From: Tyson Nottingham Date: Tue, 4 Apr 2023 20:26:00 -0700 Subject: [PATCH 6/7] Use `Position::update` in more places --- src/read.rs | 24 ++---------------------- 1 file changed, 2 insertions(+), 22 deletions(-) diff --git a/src/read.rs b/src/read.rs index 4eaf6bef8..2555d2cfc 100644 --- a/src/read.rs +++ b/src/read.rs @@ -476,17 +476,7 @@ where fn position(&self) -> Position { let mut position = self.buffer_position; - for ch in &self.buffer[..self.consumed_current] { - match *ch { - b'\n' => { - position.line += 1; - position.column = 0; - } - _ => { - position.column += 1; - } - } - } + position.update(&self.buffer[..self.consumed_current]); position } @@ -597,17 +587,7 @@ impl<'a> SliceRead<'a> { fn position_of_index(&self, i: usize) -> Position { let mut position = Position { line: 1, column: 0 }; - for ch in &self.slice[..i] { - match *ch { - b'\n' => { - position.line += 1; - position.column = 0; - } - _ => { - position.column += 1; - } - } - } + position.update(&self.slice[..i]); position } From 7299551061df340111a32eac40485023f52117e4 Mon Sep 17 00:00:00 2001 From: Tyson Nottingham Date: Tue, 4 Apr 2023 20:30:02 -0700 Subject: [PATCH 7/7] Clarify comment --- src/read.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/read.rs b/src/read.rs index 2555d2cfc..2db8a9c0c 100644 --- a/src/read.rs +++ b/src/read.rs @@ -369,7 +369,7 @@ where #[inline(never)] fn next_byte_cold(&mut self) -> Option> { - // This must be done before read call, since it modifies the buffer. + // This must be done before read call, since read modifies the buffer. let mut buffer_position = self.buffer_position; buffer_position.update(&self.buffer[..self.buffered_current]); @@ -390,7 +390,7 @@ where } fn fill_buffer(&mut self) -> Option> { - // This must be done before read call, since it modifies the buffer. + // This must be done before read call, since read modifies the buffer. let mut buffer_position = self.buffer_position; buffer_position.update(&self.buffer[..self.buffered_current]);