Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
107 changes: 81 additions & 26 deletions src/de/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -175,10 +175,37 @@ impl<'a> Deserializer<'a> {
loop {
match self.peek() {
Some(b'"') => {
let end = self.index;
self.eat_char();
return str::from_utf8(&self.slice[start..end])
.map_err(|_| Error::InvalidUnicodeCodePoint);
// Counts the number of backslashes in front of the current index.
//
// "some string with \\\" included."
// ^^^^^
// |||||
// loop run: 4321|
// |
// `index`
//
// Since we only get in this code branch if we found a " starting the string and `index` is greater
// than the start position, we know the loop will end no later than this point.
// Returns how many consecutive `\` bytes sit immediately before `index` in the input slice.
//
// The already-scanned prefix `slice[..index]` is walked backwards and the walk stops at the
// first byte that is not a backslash. Using an iterator instead of `index - count - 1`
// arithmetic means the count can never underflow or index out of bounds, even if the
// prefix were to consist of backslashes only.
let leading_backslashes = |index: usize| -> usize {
    self.slice[..index]
        .iter()
        .rev()
        .take_while(|&&byte| byte == b'\\')
        .count()
};

let is_escaped = leading_backslashes(self.index) % 2 == 1;
if is_escaped {
self.eat_char(); // just continue
} else {
let end = self.index;
self.eat_char();
return str::from_utf8(&self.slice[start..end])
.map_err(|_| Error::InvalidUnicodeCodePoint);
}
}
Some(_) => self.eat_char(),
None => return Err(Error::EofWhileParsingString),
Expand Down Expand Up @@ -745,6 +772,34 @@ mod tests {
#[test]
fn str() {
    // Each entry pairs a JSON input document with the string slice `from_str` must return.
    let cases: &[(&str, &str)] = &[
        // plain strings, including empty, whitespace-only and multi-byte content
        (r#" "hello" "#, "hello"),
        (r#" "" "#, ""),
        (r#" " " "#, " "),
        (r#" "👏" "#, "👏"),
        // no unescaping is done (documented as a known issue in lib.rs),
        // so escape sequences come back verbatim
        (r#" "hel\tlo" "#, "hel\\tlo"),
        (r#" "hello \\" "#, "hello \\\\"),
        // an escaped " inside the string content must not terminate the string
        (r#" "foo\"bar" "#, r#"foo\"bar"#),
        (r#" "foo\\\"bar" "#, r#"foo\\\"bar"#),
        (r#" "foo\"\"bar" "#, r#"foo\"\"bar"#),
        (r#" "\"bar" "#, r#"\"bar"#),
        (r#" "foo\"" "#, r#"foo\""#),
        (r#" "\"" "#, r#"\""#),
        // a non-escaped " preceded by an even number of backslashes does terminate the string
        (r#" "foo bar\\" "#, r#"foo bar\\"#),
        (r#" "foo bar\\\\" "#, r#"foo bar\\\\"#),
        (r#" "foo bar\\\\\\" "#, r#"foo bar\\\\\\"#),
        (r#" "foo bar\\\\\\\\" "#, r#"foo bar\\\\\\\\"#),
        (r#" "\\" "#, r#"\\"#),
    ];

    for &(json, expected) in cases {
        assert_eq!(crate::from_str(json), Ok(expected));
    }
}

#[test]
Expand Down Expand Up @@ -1029,28 +1084,28 @@ mod tests {
assert_eq!(
crate::from_str::<Thing<'_>>(
r#"
{
"type": "thing",
"properties": {
"temperature": {
"type": "number",
"unit": "celsius",
"description": "An ambient temperature sensor",
"href": "/properties/temperature"
},
"humidity": {
"type": "number",
"unit": "percent",
"href": "/properties/humidity"
},
"led": {
"type": "boolean",
"description": "A red LED",
"href": "/properties/led"
}
}
}
"#
{
"type": "thing",
"properties": {
"temperature": {
"type": "number",
"unit": "celsius",
"description": "An ambient temperature sensor",
"href": "/properties/temperature"
},
"humidity": {
"type": "number",
"unit": "percent",
"href": "/properties/humidity"
},
"led": {
"type": "boolean",
"description": "A red LED",
"href": "/properties/led"
}
}
}
"#
),
Ok(Thing {
properties: Properties {
Expand Down
102 changes: 101 additions & 1 deletion src/ser/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -134,6 +134,20 @@ macro_rules! serialize_fmt {
}};
}

/// Converts a nibble (a value in `0..16`) into its upper-case hexadecimal digit,
/// returned as an ASCII byte.
///
/// Values `0..=9` map to `b'0'..=b'9'` and values `10..=15` map to `b'A'..=b'F'`.
fn hex_4bit(c: u8) -> u8 {
    match c {
        // decimal digits: offset from ASCII '0'
        0..=9 => b'0' + c,
        // letters: offset from ASCII 'A', with 10 mapping to 'A'
        _ => b'A' + (c - 10),
    }
}

/// Splits a byte into its two upper-case hexadecimal digits, each encoded as an ASCII byte.
///
/// The returned pair is `(high, low)`: the digit for the upper four bits first, then the
/// digit for the lower four bits.
fn hex(c: u8) -> (u8, u8) {
    let high_nibble = c >> 4;
    let low_nibble = c & 0x0F;
    (hex_4bit(high_nibble), hex_4bit(low_nibble))
}

impl<'a, B> ser::Serializer for &'a mut Serializer<B>
where
B: heapless::ArrayLength<u8>,
Expand Down Expand Up @@ -212,7 +226,66 @@ where

fn serialize_str(self, v: &str) -> Result<Self::Ok> {
    // Opening quote of the JSON string.
    self.buf.push(b'"')?;

    // Escaping follows rule 6 of the canonical JSON spec ("MUST represent all strings
    // (including object member names) in their minimal-length UTF-8 encoding"):
    // https://gibson042.github.io/canonicaljson-spec/
    //
    // Lone surrogates need no special handling here: surrogate code points do not exist in
    // valid UTF-8, so they cannot occur in a Rust string, even though JSON and JavaScript
    // strings (UCS-2 based) can contain them.
    // Background: https://www.youtube.com/watch?v=HhIEDWmQS3w

    // Scratch space for encoding a single char; a char is at most 4 bytes long in UTF-8.
    let mut utf8_scratch = [0u8; 4];

    for ch in v.chars() {
        // Characters that have a dedicated two-character escape map to the byte that
        // follows the backslash.
        let short_escape = match ch {
            '\\' => Some(b'\\'),
            '"' => Some(b'"'),
            '\u{0008}' => Some(b'b'),
            '\u{0009}' => Some(b't'),
            '\u{000A}' => Some(b'n'),
            '\u{000C}' => Some(b'f'),
            '\u{000D}' => Some(b'r'),
            _ => None,
        };

        if let Some(letter) = short_escape {
            self.buf.push(b'\\')?;
            self.buf.push(letter)?;
        } else if ch <= '\u{001F}' {
            // Remaining control characters use the six-character form \u00XX with
            // upper-case hex digits.
            for &byte in b"\\u00".iter() {
                self.buf.push(byte)?;
            }
            let (high, low) = hex(ch as u8);
            self.buf.push(high)?;
            self.buf.push(low)?;
        } else {
            // Everything else is written unescaped as its UTF-8 bytes.
            let encoded = ch.encode_utf8(&mut utf8_scratch as &mut [u8]);
            self.buf.extend_from_slice(encoded.as_bytes())?;
        }
    }

    // Closing quote.
    self.buf.push(b'"')?;
    Ok(())
}
Expand Down Expand Up @@ -472,6 +545,33 @@ mod tests {
#[test]
fn str() {
    // Each entry pairs a Rust string value with the JSON text `to_string` must produce.
    let cases: &[(&str, &str)] = &[
        ("hello", r#""hello""#),
        ("", r#""""#),
        // Characters are left unescaped whenever possible
        ("ä", r#""ä""#),
        ("৬", r#""৬""#),
        // Disabled in the original test: a non-breaking space (U+00A0) is expected to be
        // written through unescaped as the raw character.
        // ("\u{A0}", r#"" ""#),
        ("ℝ", r#""ℝ""#), // 3 byte character
        ("💣", r#""💣""#), // 4 byte character
        // " and \ must be escaped
        ("foo\"bar", r#""foo\"bar""#),
        ("foo\\bar", r#""foo\\bar""#),
        // \b, \t, \n, \f, \r must use their two-character escapes
        (" \u{0008} ", r#"" \b ""#),
        (" \u{0009} ", r#"" \t ""#),
        (" \u{000A} ", r#"" \n ""#),
        (" \u{000C} ", r#"" \f ""#),
        (" \u{000D} ", r#"" \r ""#),
        // The rest of U+0000 through U+001F uses six-character \u00xx escapes with
        // upper-case hexadecimal digits
        (" \u{0000} ", r#"" \u0000 ""#),
        (" \u{0001} ", r#"" \u0001 ""#),
        (" \u{0007} ", r#"" \u0007 ""#),
        (" \u{000e} ", r#"" \u000E ""#),
        (" \u{001D} ", r#"" \u001D ""#),
        (" \u{001f} ", r#"" \u001F ""#),
    ];

    for &(value, json) in cases {
        assert_eq!(&*crate::to_string::<N, _>(value).unwrap(), json);
    }
}

#[test]
Expand Down