From c8fdfacafb7324444b13eb67c270e1b6abee4203 Mon Sep 17 00:00:00 2001 From: Alex Crichton Date: Mon, 1 Apr 2019 11:09:57 -0700 Subject: [PATCH] Add warnings about UTF-16 vs UTF-8 strings This commit aims to address #1348 via a number of strategies: * Documentation is updated to warn about UTF-16 vs UTF-8 problems between JS and Rust. Notably documenting that `as_string` and handling of arguments is lossy when there are lone surrogates. * A `JsString::is_valid_utf16` method was added to test whether `as_string` is lossless or not. The intention is that most default behavior of `wasm-bindgen` will remain, but where necessary bindings will use `JsString` instead of `str`/`String` and will manually check for `is_valid_utf16` as necessary. It's also hypothesized that this is relatively rare and not too performance critical, so an optimized intrinsic for `is_valid_utf16` is not yet provided. Closes #1348 --- crates/js-sys/src/lib.rs | 29 ++++++++++++++++++++++++++++ crates/js-sys/tests/wasm/JsString.rs | 12 ++++++++++++ guide/src/reference/types/str.md | 22 +++++++++++++++++++++ guide/src/reference/types/string.md | 3 +++ src/lib.rs | 22 +++++++++++++++++++++ 5 files changed, 88 insertions(+) diff --git a/crates/js-sys/src/lib.rs b/crates/js-sys/src/lib.rs index 85fcab47ce7..d8c8daecfa7 100644 --- a/crates/js-sys/src/lib.rs +++ b/crates/js-sys/src/lib.rs @@ -3522,6 +3522,35 @@ impl JsString { None } } + + /// Returns whether this string is a valid UTF-16 string. + /// + /// This is useful for learning whether `as_string()` will return a lossless + /// representation of the JS string. If this string contains unpaired + /// surrogates then `as_string()` will succeed but it will be a lossy + /// representation of the JS string because lone surrogates will become + /// replacement characters. 
+ /// + /// If this function returns `false` then to get a lossless representation + /// of the string you'll need to manually use the `char_code_at` accessor to + /// access the raw code units. + /// + /// For more information, see the documentation on [JS strings vs Rust + /// strings][docs] + /// + /// [docs]: https://rustwasm.github.io/docs/wasm-bindgen/reference/types/str.html + pub fn is_valid_utf16(&self) -> bool { + std::char::decode_utf16(self.iter()).all(|i| i.is_ok()) + } + + /// Returns an iterator over the u16 character codes that make up this JS + /// string. + /// + /// This method will call `char_code_at` for each code in this JS string, + /// returning an iterator of the codes in sequence. + pub fn iter<'a>(&'a self) -> impl ExactSizeIterator<Item = u16> + 'a { + (0..self.length()).map(move |i| self.char_code_at(i) as u16) + } } impl PartialEq for JsString { diff --git a/crates/js-sys/tests/wasm/JsString.rs b/crates/js-sys/tests/wasm/JsString.rs index bb4a6ac0510..c7f229f1613 100644 --- a/crates/js-sys/tests/wasm/JsString.rs +++ b/crates/js-sys/tests/wasm/JsString.rs @@ -541,3 +541,15 @@ fn raw() { ); assert!(JsString::raw_0(&JsValue::null().unchecked_into()).is_err()); } + +#[wasm_bindgen_test] +fn is_valid_utf16() { + assert!(JsString::from("a").is_valid_utf16()); + assert!(JsString::from("").is_valid_utf16()); + assert!(JsString::from("🥑").is_valid_utf16()); + assert!(JsString::from("Why hello there this, 🥑, is 🥑 and is 🥑").is_valid_utf16()); + + assert!(JsString::from_char_code1(0x00).is_valid_utf16()); + assert!(!JsString::from_char_code1(0xd800).is_valid_utf16()); + assert!(!JsString::from_char_code1(0xdc00).is_valid_utf16()); +} diff --git a/guide/src/reference/types/str.md b/guide/src/reference/types/str.md index 999bbc183f9..86413a464ae 100644 --- a/guide/src/reference/types/str.md +++ b/guide/src/reference/types/str.md @@ -20,3 +20,25 @@ with handles to JavaScript string values, use the `js_sys::JsString` type. 
```js {{#include ../../../../examples/guide-supported-types-examples/str.js}} ``` + + ## UTF-16 vs UTF-8 + +Strings in JavaScript are encoded as UTF-16, but with one major exception: they +can contain unpaired surrogates. For some Unicode characters UTF-16 uses two +16-bit values. These are called "surrogate pairs" because they always come in +pairs. In JavaScript, it is possible for these surrogate pairs to be missing the +other half, creating an "unpaired surrogate". + +When passing a string from JavaScript to Rust, `wasm-bindgen` uses the `TextEncoder` API to +convert from UTF-16 to UTF-8. This is normally perfectly fine... unless there +are unpaired surrogates. In that case it will replace the unpaired surrogates +with U+FFFD (�, the replacement character). That means the string in Rust is +now different from the string in JavaScript! + +If you want to guarantee that the Rust string is the same as the JavaScript +string, you should instead use `js_sys::JsString` (which keeps the string in +JavaScript and doesn't copy it into Rust). + +If you simply want to ignore strings which contain unpaired surrogates, you can +use `JsString::is_valid_utf16` to test whether the string contains unpaired +surrogates or not. diff --git a/guide/src/reference/types/string.md b/guide/src/reference/types/string.md index 568e20b63e1..3b846704abf 100644 --- a/guide/src/reference/types/string.md +++ b/guide/src/reference/types/string.md @@ -8,6 +8,9 @@ Copies the string's contents back and forth between the JavaScript garbage-collected heap and the Wasm linear memory with `TextDecoder` and `TextEncoder` +> **Note**: Be sure to check out the [documentation for `str`](str.html) to +> learn about some caveats when working with strings between JS and Rust. 
+ ## Example Rust Usage ```rust diff --git a/src/lib.rs b/src/lib.rs index 0cd5034b812..02b3013e66f 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -260,6 +260,28 @@ impl JsValue { /// /// If this JS value is not an instance of a string or if it's not valid /// utf-8 then this returns `None`. + /// + /// # UTF-16 vs UTF-8 + /// + /// Strings in JavaScript are encoded as UTF-16, but with one major + /// exception: they can contain unpaired surrogates. For some Unicode + /// characters UTF-16 uses two 16-bit values. These are called "surrogate + /// pairs" because they always come in pairs. In JavaScript, it is possible + /// for these surrogate pairs to be missing the other half, creating an + /// "unpaired surrogate". + /// + /// This method internally will use `TextEncoder` in JS to convert from + /// UTF-16 to UTF-8. This is normally perfectly fine... unless there are + /// unpaired surrogates. In that case it will replace the unpaired + /// surrogates with U+FFFD (�, the replacement character). That means the + /// string returned from this function is actually different from the string + /// in JS! + /// + /// If you want to guarantee that the returned string is the same as the + /// source string, you'll need to use `js_sys::JsString::is_valid_utf16`. If + /// that function returns `true` then this function will be lossless. If the + /// string isn't valid utf-16, however, then you'll need to use + /// `JsString::iter` and retain the u16 values exactly. #[cfg(feature = "std")] pub fn as_string(&self) -> Option<String> { unsafe {