From c8fdfacafb7324444b13eb67c270e1b6abee4203 Mon Sep 17 00:00:00 2001
From: Alex Crichton <alex@alexcrichton.com>
Date: Mon, 1 Apr 2019 11:09:57 -0700
Subject: [PATCH] Add warnings about UTF-16 vs UTF-8 strings

This commit aims to address #1348 via a number of strategies:

* Documentation is updated to warn about UTF-16 vs UTF-8 problems
  between JS and Rust. Notably documenting that `as_string` and handling
  of arguments is lossy when there are lone surrogates.

* A `JsString::is_valid_utf16` method was added to test whether
  `as_string` is lossless or not.

The intention is that most default behavior of `wasm-bindgen` will
remain, but where necessary bindings will use `JsString` instead of
`str`/`String` and will manually check for `is_valid_utf16` as
necessary. It's also hypothesized that this is relatively rare and not
too performance critical, so an optimized intrinsic for `is_valid_utf16`
is not yet provided.

Closes #1348
---
 crates/js-sys/src/lib.rs             | 29 ++++++++++++++++++++++++++++
 crates/js-sys/tests/wasm/JsString.rs | 12 ++++++++++++
 guide/src/reference/types/str.md     | 22 +++++++++++++++++++++
 guide/src/reference/types/string.md  |  3 +++
 src/lib.rs                           | 22 +++++++++++++++++++++
 5 files changed, 88 insertions(+)
diff --git a/crates/js-sys/src/lib.rs b/crates/js-sys/src/lib.rs
index 85fcab47ce7..d8c8daecfa7 100644
--- a/crates/js-sys/src/lib.rs
+++ b/crates/js-sys/src/lib.rs
@@ -3522,6 +3522,35 @@ impl JsString {
             None
         }
     }
+
+    /// Returns whether this string is a valid UTF-16 string.
+    ///
+    /// This is useful for learning whether `as_string()` will return a lossless
+    /// representation of the JS string. If this string contains unpaired
+    /// surrogates then `as_string()` will succeed but it will be a lossy
+    /// representation of the JS string because lone surrogates will become
+    /// replacement characters.
+    ///
+    /// If this function returns `false` then to get a lossless representation
+    /// of the string you'll need to manually use the `char_code_at` accessor to
+    /// access the raw code units.
+    ///
+    /// For more information, see the documentation on [JS strings vs Rust
+    /// strings][docs].
+    ///
+    /// [docs]: https://rustwasm.github.io/docs/wasm-bindgen/reference/types/str.html
+    pub fn is_valid_utf16(&self) -> bool {
+        std::char::decode_utf16(self.iter()).all(|i| i.is_ok())
+    }
+
+    /// Returns an iterator over the u16 character codes that make up this JS
+    /// string.
+    ///
+    /// This method will call `char_code_at` for each code in this JS string,
+    /// returning an iterator of the codes in sequence.
+    pub fn iter<'a>(&'a self) -> impl ExactSizeIterator<Item = u16> + 'a {
+        (0..self.length()).map(move |i| self.char_code_at(i) as u16)
+    }
 }
 
 impl PartialEq<str> for JsString {
diff --git a/crates/js-sys/tests/wasm/JsString.rs b/crates/js-sys/tests/wasm/JsString.rs
index bb4a6ac0510..c7f229f1613 100644
--- a/crates/js-sys/tests/wasm/JsString.rs
+++ b/crates/js-sys/tests/wasm/JsString.rs
@@ -541,3 +541,15 @@ fn raw() {
     );
     assert!(JsString::raw_0(&JsValue::null().unchecked_into()).is_err());
 }
+
+#[wasm_bindgen_test]
+fn is_valid_utf16() {
+    assert!(JsString::from("a").is_valid_utf16());
+    assert!(JsString::from("").is_valid_utf16());
+    assert!(JsString::from("🥑").is_valid_utf16());
+    assert!(JsString::from("Why hello there this, 🥑, is 🥑 and is 🥑").is_valid_utf16());
+
+    assert!(JsString::from_char_code1(0x00).is_valid_utf16());
+    assert!(!JsString::from_char_code1(0xd800).is_valid_utf16());
+    assert!(!JsString::from_char_code1(0xdc00).is_valid_utf16());
+}
diff --git a/guide/src/reference/types/str.md b/guide/src/reference/types/str.md
index 999bbc183f9..86413a464ae 100644
--- a/guide/src/reference/types/str.md
+++ b/guide/src/reference/types/str.md
@@ -20,3 +20,25 @@ with handles to JavaScript string values, use the `js_sys::JsString` type.
 ```js
 {{#include ../../../../examples/guide-supported-types-examples/str.js}}
 ```
+
+## UTF-16 vs UTF-8
+
+Strings in JavaScript are encoded as UTF-16, but with one major exception: they
+can contain unpaired surrogates. For some Unicode characters UTF-16 uses two
+16-bit values. These are called "surrogate pairs" because they always come in
+pairs. In JavaScript, it is possible for these surrogate pairs to be missing the
+other half, creating an "unpaired surrogate".
+
+When passing a string from JavaScript to Rust, it uses the `TextEncoder` API to
+convert from UTF-16 to UTF-8. This is normally perfectly fine... unless there
+are unpaired surrogates. In that case it will replace the unpaired surrogates
+with U+FFFD (�, the replacement character). That means the string in Rust is
+now different from the string in JavaScript!
+
+If you want to guarantee that the Rust string is the same as the JavaScript
+string, you should instead use `js_sys::JsString` (which keeps the string in
+JavaScript and doesn't copy it into Rust).
+
+If you simply want to ignore strings which contain unpaired surrogates, you can
+use `JsString::is_valid_utf16` to test whether the string contains unpaired
+surrogates or not.
diff --git a/guide/src/reference/types/string.md b/guide/src/reference/types/string.md
index 568e20b63e1..3b846704abf 100644
--- a/guide/src/reference/types/string.md
+++ b/guide/src/reference/types/string.md
@@ -8,6 +8,9 @@ Copies the string's contents back and forth between the JavaScript
 garbage-collected heap and the Wasm linear memory with `TextDecoder` and
 `TextEncoder`
 
+> **Note**: Be sure to check out the [documentation for `str`](str.html) to
+> learn about some caveats when working with strings between JS and Rust.
+
 ## Example Rust Usage
 
 ```rust
diff --git a/src/lib.rs b/src/lib.rs
index 0cd5034b812..02b3013e66f 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -260,6 +260,28 @@ impl JsValue {
     ///
     /// If this JS value is not an instance of a string or if it's not valid
     /// utf-8 then this returns `None`.
+    ///
+    /// # UTF-16 vs UTF-8
+    ///
+    /// Strings in JavaScript are encoded as UTF-16, but with one major
+    /// exception: they can contain unpaired surrogates. For some Unicode
+    /// characters UTF-16 uses two 16-bit values. These are called "surrogate
+    /// pairs" because they always come in pairs. In JavaScript, it is possible
+    /// for these surrogate pairs to be missing the other half, creating an
+    /// "unpaired surrogate".
+    ///
+    /// This method internally will use `TextEncoder` in JS to convert from
+    /// UTF-16 to UTF-8. This is normally perfectly fine...  unless there are
+    /// unpaired surrogates. In that case it will replace the unpaired
+    /// surrogates with U+FFFD (�, the replacement character). That means the
+    /// string returned from this function is actually different from the string
+    /// in JS!
+    ///
+    /// If you want to guarantee that the returned string is the same as the
+    /// source string, you'll need to use `js_sys::JsString::is_valid_utf16`. If
+    /// that function returns `true` then this function will be lossless. If the
+    /// string isn't valid utf-16, however, then you'll need to use
+    /// `JsString::iter` and retain the u16 values exactly.
     #[cfg(feature = "std")]
     pub fn as_string(&self) -> Option<String> {
         unsafe {