Add unicode escaping in resolver (#84)

* Add unicode escaping in resolver
* Switch to Cow
projectfluent · Feb 1, 2019 · ff5b216 · ff5b216
1 parent 8067752
commit ff5b216
Show file tree

Hide file tree

Showing 9 changed files with 126 additions and 74 deletions.
diff --git a/fluent-bundle/benches/resolver.rs b/fluent-bundle/benches/resolver.rs
@@ -39,7 +39,7 @@ fn get_ids(res: &FluentResource) -> Vec<String> {
 }
 
 fn resolver_bench(c: &mut Criterion) {
-    let tests = &["simple", "menubar"];
+    let tests = &["simple", "menubar", "unescape"];
     let ftl_strings = get_strings(tests);
 
     c.bench_function_over_inputs(

diff --git a/fluent-bundle/benches/unescape.ftl b/fluent-bundle/benches/unescape.ftl
@@ -0,0 +1,9 @@
+face-with-tears-of-joy = 😂
+tetragram-for-centre = 𝌆
+
+surrogates-in-text = \uD83D\uDE02
+surrogates-in-string = {"\uD83D\uDE02"}
+surrogates-in-adjacent-strings = {"\uD83D"}{"\uDE02"}
+
+emoji-in-text = A face 😂 with tears of joy.
+emoji-in-string = {"A face 😂 with tears of joy."}
diff --git a/fluent-bundle/src/resolve.rs b/fluent-bundle/src/resolve.rs
@@ -15,6 +15,7 @@ use super::bundle::FluentBundle;
 use super::entry::GetEntry;
 use super::types::FluentValue;
 use fluent_syntax::ast;
+use fluent_syntax::unicode::unescape_unicode;
 
 #[derive(Debug, PartialEq)]
 pub enum ResolverError {
@@ -176,8 +177,7 @@ impl<'source> ResolveValue for ast::InlineExpression<'source> {
     fn to_value(&self, env: &Env) -> Result<FluentValue, ResolverError> {
         match self {
             ast::InlineExpression::StringLiteral { raw } => {
-                // XXX: We need to decode the raw into unicode here.
-                Ok(FluentValue::from(*raw))
+                Ok(FluentValue::from(unescape_unicode(raw).into_owned()))
             }
             ast::InlineExpression::NumberLiteral { value } => {
                 Ok(FluentValue::as_number(*value).unwrap())

diff --git a/fluent-syntax/benches/parser.rs b/fluent-syntax/benches/parser.rs
@@ -7,6 +7,7 @@ use std::io;
 use std::io::Read;
 
 use fluent_syntax::parser::parse;
+use fluent_syntax::unicode::unescape_unicode;
 
 fn read_file(path: &str) -> Result<String, io::Error> {
     let mut f = File::open(path)?;
@@ -38,5 +39,32 @@ fn parser_bench(c: &mut Criterion) {
     );
 }
 
-criterion_group!(benches, parser_bench);
+/// Benchmarks `unescape_unicode` over a fixed set of sample strings.
+///
+/// The samples mix escape-free text, `\uXXXX` sequences, and the `\"` / `\\`
+/// escapes. All samples are processed inside a single benchmark iteration
+/// registered under the name `"unicode"`.
+fn unicode_unescape_bench(c: &mut Criterion) {
+    let strings = &[
+        "foo",
+        "This is an example value",
+        "Hello \\u00e3\\u00e9 World",
+        "\\u004c\\u006f\\u0072\\u0065\\u006d \\u0069\\u0070\\u0073\\u0075\\u006d \\u0064\\u006f\\u006c\\u006f\\u0072 \\u0073\\u0069\\u0074 \\u0061\\u006d\\u0065\\u0074",
+        "Let me introduce \\\"The\\\" Fluent",
+        "And here's an example of \\\\ a character to be escaped",
+        "But this message is completely unescape free",
+        "And so is this one",
+        "Maybe this one is as well completely escape free",
+        "Welcome to Mozilla Firefox",
+        "\\u0054\\u0068\\u0065\\u0073\\u0065 \\u0073\\u0065\\u0074\\u0074\\u0069\\u006e\\u0067\\u0073 \\u0061\\u0072\\u0065 \\u0074\\u0061\\u0069\\u006c\\u006f\\u0072\\u0065\\u0064 \\u0074\\u006f \\u0079\\u006f\\u0075\\u0072 \\u0063\\u006f\\u006d\\u0070\\u0075\\u0074\\u0065\\u0072\\u2019\\u0073 \\u0068\\u0061\\u0072\\u0064\\u0077\\u0061\\u0072\\u0065 \\u0061\\u006e\\u0064 \\u006f\\u0070\\u0065\\u0072\\u0061\\u0074\\u0069\\u006e\\u0067 \\u0073\\u0079\\u0073\\u0074\\u0065\\u006d\\u002e",
+        "These settings are tailored to your computer’s hardware and operating system",
+        "Use recommended performance settings",
+        "\\u0041\\u0064\\u0064\\u0069\\u0074\\u0069\\u006f\\u006e\\u0061\\u006c \\u0063\\u006f\\u006e\\u0074\\u0065\\u006e\\u0074 \\u0070\\u0072\\u006f\\u0063\\u0065\\u0073\\u0073\\u0065\\u0073 \\u0063\\u0061\\u006e \\u0069\\u006d\\u0070\\u0072\\u006f\\u0076\\u0065 \\u0070\\u0065\\u0072\\u0066\\u006f\\u0072\\u006d\\u0061\\u006e\\u0063\\u0065 \\u0077\\u0068\\u0065\\u006e \\u0075\\u0073\\u0069\\u006e\\u0067 \\u006d\\u0075\\u006c\\u0074\\u0069\\u0070\\u006c\\u0065 \\u0074\\u0061\\u0062\\u0073\\u002c \\u0062\\u0075\\u0074 \\u0077\\u0069\\u006c\\u006c \\u0061\\u006c\\u0073\\u006f \\u0075\\u0073\\u0065 \\u006d\\u006f\\u0072\\u0065 \\u006d\\u0065\\u006d\\u006f\\u0072\\u0079\\u002e",
+        "Additional content processes can improve performance when using multiple tabs, but will also use more memory.",
+    ];
+    c.bench_function("unicode", move |b| {
+        b.iter(|| {
+            for s in strings {
+                // Pass the result through `black_box` so the optimizer cannot
+                // discard the call whose cost is being measured.
+                criterion::black_box(unescape_unicode(s));
+            }
+        })
+    });
+}
+
+criterion_group!(benches, parser_bench, unicode_unescape_bench);
 criterion_main!(benches);
diff --git a/fluent-syntax/src/lib.rs b/fluent-syntax/src/lib.rs
@@ -1,2 +1,3 @@
 pub mod ast;
 pub mod parser;
+pub mod unicode;
diff --git a/fluent-syntax/src/unicode.rs b/fluent-syntax/src/unicode.rs
@@ -0,0 +1,50 @@
+use std::borrow::Cow;
+use std::char;
+
+/// Placeholder emitted for every escape sequence that cannot be decoded.
+const UNKNOWN_CHAR: char = '\u{FFFD}';
+
+/// Converts the hexadecimal digits of a unicode escape into a `char`.
+///
+/// Returns U+FFFD when `s` contains anything other than ASCII hex digits or
+/// when the value is not a valid Unicode scalar value (for example a lone
+/// surrogate such as `D83D`).
+fn encode_unicode(s: &str) -> char {
+    // `from_str_radix` tolerates a leading `+`, so validate the digits first.
+    if !s.bytes().all(|b| b.is_ascii_hexdigit()) {
+        return UNKNOWN_CHAR;
+    }
+    u32::from_str_radix(s, 16)
+        .ok()
+        .and_then(char::from_u32)
+        .unwrap_or(UNKNOWN_CHAR)
+}
+
+/// Decodes the `len` hex digits starting at byte offset `start` of `input`.
+///
+/// Yields U+FFFD when the range runs past the end of `input` or does not end
+/// on a character boundary.
+fn decode_code_point(input: &str, start: usize, len: usize) -> char {
+    input
+        .get(start..start + len)
+        .map(encode_unicode)
+        .unwrap_or(UNKNOWN_CHAR)
+}
+
+/// Resolves the backslash escapes of a raw string literal.
+///
+/// Recognized sequences are `\\`, `\"`, `\uHHHH` (four hex digits) and
+/// `\UHHHHHH` (six hex digits). Any other sequence, including a truncated or
+/// malformed unicode escape and a trailing lone backslash, is replaced by
+/// U+FFFD. A malformed unicode escape still consumes its full nominal length.
+///
+/// The input is returned borrowed when it contains no backslash; an owned
+/// string is only allocated once the first escape is found.
+pub fn unescape_unicode<'u>(input: &'u str) -> Cow<'u, str> {
+    let bytes = input.as_bytes();
+    // Created lazily on the first backslash so escape-free input stays borrowed.
+    let mut unescaped: Option<String> = None;
+    // Start of the literal run that has not been copied to the output yet.
+    let mut run_start = 0;
+    let mut ptr = 0;
+
+    while ptr < bytes.len() {
+        if bytes[ptr] != b'\\' {
+            ptr += 1;
+            continue;
+        }
+
+        let out = unescaped.get_or_insert_with(|| String::with_capacity(input.len()));
+        // Copy the pending literal text as a whole `&str` slice. A backslash is
+        // ASCII, so `ptr` is always a character boundary and multi-byte
+        // characters are preserved intact.
+        out.push_str(&input[run_start..ptr]);
+
+        // `consumed` counts the backslash, the marker byte and any digits.
+        let (new_char, consumed) = match bytes.get(ptr + 1) {
+            Some(b'\\') => ('\\', 2),
+            Some(b'"') => ('"', 2),
+            Some(b'u') => (decode_code_point(input, ptr + 2, 4), 6),
+            Some(b'U') => (decode_code_point(input, ptr + 2, 6), 8),
+            _ => (UNKNOWN_CHAR, 2),
+        };
+        out.push(new_char);
+
+        // A malformed escape may end past the input or inside a multi-byte
+        // character; clamp and realign so the next slice starts on a boundary.
+        ptr = (ptr + consumed).min(bytes.len());
+        while !input.is_char_boundary(ptr) {
+            ptr += 1;
+        }
+        run_start = ptr;
+    }
+
+    match unescaped {
+        Some(mut out) => {
+            out.push_str(&input[run_start..]);
+            Cow::Owned(out)
+        }
+        None => Cow::Borrowed(input),
+    }
+}
diff --git a/fluent-syntax/tests/ast/helper.rs b/fluent-syntax/tests/ast/helper.rs
diff --git a/fluent-syntax/tests/ast/mod.rs b/fluent-syntax/tests/ast/mod.rs
@@ -1,6 +1,5 @@
-mod helper;
-
 use fluent_syntax::ast;
+use fluent_syntax::unicode::unescape_unicode;
 use serde::ser::SerializeMap;
 use serde::ser::SerializeSeq;
 use serde::{Serialize, Serializer};
@@ -360,7 +359,7 @@ where
     let mut map = serializer.serialize_map(Some(3))?;
     map.serialize_entry("type", "StringLiteral")?;
     map.serialize_entry("raw", raw)?;
-    map.serialize_entry("value", &helper::unescape_unicode(&raw))?;
+    map.serialize_entry("value", &unescape_unicode(&raw))?;
     map.end()
 }
 

diff --git a/fluent-syntax/tests/unicode.rs b/fluent-syntax/tests/unicode.rs
@@ -0,0 +1,32 @@
+use fluent_syntax::unicode::unescape_unicode;
+use std::borrow::Cow;
+
+/// Reports whether `input` is the `Cow::Borrowed` variant, i.e. no owned
+/// string had to be allocated for it.
+fn is_cow_borrowed<'a>(input: Cow<'a, str>) -> bool {
+    match input {
+        Cow::Borrowed(_) => true,
+        Cow::Owned(_) => false,
+    }
+}
+
+/// Checks `unescape_unicode` against a table of raw inputs and their expected
+/// unescaped values, and that escape-free input is returned borrowed.
+#[test]
+fn unescape_unicode_test() {
+    // Input without any escape must not allocate.
+    assert!(is_cow_borrowed(unescape_unicode("foo")));
+
+    // (raw input, expected unescaped value)
+    let cases = [
+        ("foo", "foo"),
+        ("foo \\\\", "foo \\"),
+        ("foo \\\"", "foo \""),
+        ("foo \\\\ faa", "foo \\ faa"),
+        ("foo \\\\ faa \\\\ fii", "foo \\ faa \\ fii"),
+        ("foo \\\\\\\" faa \\\"\\\\ fii", "foo \\\" faa \"\\ fii"),
+        ("\\u0041\\u004F", "AO"),
+        // Truncated, malformed and unknown escapes yield the replacement character.
+        ("\\uA", "�"),
+        ("\\uA0Pl", "�"),
+        ("\\d Foo", "� Foo"),
+    ];
+
+    for (raw, expected) in cases.iter() {
+        assert_eq!(unescape_unicode(raw), *expected);
+    }
+}