Skip to content

Commit

Permalink
Add unicode escaping in resolver (#84)
Browse files Browse the repository at this point in the history
* Add unicode escaping in resolver

* Switch to Cow
  • Loading branch information
zbraniecki committed Feb 1, 2019
1 parent 8067752 commit ff5b216
Show file tree
Hide file tree
Showing 9 changed files with 126 additions and 74 deletions.
2 changes: 1 addition & 1 deletion fluent-bundle/benches/resolver.rs
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ fn get_ids(res: &FluentResource) -> Vec<String> {
}

fn resolver_bench(c: &mut Criterion) {
let tests = &["simple", "menubar"];
let tests = &["simple", "menubar", "unescape"];
let ftl_strings = get_strings(tests);

c.bench_function_over_inputs(
Expand Down
9 changes: 9 additions & 0 deletions fluent-bundle/benches/unescape.ftl
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
face-with-tears-of-joy = 😂
tetragram-for-centre = 𝌆
surrogates-in-text = \uD83D\uDE02
surrogates-in-string = {"\uD83D\uDE02"}
surrogates-in-adjacent-strings = {"\uD83D"}{"\uDE02"}
emoji-in-text = A face 😂 with tears of joy.
emoji-in-string = {"A face 😂 with tears of joy."}
4 changes: 2 additions & 2 deletions fluent-bundle/src/resolve.rs
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ use super::bundle::FluentBundle;
use super::entry::GetEntry;
use super::types::FluentValue;
use fluent_syntax::ast;
use fluent_syntax::unicode::unescape_unicode;

#[derive(Debug, PartialEq)]
pub enum ResolverError {
Expand Down Expand Up @@ -176,8 +177,7 @@ impl<'source> ResolveValue for ast::InlineExpression<'source> {
fn to_value(&self, env: &Env) -> Result<FluentValue, ResolverError> {
match self {
ast::InlineExpression::StringLiteral { raw } => {
// XXX: We need to decode the raw into unicode here.
Ok(FluentValue::from(*raw))
Ok(FluentValue::from(unescape_unicode(raw).into_owned()))
}
ast::InlineExpression::NumberLiteral { value } => {
Ok(FluentValue::as_number(*value).unwrap())
Expand Down
30 changes: 29 additions & 1 deletion fluent-syntax/benches/parser.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ use std::io;
use std::io::Read;

use fluent_syntax::parser::parse;
use fluent_syntax::unicode::unescape_unicode;

fn read_file(path: &str) -> Result<String, io::Error> {
let mut f = File::open(path)?;
Expand Down Expand Up @@ -38,5 +39,32 @@ fn parser_bench(c: &mut Criterion) {
);
}

criterion_group!(benches, parser_bench);
// Benchmarks `unescape_unicode` over a fixed set of sample strings and
// registers the measurement with Criterion under the name "unicode".
fn unicode_unescape_bench(c: &mut Criterion) {
// The fixtures mix strings without any backslash, strings made mostly of
// `\uXXXX` escapes, and strings containing `\"` and `\\` escapes. Several
// escaped entries are followed by the same sentence written out in plain text.
let strings = &[
"foo",
"This is an example value",
"Hello \\u00e3\\u00e9 World",
"\\u004c\\u006f\\u0072\\u0065\\u006d \\u0069\\u0070\\u0073\\u0075\\u006d \\u0064\\u006f\\u006c\\u006f\\u0072 \\u0073\\u0069\\u0074 \\u0061\\u006d\\u0065\\u0074",
"Let me introduce \\\"The\\\" Fluent",
"And here's an example of \\\\ a character to be escaped",
"But this message is completely unescape free",
"And so is this one",
"Maybe this one is as well completely escape free",
"Welcome to Mozilla Firefox",
"\\u0054\\u0068\\u0065\\u0073\\u0065 \\u0073\\u0065\\u0074\\u0074\\u0069\\u006e\\u0067\\u0073 \\u0061\\u0072\\u0065 \\u0074\\u0061\\u0069\\u006c\\u006f\\u0072\\u0065\\u0064 \\u0074\\u006f \\u0079\\u006f\\u0075\\u0072 \\u0063\\u006f\\u006d\\u0070\\u0075\\u0074\\u0065\\u0072\\u2019\\u0073 \\u0068\\u0061\\u0072\\u0064\\u0077\\u0061\\u0072\\u0065 \\u0061\\u006e\\u0064 \\u006f\\u0070\\u0065\\u0072\\u0061\\u0074\\u0069\\u006e\\u0067 \\u0073\\u0079\\u0073\\u0074\\u0065\\u006d\\u002e",
"These settings are tailored to your computer’s hardware and operating system",
"Use recommended performance settings",
"\\u0041\\u0064\\u0064\\u0069\\u0074\\u0069\\u006f\\u006e\\u0061\\u006c \\u0063\\u006f\\u006e\\u0074\\u0065\\u006e\\u0074 \\u0070\\u0072\\u006f\\u0063\\u0065\\u0073\\u0073\\u0065\\u0073 \\u0063\\u0061\\u006e \\u0069\\u006d\\u0070\\u0072\\u006f\\u0076\\u0065 \\u0070\\u0065\\u0072\\u0066\\u006f\\u0072\\u006d\\u0061\\u006e\\u0063\\u0065 \\u0077\\u0068\\u0065\\u006e \\u0075\\u0073\\u0069\\u006e\\u0067 \\u006d\\u0075\\u006c\\u0074\\u0069\\u0070\\u006c\\u0065 \\u0074\\u0061\\u0062\\u0073\\u002c \\u0062\\u0075\\u0074 \\u0077\\u0069\\u006c\\u006c \\u0061\\u006c\\u0073\\u006f \\u0075\\u0073\\u0065 \\u006d\\u006f\\u0072\\u0065 \\u006d\\u0065\\u006d\\u006f\\u0072\\u0079\\u002e",
"Additional content processes can improve performance when using multiple tabs, but will also use more memory.",
];
// One timed iteration unescapes every fixture once, in order.
c.bench_function("unicode", move |b| {
b.iter(|| {
for s in strings {
// NOTE(review): the returned value is dropped immediately; confirm the
// optimizer cannot elide the call, or wrap it in a black-box helper.
unescape_unicode(s);
}
})
});
}

criterion_group!(benches, parser_bench, unicode_unescape_bench);
criterion_main!(benches);
1 change: 1 addition & 0 deletions fluent-syntax/src/lib.rs
Original file line number Diff line number Diff line change
@@ -1,2 +1,3 @@
pub mod ast;
pub mod parser;
pub mod unicode;
50 changes: 50 additions & 0 deletions fluent-syntax/src/unicode.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
use std::borrow::Cow;
use std::char;

/// Character emitted in place of any escape sequence that cannot be decoded.
const REPLACEMENT: char = '\u{FFFD}';

/// Converts a run of hexadecimal digits into the `char` with that code point.
///
/// Returns U+FFFD when `s` contains anything other than ASCII hex digits,
/// when it is empty, or when the value is not a valid Unicode scalar value
/// (for example a lone surrogate such as `D83D`).
fn encode_unicode(s: &str) -> char {
    // `from_str_radix` tolerates a leading `+`, which is not a hex digit and
    // must not be accepted as part of an escape, so validate the digits first.
    if !s.bytes().all(|b| b.is_ascii_hexdigit()) {
        return REPLACEMENT;
    }
    u32::from_str_radix(s, 16)
        .ok()
        .and_then(char::from_u32)
        .unwrap_or(REPLACEMENT)
}

/// Resolves the backslash escapes in `input`.
///
/// Recognised sequences are `\\`, `\"`, `\uHHHH` (four hex digits) and
/// `\UHHHHHH` (six hex digits). Any other sequence, including a truncated or
/// non-hexadecimal unicode escape and a trailing lone backslash, produces a
/// single U+FFFD replacement character.
///
/// When `input` contains no backslash it is returned as `Cow::Borrowed`
/// without allocating; otherwise a newly built `String` is returned as
/// `Cow::Owned`.
pub fn unescape_unicode<'u>(input: &'u str) -> Cow<'u, str> {
    // Fast path: nothing to unescape, hand the input back untouched.
    let mut ptr = match input.find('\\') {
        Some(idx) => idx,
        None => return Cow::Borrowed(input),
    };

    let bytes = input.as_bytes();
    let mut result = String::with_capacity(input.len());
    result.push_str(&input[..ptr]);

    while ptr < bytes.len() {
        if bytes[ptr] != b'\\' {
            // Copy the literal run up to the next backslash as a `str` slice
            // so that multi-byte UTF-8 sequences are preserved intact.
            let end = input[ptr..]
                .find('\\')
                .map_or(input.len(), |offset| ptr + offset);
            result.push_str(&input[ptr..end]);
            ptr = end;
            continue;
        }

        // Step over the backslash to look at the escape designator.
        ptr += 1;

        let new_char = match bytes.get(ptr) {
            Some(b'\\') => '\\',
            Some(b'"') => '"',
            Some(u @ b'u') | Some(u @ b'U') => {
                let start = ptr + 1;
                let len = if u == &b'u' { 4 } else { 6 };
                // The digits are consumed even when they turn out to be
                // invalid, so a malformed escape yields one replacement.
                ptr += len;
                input
                    .get(start..(start + len))
                    .map_or(REPLACEMENT, encode_unicode)
            }
            _ => REPLACEMENT,
        };
        result.push(new_char);
        ptr += 1;

        // A malformed escape may leave the cursor inside a multi-byte
        // character; move forward to the next boundary so the remainder can
        // be sliced safely and no stray continuation bytes are emitted.
        while ptr < bytes.len() && !input.is_char_boundary(ptr) {
            ptr += 1;
        }
    }

    Cow::Owned(result)
}
67 changes: 0 additions & 67 deletions fluent-syntax/tests/ast/helper.rs

This file was deleted.

5 changes: 2 additions & 3 deletions fluent-syntax/tests/ast/mod.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
mod helper;

use fluent_syntax::ast;
use fluent_syntax::unicode::unescape_unicode;
use serde::ser::SerializeMap;
use serde::ser::SerializeSeq;
use serde::{Serialize, Serializer};
Expand Down Expand Up @@ -360,7 +359,7 @@ where
let mut map = serializer.serialize_map(Some(3))?;
map.serialize_entry("type", "StringLiteral")?;
map.serialize_entry("raw", raw)?;
map.serialize_entry("value", &helper::unescape_unicode(&raw))?;
map.serialize_entry("value", &unescape_unicode(&raw))?;
map.end()
}

Expand Down
32 changes: 32 additions & 0 deletions fluent-syntax/tests/unicode.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
use fluent_syntax::unicode::unescape_unicode;
use std::borrow::Cow;

/// Reports whether `input` is the `Cow::Borrowed` variant, i.e. whether the
/// value still points at existing data rather than owning a `String`.
fn is_cow_borrowed<'a>(input: Cow<'a, str>) -> bool {
    match input {
        Cow::Borrowed(_) => true,
        Cow::Owned(_) => false,
    }
}

#[test]
fn unescape_unicode_test() {
    // Input without any escape must come back borrowed.
    assert!(is_cow_borrowed(unescape_unicode("foo")));

    // Pairs of (raw input, expected unescaped output), checked in order.
    let cases = [
        ("foo", "foo"),
        ("foo \\\\", "foo \\"),
        ("foo \\\"", "foo \""),
        ("foo \\\\ faa", "foo \\ faa"),
        ("foo \\\\ faa \\\\ fii", "foo \\ faa \\ fii"),
        ("foo \\\\\\\" faa \\\"\\\\ fii", "foo \\\" faa \"\\ fii"),
        ("\\u0041\\u004F", "AO"),
        ("\\uA", "�"),
        ("\\uA0Pl", "�"),
        ("\\d Foo", "� Foo"),
    ];

    for (raw, expected) in cases.iter() {
        assert_eq!(unescape_unicode(raw), *expected);
    }
}

0 comments on commit ff5b216

Please sign in to comment.