diff --git a/fluent-syntax/src/errors.js b/fluent-syntax/src/errors.js index 9b8154fc5..028ee6a4a 100644 --- a/fluent-syntax/src/errors.js +++ b/fluent-syntax/src/errors.js @@ -69,8 +69,8 @@ function getErrorMessage(code, args) { return `Unknown escape sequence: \\${char}.`; } case "E0026": { - const [char] = args; - return `Invalid Unicode escape sequence: \\u${char}.`; + const [sequence] = args; + return `Invalid Unicode escape sequence: ${sequence}.`; } case "E0027": return "Unbalanced closing brace in TextElement."; diff --git a/fluent-syntax/src/parser.js b/fluent-syntax/src/parser.js index 5ad68c96f..2fcbd5b29 100644 --- a/fluent-syntax/src/parser.js +++ b/fluent-syntax/src/parser.js @@ -6,10 +6,6 @@ import { ParseError } from "./errors"; const trailingWSRe = /[ \t\n\r]+$/; -// The Fluent Syntax spec uses /.*/ to parse comment lines. It matches all -// characters except the following ones, which are considered line endings by -// the regex engine. -const COMMENT_EOL = ["\n", "\r", "\u2028", "\u2029"]; function withSpan(fn) { @@ -194,10 +190,10 @@ export default class FluentParser { level = i; } - if (!COMMENT_EOL.includes(ps.currentChar)) { + if (ps.currentChar !== EOL) { ps.expectChar(" "); let ch; - while ((ch = ps.takeChar(x => !COMMENT_EOL.includes(x)))) { + while ((ch = ps.takeChar(x => x !== EOL))) { content += ch; } } @@ -231,7 +227,7 @@ export default class FluentParser { ps.skipBlankInline(); ps.expectChar("="); - const value = this.maybeGetValue(ps, {allowVariantList: false}); + const value = this.maybeGetPattern(ps); const attrs = this.getAttributes(ps); if (value === null && attrs.length === 0) { @@ -248,11 +244,9 @@ export default class FluentParser { ps.skipBlankInline(); ps.expectChar("="); - // XXX Once https://github.com/projectfluent/fluent/pull/220 lands, - // getTerm will be the only place where VariantLists are still legal. Move - // the code from getPatternOrVariantList up to here then, and remove the - // allowVariantList switch. 
- const value = this.maybeGetValue(ps, {allowVariantList: true}); + // Syntax 0.8 compat: VariantLists are supported but deprecated. They can + // only be found as values of Terms. Nested VariantLists are not allowed. + const value = this.maybeGetVariantList(ps) || this.maybeGetPattern(ps); if (value === null) { throw new ParseError("E0006", id.name); } @@ -269,7 +263,7 @@ export default class FluentParser { ps.skipBlankInline(); ps.expectChar("="); - const value = this.maybeGetValue(ps, {allowVariantList: false}); + const value = this.maybeGetPattern(ps); if (value === null) { throw new ParseError("E0012"); } @@ -316,7 +310,7 @@ export default class FluentParser { return this.getIdentifier(ps); } - getVariant(ps, {hasDefault, allowVariantList}) { + getVariant(ps, {hasDefault}) { let defaultIndex = false; if (ps.currentChar === "*") { @@ -337,9 +331,7 @@ export default class FluentParser { ps.skipBlank(); ps.expectChar("]"); - // XXX We need to pass allowVariantList all the way down to here because - // nested VariantLists in Terms are legal for now. - const value = this.maybeGetValue(ps, {allowVariantList}); + const value = this.maybeGetPattern(ps); if (value === null) { throw new ParseError("E0012"); } @@ -347,13 +339,13 @@ export default class FluentParser { return new AST.Variant(key, value, defaultIndex); } - getVariants(ps, {allowVariantList}) { + getVariants(ps) { const variants = []; let hasDefault = false; ps.skipBlank(); while (ps.isVariantStart()) { - const variant = this.getVariant(ps, {allowVariantList, hasDefault}); + const variant = this.getVariant(ps, {hasDefault}); if (variant.default) { hasDefault = true; @@ -409,34 +401,34 @@ export default class FluentParser { return new AST.NumberLiteral(num); } - // maybeGetValue distinguishes between patterns which start on the same line + // maybeGetPattern distinguishes between patterns which start on the same line // as the identifier (a.k.a. 
inline signleline patterns and inline multiline // patterns) and patterns which start on a new line (a.k.a. block multiline // patterns). The distinction is important for the dedentation logic: the // indent of the first line of a block pattern must be taken into account when // calculating the maximum common indent. - maybeGetValue(ps, {allowVariantList}) { + maybeGetPattern(ps) { ps.peekBlankInline(); if (ps.isValueStart()) { ps.skipToPeek(); - return this.getPatternOrVariantList( - ps, {isBlock: false, allowVariantList}); + return this.getPattern(ps, {isBlock: false}); } ps.peekBlankBlock(); if (ps.isValueContinuation()) { ps.skipToPeek(); - return this.getPatternOrVariantList( - ps, {isBlock: true, allowVariantList}); + return this.getPattern(ps, {isBlock: true}); } return null; } - // Parse a VariantList (if allowed) or a Pattern. - getPatternOrVariantList(ps, {isBlock, allowVariantList}) { - ps.peekBlankInline(); - if (allowVariantList && ps.currentPeek === "{") { + // Deprecated in Syntax 0.8. VariantLists are only allowed as values of Terms. + // Values of Messages, Attributes and Variants must be Patterns. This method + // is only used in getTerm. 
+ maybeGetVariantList(ps) { + ps.peekBlank(); + if (ps.currentPeek === "{") { const start = ps.peekOffset; ps.peek(); ps.peekBlankInline(); @@ -445,19 +437,18 @@ export default class FluentParser { if (ps.isVariantStart()) { ps.resetPeek(start); ps.skipToPeek(); - return this.getVariantList(ps, {allowVariantList}); + return this.getVariantList(ps); } } } ps.resetPeek(); - const pattern = this.getPattern(ps, {isBlock}); - return pattern; + return null; } getVariantList(ps) { ps.expectChar("{"); - var variants = this.getVariants(ps, {allowVariantList: true}); + var variants = this.getVariants(ps); ps.expectChar("}"); return new AST.VariantList(variants); } @@ -599,37 +590,44 @@ export default class FluentParser { getEscapeSequence(ps) { const next = ps.currentChar; - if (next === "\\" || next === "\"") { - ps.next(); - return [`\\${next}`, next]; + switch (next) { + case "\\": + case "\"": + ps.next(); + return [`\\${next}`, next]; + case "u": + return this.getUnicodeEscapeSequence(ps, next, 4); + case "U": + return this.getUnicodeEscapeSequence(ps, next, 6); + default: + throw new ParseError("E0025", next); } + } - if (next === "u") { - let sequence = ""; - ps.next(); - - for (let i = 0; i < 4; i++) { - const ch = ps.takeHexDigit(); + getUnicodeEscapeSequence(ps, u, digits) { + ps.expectChar(u); - if (!ch) { - throw new ParseError("E0026", sequence + ps.currentChar); - } + let sequence = ""; + for (let i = 0; i < digits; i++) { + const ch = ps.takeHexDigit(); - sequence += ch; + if (!ch) { + throw new ParseError( + "E0026", `\\${u}${sequence}${ps.currentChar}`); } - const codepoint = parseInt(sequence, 16); - const unescaped = codepoint <= 0xD7FF || 0xE000 <= codepoint - // It's a Unicode scalar value. - ? String.fromCodePoint(codepoint) - // Escape sequences reresenting surrogate code points are well-formed - // but invalid in Fluent. Replace them with U+FFFD REPLACEMENT - // CHARACTER. 
- : "�"; - return [`\\u${sequence}`, unescaped]; + sequence += ch; } - throw new ParseError("E0025", next); + const codepoint = parseInt(sequence, 16); + const unescaped = codepoint <= 0xD7FF || 0xE000 <= codepoint + // It's a Unicode scalar value. + ? String.fromCodePoint(codepoint) + // Escape sequences representing surrogate code points are well-formed + // but invalid in Fluent. Replace them with U+FFFD REPLACEMENT + // CHARACTER. + : "�"; + return [`\\${u}${sequence}`, unescaped]; } getPlaceable(ps) { @@ -676,7 +674,7 @@ export default class FluentParser { ps.skipBlankInline(); ps.expectLineEnd(); - const variants = this.getVariants(ps, {allowVariantList: false}); + const variants = this.getVariants(ps); return new AST.SelectExpression(selector, variants); } diff --git a/fluent-syntax/test/fixtures_behavior/escape_sequences.ftl b/fluent-syntax/test/fixtures_behavior/escape_sequences.ftl index 801923be2..19ad1329a 100644 --- a/fluent-syntax/test/fixtures_behavior/escape_sequences.ftl +++ b/fluent-syntax/test/fixtures_behavior/escape_sequences.ftl @@ -12,5 +12,5 @@ key08 = {"Escaped \u0041 A"} # ~ERROR E0025, pos 232, args "A" key09 = {"\A"} -# ~ERROR E0026, pos 252, args "000z" +# ~ERROR E0026, pos 252, args "\u000z" key10 = {"\u000z"} diff --git a/fluent-syntax/test/fixtures_behavior/variant_lists.ftl b/fluent-syntax/test/fixtures_behavior/variant_lists.ftl index 2f3cc5b8d..21ffb8f1d 100644 --- a/fluent-syntax/test/fixtures_behavior/variant_lists.ftl +++ b/fluent-syntax/test/fixtures_behavior/variant_lists.ftl @@ -17,6 +17,7 @@ message2 = *[one] One } +# ~ERROR E0014, pos 211 -term2 = { *[one] { diff --git a/fluent-syntax/test/fixtures_reference/cr.json b/fluent-syntax/test/fixtures_reference/cr.json index afec8119a..44eab75f3 100644 --- a/fluent-syntax/test/fixtures_reference/cr.json +++ b/fluent-syntax/test/fixtures_reference/cr.json @@ -2,9 +2,8 @@ "type": "Resource", "body": [ { - "type": "Junk", - "annotations": [], - "content": "### This entire file uses 
CR as EOL.\r\rerr01 = Value 01\rerr02 = Value 02\r\rerr03 =\r\r Value 03\r Continued\r\r .title = Title\r\rerr04 = { \"str\r\rerr05 = { $sel -> }\r" + "type": "ResourceComment", + "content": "This entire file uses CR as EOL.\r\rerr01 = Value 01\rerr02 = Value 02\r\rerr03 =\r\r Value 03\r Continued\r\r .title = Title\r\rerr04 = { \"str\r\rerr05 = { $sel -> }\r" } ] } diff --git a/fluent-syntax/test/fixtures_reference/escaped_characters.ftl b/fluent-syntax/test/fixtures_reference/escaped_characters.ftl index 5242a4bcb..ec8623202 100644 --- a/fluent-syntax/test/fixtures_reference/escaped_characters.ftl +++ b/fluent-syntax/test/fixtures_reference/escaped_characters.ftl @@ -14,8 +14,20 @@ mismatched-quote = {"\\""} unknown-escape = {"\x"} ## Unicode escapes -string-unicode-sequence = {"\u0041"} -string-escaped-unicode = {"\\u0041"} +string-unicode-4digits = {"\u0041"} +escape-unicode-4digits = {"\\u0041"} +string-unicode-6digits = {"\U01F602"} +escape-unicode-6digits = {"\\U01F602"} + +# OK The trailing "00" is part of the literal value. +string-too-many-4digits = {"\u004100"} +# OK The trailing "00" is part of the literal value. +string-too-many-6digits = {"\U01F60200"} + +# ERROR Too few hex digits after \u. +string-too-few-4digits = {"\u41"} +# ERROR Too few hex digits after \U. +string-too-few-6digits = {"\U1F602"} ## Literal braces brace-open = An opening {"{"} brace. 
diff --git a/fluent-syntax/test/fixtures_reference/escaped_characters.json b/fluent-syntax/test/fixtures_reference/escaped_characters.json index e05c0efe6..a3220996a 100644 --- a/fluent-syntax/test/fixtures_reference/escaped_characters.json +++ b/fluent-syntax/test/fixtures_reference/escaped_characters.json @@ -179,7 +179,7 @@ "type": "Message", "id": { "type": "Identifier", - "name": "string-unicode-sequence" + "name": "string-unicode-4digits" }, "value": { "type": "Pattern", @@ -201,7 +201,7 @@ "type": "Message", "id": { "type": "Identifier", - "name": "string-escaped-unicode" + "name": "escape-unicode-4digits" }, "value": { "type": "Pattern", @@ -219,6 +219,118 @@ "attributes": [], "comment": null }, + { + "type": "Message", + "id": { + "type": "Identifier", + "name": "string-unicode-6digits" + }, + "value": { + "type": "Pattern", + "elements": [ + { + "type": "Placeable", + "expression": { + "type": "StringLiteral", + "raw": "\\U01F602", + "value": "😂" + } + } + ] + }, + "attributes": [], + "comment": null + }, + { + "type": "Message", + "id": { + "type": "Identifier", + "name": "escape-unicode-6digits" + }, + "value": { + "type": "Pattern", + "elements": [ + { + "type": "Placeable", + "expression": { + "type": "StringLiteral", + "raw": "\\\\U01F602", + "value": "\\U01F602" + } + } + ] + }, + "attributes": [], + "comment": null + }, + { + "type": "Message", + "id": { + "type": "Identifier", + "name": "string-too-many-4digits" + }, + "value": { + "type": "Pattern", + "elements": [ + { + "type": "Placeable", + "expression": { + "type": "StringLiteral", + "raw": "\\u004100", + "value": "A00" + } + } + ] + }, + "attributes": [], + "comment": { + "type": "Comment", + "content": "OK The trailing \"00\" is part of the literal value." 
+ } + }, + { + "type": "Message", + "id": { + "type": "Identifier", + "name": "string-too-many-6digits" + }, + "value": { + "type": "Pattern", + "elements": [ + { + "type": "Placeable", + "expression": { + "type": "StringLiteral", + "raw": "\\U01F60200", + "value": "😂00" + } + } + ] + }, + "attributes": [], + "comment": { + "type": "Comment", + "content": "OK The trailing \"00\" is part of the literal value." + } + }, + { + "type": "Comment", + "content": "ERROR Too few hex digits after \\u." + }, + { + "type": "Junk", + "annotations": [], + "content": "string-too-few-4digits = {\"\\u41\"}\n" + }, + { + "type": "Comment", + "content": "ERROR Too few hex digits after \\U." + }, + { + "type": "Junk", + "annotations": [], + "content": "string-too-few-6digits = {\"\\U1F602\"}\n\n" + }, { "type": "GroupComment", "content": "Literal braces" diff --git a/fluent-syntax/test/fixtures_reference/select_expressions.ftl b/fluent-syntax/test/fixtures_reference/select_expressions.ftl index ac888262b..7a1fb820a 100644 --- a/fluent-syntax/test/fixtures_reference/select_expressions.ftl +++ b/fluent-syntax/test/fixtures_reference/select_expressions.ftl @@ -39,7 +39,7 @@ nested-select = } } -# ERROR VariantLists cannot appear in SelectExpressions +# ERROR VariantLists cannot be Variant values. nested-variant-list = { 1 -> *[one] { diff --git a/fluent-syntax/test/fixtures_reference/select_expressions.json b/fluent-syntax/test/fixtures_reference/select_expressions.json index 4e84c0c2a..a3dc5730a 100644 --- a/fluent-syntax/test/fixtures_reference/select_expressions.json +++ b/fluent-syntax/test/fixtures_reference/select_expressions.json @@ -274,7 +274,7 @@ }, { "type": "Comment", - "content": "ERROR VariantLists cannot appear in SelectExpressions" + "content": "ERROR VariantLists cannot be Variant values." 
}, { "type": "Junk", diff --git a/fluent-syntax/test/fixtures_reference/variant_lists.ftl b/fluent-syntax/test/fixtures_reference/variant_lists.ftl index d9031d91a..e5c61dd80 100644 --- a/fluent-syntax/test/fixtures_reference/variant_lists.ftl +++ b/fluent-syntax/test/fixtures_reference/variant_lists.ftl @@ -23,6 +23,7 @@ variant-list-in-message-attr = Value *[key] Value } +# ERROR VariantLists cannot be Variant values. -nested-variant-list-in-term = { *[one] { @@ -37,7 +38,7 @@ variant-list-in-message-attr = Value } } -# ERROR VariantLists may not appear in SelectExpressions +# ERROR VariantLists cannot be Variant values. nested-select-then-variant-list = { *[one] { 2 -> diff --git a/fluent-syntax/test/fixtures_reference/variant_lists.json b/fluent-syntax/test/fixtures_reference/variant_lists.json index aaac9f7b0..83f166621 100644 --- a/fluent-syntax/test/fixtures_reference/variant_lists.json +++ b/fluent-syntax/test/fixtures_reference/variant_lists.json @@ -94,48 +94,13 @@ "content": " .attr =\n {\n *[key] Value\n }\n\n" }, { - "type": "Term", - "id": { - "type": "Identifier", - "name": "nested-variant-list-in-term" - }, - "value": { - "type": "VariantList", - "variants": [ - { - "type": "Variant", - "key": { - "type": "Identifier", - "name": "one" - }, - "value": { - "type": "VariantList", - "variants": [ - { - "type": "Variant", - "key": { - "type": "Identifier", - "name": "two" - }, - "value": { - "type": "Pattern", - "elements": [ - { - "type": "TextElement", - "value": "Value" - } - ] - }, - "default": true - } - ] - }, - "default": true - } - ] - }, - "attributes": [], - "comment": null + "type": "Comment", + "content": "ERROR VariantLists cannot be Variant values." 
+ }, + { + "type": "Junk", + "annotations": [], + "content": "-nested-variant-list-in-term =\n {\n *[one] {\n *[two] Value\n }\n }\n\n" }, { "type": "Term", @@ -195,7 +160,7 @@ }, { "type": "Comment", - "content": "ERROR VariantLists may not appear in SelectExpressions" + "content": "ERROR VariantLists cannot be Variant values." }, { "type": "Junk", diff --git a/fluent/src/resource.js b/fluent/src/resource.js index cc9fe1d7b..ab55cb3a1 100644 --- a/fluent/src/resource.js +++ b/fluent/src/resource.js @@ -23,8 +23,8 @@ const RE_TEXT_RUN = /([^{}\n\r]+)/y; const RE_STRING_RUN = /([^\\"\n\r]*)/y; // Escape sequences. -const RE_UNICODE_ESCAPE = /\\u([a-fA-F0-9]{4})/y; const RE_STRING_ESCAPE = /\\([\\"])/y; +const RE_UNICODE_ESCAPE = /\\u([a-fA-F0-9]{4})|\\U([a-fA-F0-9]{6})/y; // Used for trimming TextElements and indents. const RE_LEADING_NEWLINES = /^\n+/; @@ -425,9 +425,13 @@ export default class FluentResource extends Map { // Unescape known escape sequences. function parseEscapeSequence() { + if (test(RE_STRING_ESCAPE)) { + return match1(RE_STRING_ESCAPE); + } + if (test(RE_UNICODE_ESCAPE)) { - let sequence = match1(RE_UNICODE_ESCAPE); - let codepoint = parseInt(sequence, 16); + let [, codepoint4, codepoint6] = match(RE_UNICODE_ESCAPE); + let codepoint = parseInt(codepoint4 || codepoint6, 16); return codepoint <= 0xD7FF || 0xE000 <= codepoint // It's a Unicode scalar value. ? 
String.fromCodePoint(codepoint) @@ -436,10 +440,6 @@ export default class FluentResource extends Map { : "�"; } - if (test(RE_STRING_ESCAPE)) { - return match1(RE_STRING_ESCAPE); - } - throw new FluentError("Unknown escape sequence"); } diff --git a/fluent/test/fixtures_reference/escaped_characters.json b/fluent/test/fixtures_reference/escaped_characters.json index b5046fdb7..374a093c7 100644 --- a/fluent/test/fixtures_reference/escaped_characters.json +++ b/fluent/test/fixtures_reference/escaped_characters.json @@ -18,12 +18,24 @@ "backslash-in-string": [ "\\" ], - "string-unicode-sequence": [ + "string-unicode-4digits": [ "A" ], - "string-escaped-unicode": [ + "escape-unicode-4digits": [ "\\u0041" ], + "string-unicode-6digits": [ + "😂" + ], + "escape-unicode-6digits": [ + "\\U01F602" + ], + "string-too-many-4digits": [ + "A00" + ], + "string-too-many-6digits": [ + "😂00" + ], "brace-open": [ "An opening ", "{",