Permalink
Browse files

unicode-base: add \u{n..} for matching astral chars; promote to RC

  • Loading branch information...
1 parent e542374 commit d94c44f53c8217337e0a57ad529ad99108a15279 @slevithan committed Apr 3, 2012
Showing with 107 additions and 54 deletions.
  1. +3 −0 README.md
  2. +52 −27 src/addons/unicode/unicode-base.js
  3. +52 −27 xregexp-all.js
View
@@ -87,6 +87,9 @@ var unicodeWord = XRegExp('^\\p{L}+$');
unicodeWord.test('Русский'); // -> true
unicodeWord.test('日本語'); // -> true
unicodeWord.test('العربية'); // -> true
+
+// Match characters beyond the Basic Multilingual Plane
+XRegExp('\\u{29DF6}').test('𩷶'); // -> true
~~~
The base script adds `\p{Letter}` and its alias `\p{L}`, but other Unicode categories, scripts, blocks, and properties require addon packages. Try these next examples after additionally including `unicode-scripts.js`:
@@ -1,5 +1,5 @@
/*!
- * XRegExp Unicode Base v1.0.0-dev, 2012-03-31
+ * XRegExp Unicode Base v1.0.0-rc, 2012-04-03
* (c) 2008-2012 Steven Levithan <http://xregexp.com/>
* MIT License
* Uses Unicode 6.1 <http://unicode.org/>
@@ -32,7 +32,7 @@
}
// Adds leading zeros if shorter than four characters
- function pad(str) {
+ function pad4(str) {
while (str.length < 4) {
str = "0" + str;
}
@@ -57,15 +57,15 @@
XRegExp.forEach(range, /\\u(\w{4})(?:-\\u(\w{4}))?/, function (m) {
start = dec(m[1]);
if (start > (lastEnd + 1)) {
- output.push("\\u" + pad(hex(lastEnd + 1)));
+ output.push("\\u" + pad4(hex(lastEnd + 1)));
if (start > (lastEnd + 2)) {
- output.push("-\\u" + pad(hex(start - 1)));
+ output.push("-\\u" + pad4(hex(start - 1)));
}
}
lastEnd = dec(m[2] || m[1]);
});
if (lastEnd < 65535) {
- output.push("\\u" + pad(hex(lastEnd + 1)));
+ output.push("\\u" + pad4(hex(lastEnd + 1)));
if (lastEnd < 65534) {
output.push("-\\uFFFF");
}
@@ -84,28 +84,8 @@
XRegExp.install("extensibility");
-// Adds Unicode token syntax to XRegExp: \p{..} or \P{..}
- XRegExp.addToken(
- /\\([pP]){(\^?)([^}]*)}/,
- function (match, scope) {
- var inv = (match[1] === "P" || match[2]) ? "^" : "",
- item = slug(match[3]);
- // \p{..}, \P{..}, and \p{^..} are valid, but the double negative \P{^..} isn't
- if (match[1] === "P" && match[2]) {
- throw new SyntaxError("erroneous characters: " + match[0]);
- }
- if (!unicode.hasOwnProperty(item)) {
- throw new SyntaxError("invalid or unsupported Unicode item: " + match[0]);
- }
- return scope === "class" ?
- (inv ? cacheInversion(item) : unicode[item]) :
- "[" + inv + unicode[item] + "]";
- },
- {scope: "all"}
- );
-
/**
- * Adds to the list of XRegExp Unicode tokens that can be used via \p{..} or \P{..}.
+ * Adds to the list of Unicode properties that XRegExp regexes can match via \p{..} or \P{..}.
* @memberOf XRegExp
* @param {Object} pack Named sets of Unicode code points and ranges.
* @param {Object} [aliases] Aliases for the primary token names.
@@ -138,12 +118,57 @@
}
};
-// Adds Unicode Letter category data (use addon packages for other categories, scripts, and blocks)
+/* Adds data for the Unicode `Letter` category. Addon packages include other categories, scripts,
+ * blocks, and properties.
+ */
XRegExp.addUnicodePackage({
L: "0041-005A0061-007A00AA00B500BA00C0-00D600D8-00F600F8-02C102C6-02D102E0-02E402EC02EE0370-037403760377037A-037D03860388-038A038C038E-03A103A3-03F503F7-0481048A-05270531-055605590561-058705D0-05EA05F0-05F20620-064A066E066F0671-06D306D506E506E606EE06EF06FA-06FC06FF07100712-072F074D-07A507B107CA-07EA07F407F507FA0800-0815081A082408280840-085808A008A2-08AC0904-0939093D09500958-09610971-09770979-097F0985-098C098F09900993-09A809AA-09B009B209B6-09B909BD09CE09DC09DD09DF-09E109F009F10A05-0A0A0A0F0A100A13-0A280A2A-0A300A320A330A350A360A380A390A59-0A5C0A5E0A72-0A740A85-0A8D0A8F-0A910A93-0AA80AAA-0AB00AB20AB30AB5-0AB90ABD0AD00AE00AE10B05-0B0C0B0F0B100B13-0B280B2A-0B300B320B330B35-0B390B3D0B5C0B5D0B5F-0B610B710B830B85-0B8A0B8E-0B900B92-0B950B990B9A0B9C0B9E0B9F0BA30BA40BA8-0BAA0BAE-0BB90BD00C05-0C0C0C0E-0C100C12-0C280C2A-0C330C35-0C390C3D0C580C590C600C610C85-0C8C0C8E-0C900C92-0CA80CAA-0CB30CB5-0CB90CBD0CDE0CE00CE10CF10CF20D05-0D0C0D0E-0D100D12-0D3A0D3D0D4E0D600D610D7A-0D7F0D85-0D960D9A-0DB10DB3-0DBB0DBD0DC0-0DC60E01-0E300E320E330E40-0E460E810E820E840E870E880E8A0E8D0E94-0E970E99-0E9F0EA1-0EA30EA50EA70EAA0EAB0EAD-0EB00EB20EB30EBD0EC0-0EC40EC60EDC-0EDF0F000F40-0F470F49-0F6C0F88-0F8C1000-102A103F1050-1055105A-105D106110651066106E-10701075-1081108E10A0-10C510C710CD10D0-10FA10FC-1248124A-124D1250-12561258125A-125D1260-1288128A-128D1290-12B012B2-12B512B8-12BE12C012C2-12C512C8-12D612D8-13101312-13151318-135A1380-138F13A0-13F41401-166C166F-167F1681-169A16A0-16EA1700-170C170E-17111720-17311740-17511760-176C176E-17701780-17B317D717DC1820-18771880-18A818AA18B0-18F51900-191C1950-196D1970-19741980-19AB19C1-19C71A00-1A161A20-1A541AA71B05-1B331B45-1B4B1B83-1BA01BAE1BAF1BBA-1BE51C00-1C231C4D-1C4F1C5A-1C7D1CE9-1CEC1CEE-1CF11CF51CF61D00-1DBF1E00-1F151F18-1F1D1F20-1F451F48-1F4D1F50-1F571F591F5B1F5D1F5F-1F7D1F80-1FB41FB6-1FBC1FBE1FC2-1FC41FC6-1FCC1FD0-1FD31FD6-1FDB1FE0-1FEC1FF2-1FF41FF6-1FFC2071207F2090-209C21022107210A-211321152119-211D212421262128212A-212D212F-2139213C-213F2145-2149214E218321842C0
0-2C2E2C30-2C5E2C60-2CE42CEB-2CEE2CF22CF32D00-2D252D272D2D2D30-2D672D6F2D80-2D962DA0-2DA62DA8-2DAE2DB0-2DB62DB8-2DBE2DC0-2DC62DC8-2DCE2DD0-2DD62DD8-2DDE2E2F300530063031-3035303B303C3041-3096309D-309F30A1-30FA30FC-30FF3105-312D3131-318E31A0-31BA31F0-31FF3400-4DB54E00-9FCCA000-A48CA4D0-A4FDA500-A60CA610-A61FA62AA62BA640-A66EA67F-A697A6A0-A6E5A717-A71FA722-A788A78B-A78EA790-A793A7A0-A7AAA7F8-A801A803-A805A807-A80AA80C-A822A840-A873A882-A8B3A8F2-A8F7A8FBA90A-A925A930-A946A960-A97CA984-A9B2A9CFAA00-AA28AA40-AA42AA44-AA4BAA60-AA76AA7AAA80-AAAFAAB1AAB5AAB6AAB9-AABDAAC0AAC2AADB-AADDAAE0-AAEAAAF2-AAF4AB01-AB06AB09-AB0EAB11-AB16AB20-AB26AB28-AB2EABC0-ABE2AC00-D7A3D7B0-D7C6D7CB-D7FBF900-FA6DFA70-FAD9FB00-FB06FB13-FB17FB1DFB1F-FB28FB2A-FB36FB38-FB3CFB3EFB40FB41FB43FB44FB46-FBB1FBD3-FD3DFD50-FD8FFD92-FDC7FDF0-FDFBFE70-FE74FE76-FEFCFF21-FF3AFF41-FF5AFF66-FFBEFFC2-FFC7FFCA-FFCFFFD2-FFD7FFDA-FFDC"
}, {
L: "Letter"
});
+/* Adds Unicode property syntax to XRegExp: \p{..}, \P{..}, \p{^..}
+ * The handler receives the token match (match[1] is `p` or `P`, match[2] is the optional `^`,
+ * match[3] is the property name) and the scope in which the token appeared. Inside a character
+ * class it returns the bare range data, or the result of `cacheInversion` when negated;
+ * elsewhere it wraps the range data in its own, optionally negated, character class.
+ * Throws a SyntaxError for the double negative \P{^..} and for names missing from `unicode`.
+ * NOTE(review): `slug`, `unicode`, and `cacheInversion` are defined outside this hunk; `slug`
+ * presumably normalizes the property name before lookup -- confirm.
+ */
+ XRegExp.addToken(
+ /\\([pP]){(\^?)([^}]*)}/,
+ function (match, scope) {
+ var inv = (match[1] === "P" || match[2]) ? "^" : "",
+ item = slug(match[3]);
+ // Reject the double negative \P{^..}: both `P` and `^` request inversion
+ if (match[1] === "P" && match[2]) {
+ throw new SyntaxError("erroneous characters: " + match[0]);
+ }
+ if (!unicode.hasOwnProperty(item)) {
+ throw new SyntaxError("invalid or unsupported Unicode property: " + match[0]);
+ }
+ return scope === "class" ?
+ (inv ? cacheInversion(item) : unicode[item]) :
+ "[" + inv + unicode[item] + "]";
+ },
+ {scope: "all"}
+ );
+
+/* Adds Unicode code point syntax to XRegExp: \u{n..}
+ * `n..` is a hexadecimal code point of 1-6 digits in the range 0-10FFFF. Code points above FFFF
+ * are converted to surrogate pairs.
+ * Throws a RangeError when the value exceeds 10FFFF (six hex digits can encode up to FFFFFF).
+ * NOTE(review): the surrogate pair is returned as two literal UTF-16 code units rather than as
+ * escapes, and the token is registered for all scopes; presumably a following quantifier, or use
+ * inside a character class, then acts on a single code unit instead of the pair -- confirm.
+ * `dec`, `hex`, and `pad4` are helpers defined outside this hunk.
+ */
+ XRegExp.addToken(
+ /\\u{([0-9A-Fa-f]{1,6})}/,
+ function (match) {
+ var code = dec(match[1]), offset;
+ if (code > 0x10FFFF) {
+ throw new RangeError("invalid Unicode code point: " + match[0]);
+ }
+ // Emitting \uNNNN for code points up to FFFF means the character never has to be escaped
+ // or kept separate from preceding tokens
+ if (code <= 0xFFFF) {
+ return "\\u" + pad4(hex(code));
+ }
+ offset = code - 0x10000;
+ return String.fromCharCode(0xD800 + (offset >> 10), 0xDC00 + (offset & 0x3FF)); // High surrogate, then low surrogate
+ },
+ {scope: "all"}
+ );
+
}(XRegExp));
View
@@ -1195,7 +1195,7 @@ XRegExp = XRegExp || (function (undef) {
/***** unicode-base.js *****/
/*!
- * XRegExp Unicode Base v1.0.0-dev, 2012-03-31
+ * XRegExp Unicode Base v1.0.0-rc, 2012-04-03
* (c) 2008-2012 Steven Levithan <http://xregexp.com/>
* MIT License
* Uses Unicode 6.1 <http://unicode.org/>
@@ -1228,7 +1228,7 @@ XRegExp = XRegExp || (function (undef) {
}
// Adds leading zeros if shorter than four characters
- function pad(str) {
+ function pad4(str) {
while (str.length < 4) {
str = "0" + str;
}
@@ -1253,15 +1253,15 @@ XRegExp = XRegExp || (function (undef) {
XRegExp.forEach(range, /\\u(\w{4})(?:-\\u(\w{4}))?/, function (m) {
start = dec(m[1]);
if (start > (lastEnd + 1)) {
- output.push("\\u" + pad(hex(lastEnd + 1)));
+ output.push("\\u" + pad4(hex(lastEnd + 1)));
if (start > (lastEnd + 2)) {
- output.push("-\\u" + pad(hex(start - 1)));
+ output.push("-\\u" + pad4(hex(start - 1)));
}
}
lastEnd = dec(m[2] || m[1]);
});
if (lastEnd < 65535) {
- output.push("\\u" + pad(hex(lastEnd + 1)));
+ output.push("\\u" + pad4(hex(lastEnd + 1)));
if (lastEnd < 65534) {
output.push("-\\uFFFF");
}
@@ -1280,28 +1280,8 @@ XRegExp = XRegExp || (function (undef) {
XRegExp.install("extensibility");
-// Adds Unicode token syntax to XRegExp: \p{..} or \P{..}
- XRegExp.addToken(
- /\\([pP]){(\^?)([^}]*)}/,
- function (match, scope) {
- var inv = (match[1] === "P" || match[2]) ? "^" : "",
- item = slug(match[3]);
- // \p{..}, \P{..}, and \p{^..} are valid, but the double negative \P{^..} isn't
- if (match[1] === "P" && match[2]) {
- throw new SyntaxError("erroneous characters: " + match[0]);
- }
- if (!unicode.hasOwnProperty(item)) {
- throw new SyntaxError("invalid or unsupported Unicode item: " + match[0]);
- }
- return scope === "class" ?
- (inv ? cacheInversion(item) : unicode[item]) :
- "[" + inv + unicode[item] + "]";
- },
- {scope: "all"}
- );
-
/**
- * Adds to the list of XRegExp Unicode tokens that can be used via \p{..} or \P{..}.
+ * Adds to the list of Unicode properties that XRegExp regexes can match via \p{..} or \P{..}.
* @memberOf XRegExp
* @param {Object} pack Named sets of Unicode code points and ranges.
* @param {Object} [aliases] Aliases for the primary token names.
@@ -1334,13 +1314,58 @@ XRegExp = XRegExp || (function (undef) {
}
};
-// Adds Unicode Letter category data (use addon packages for other categories, scripts, and blocks)
+/* Adds data for the Unicode `Letter` category. Addon packages include other categories, scripts,
+ * blocks, and properties.
+ */
XRegExp.addUnicodePackage({
L: "0041-005A0061-007A00AA00B500BA00C0-00D600D8-00F600F8-02C102C6-02D102E0-02E402EC02EE0370-037403760377037A-037D03860388-038A038C038E-03A103A3-03F503F7-0481048A-05270531-055605590561-058705D0-05EA05F0-05F20620-064A066E066F0671-06D306D506E506E606EE06EF06FA-06FC06FF07100712-072F074D-07A507B107CA-07EA07F407F507FA0800-0815081A082408280840-085808A008A2-08AC0904-0939093D09500958-09610971-09770979-097F0985-098C098F09900993-09A809AA-09B009B209B6-09B909BD09CE09DC09DD09DF-09E109F009F10A05-0A0A0A0F0A100A13-0A280A2A-0A300A320A330A350A360A380A390A59-0A5C0A5E0A72-0A740A85-0A8D0A8F-0A910A93-0AA80AAA-0AB00AB20AB30AB5-0AB90ABD0AD00AE00AE10B05-0B0C0B0F0B100B13-0B280B2A-0B300B320B330B35-0B390B3D0B5C0B5D0B5F-0B610B710B830B85-0B8A0B8E-0B900B92-0B950B990B9A0B9C0B9E0B9F0BA30BA40BA8-0BAA0BAE-0BB90BD00C05-0C0C0C0E-0C100C12-0C280C2A-0C330C35-0C390C3D0C580C590C600C610C85-0C8C0C8E-0C900C92-0CA80CAA-0CB30CB5-0CB90CBD0CDE0CE00CE10CF10CF20D05-0D0C0D0E-0D100D12-0D3A0D3D0D4E0D600D610D7A-0D7F0D85-0D960D9A-0DB10DB3-0DBB0DBD0DC0-0DC60E01-0E300E320E330E40-0E460E810E820E840E870E880E8A0E8D0E94-0E970E99-0E9F0EA1-0EA30EA50EA70EAA0EAB0EAD-0EB00EB20EB30EBD0EC0-0EC40EC60EDC-0EDF0F000F40-0F470F49-0F6C0F88-0F8C1000-102A103F1050-1055105A-105D106110651066106E-10701075-1081108E10A0-10C510C710CD10D0-10FA10FC-1248124A-124D1250-12561258125A-125D1260-1288128A-128D1290-12B012B2-12B512B8-12BE12C012C2-12C512C8-12D612D8-13101312-13151318-135A1380-138F13A0-13F41401-166C166F-167F1681-169A16A0-16EA1700-170C170E-17111720-17311740-17511760-176C176E-17701780-17B317D717DC1820-18771880-18A818AA18B0-18F51900-191C1950-196D1970-19741980-19AB19C1-19C71A00-1A161A20-1A541AA71B05-1B331B45-1B4B1B83-1BA01BAE1BAF1BBA-1BE51C00-1C231C4D-1C4F1C5A-1C7D1CE9-1CEC1CEE-1CF11CF51CF61D00-1DBF1E00-1F151F18-1F1D1F20-1F451F48-1F4D1F50-1F571F591F5B1F5D1F5F-1F7D1F80-1FB41FB6-1FBC1FBE1FC2-1FC41FC6-1FCC1FD0-1FD31FD6-1FDB1FE0-1FEC1FF2-1FF41FF6-1FFC2071207F2090-209C21022107210A-211321152119-211D212421262128212A-212D212F-2139213C-213F2145-2149214E218321842C0
0-2C2E2C30-2C5E2C60-2CE42CEB-2CEE2CF22CF32D00-2D252D272D2D2D30-2D672D6F2D80-2D962DA0-2DA62DA8-2DAE2DB0-2DB62DB8-2DBE2DC0-2DC62DC8-2DCE2DD0-2DD62DD8-2DDE2E2F300530063031-3035303B303C3041-3096309D-309F30A1-30FA30FC-30FF3105-312D3131-318E31A0-31BA31F0-31FF3400-4DB54E00-9FCCA000-A48CA4D0-A4FDA500-A60CA610-A61FA62AA62BA640-A66EA67F-A697A6A0-A6E5A717-A71FA722-A788A78B-A78EA790-A793A7A0-A7AAA7F8-A801A803-A805A807-A80AA80C-A822A840-A873A882-A8B3A8F2-A8F7A8FBA90A-A925A930-A946A960-A97CA984-A9B2A9CFAA00-AA28AA40-AA42AA44-AA4BAA60-AA76AA7AAA80-AAAFAAB1AAB5AAB6AAB9-AABDAAC0AAC2AADB-AADDAAE0-AAEAAAF2-AAF4AB01-AB06AB09-AB0EAB11-AB16AB20-AB26AB28-AB2EABC0-ABE2AC00-D7A3D7B0-D7C6D7CB-D7FBF900-FA6DFA70-FAD9FB00-FB06FB13-FB17FB1DFB1F-FB28FB2A-FB36FB38-FB3CFB3EFB40FB41FB43FB44FB46-FBB1FBD3-FD3DFD50-FD8FFD92-FDC7FDF0-FDFBFE70-FE74FE76-FEFCFF21-FF3AFF41-FF5AFF66-FFBEFFC2-FFC7FFCA-FFCFFFD2-FFD7FFDA-FFDC"
}, {
L: "Letter"
});
+/* Adds Unicode property syntax to XRegExp: \p{..}, \P{..}, \p{^..}
+ * The handler receives the token match (match[1] is `p` or `P`, match[2] is the optional `^`,
+ * match[3] is the property name) and the scope in which the token appeared. Inside a character
+ * class it returns the bare range data, or the result of `cacheInversion` when negated;
+ * elsewhere it wraps the range data in its own, optionally negated, character class.
+ * Throws a SyntaxError for the double negative \P{^..} and for names missing from `unicode`.
+ * NOTE(review): `slug`, `unicode`, and `cacheInversion` are defined outside this hunk; `slug`
+ * presumably normalizes the property name before lookup -- confirm.
+ */
+ XRegExp.addToken(
+ /\\([pP]){(\^?)([^}]*)}/,
+ function (match, scope) {
+ var inv = (match[1] === "P" || match[2]) ? "^" : "",
+ item = slug(match[3]);
+ // Reject the double negative \P{^..}: both `P` and `^` request inversion
+ if (match[1] === "P" && match[2]) {
+ throw new SyntaxError("erroneous characters: " + match[0]);
+ }
+ if (!unicode.hasOwnProperty(item)) {
+ throw new SyntaxError("invalid or unsupported Unicode property: " + match[0]);
+ }
+ return scope === "class" ?
+ (inv ? cacheInversion(item) : unicode[item]) :
+ "[" + inv + unicode[item] + "]";
+ },
+ {scope: "all"}
+ );
+
+/* Adds Unicode code point syntax to XRegExp: \u{n..}
+ * `n..` is a hexadecimal code point of 1-6 digits in the range 0-10FFFF. Code points above FFFF
+ * are converted to surrogate pairs.
+ * Throws a RangeError when the value exceeds 10FFFF (six hex digits can encode up to FFFFFF).
+ * NOTE(review): the surrogate pair is returned as two literal UTF-16 code units rather than as
+ * escapes, and the token is registered for all scopes; presumably a following quantifier, or use
+ * inside a character class, then acts on a single code unit instead of the pair -- confirm.
+ * `dec`, `hex`, and `pad4` are helpers defined outside this hunk.
+ */
+ XRegExp.addToken(
+ /\\u{([0-9A-Fa-f]{1,6})}/,
+ function (match) {
+ var code = dec(match[1]), offset;
+ if (code > 0x10FFFF) {
+ throw new RangeError("invalid Unicode code point: " + match[0]);
+ }
+ // Emitting \uNNNN for code points up to FFFF means the character never has to be escaped
+ // or kept separate from preceding tokens
+ if (code <= 0xFFFF) {
+ return "\\u" + pad4(hex(code));
+ }
+ offset = code - 0x10000;
+ return String.fromCharCode(0xD800 + (offset >> 10), 0xDC00 + (offset & 0x3FF)); // High surrogate, then low surrogate
+ },
+ {scope: "all"}
+ );
+
}(XRegExp));

0 comments on commit d94c44f

Please sign in to comment.