-
-
Notifications
You must be signed in to change notification settings - Fork 278
/
unicode-base.js
235 lines (212 loc) · 14.2 KB
/
unicode-base.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
/*!
* XRegExp Unicode Base 3.0.0-pre
* <http://xregexp.com/>
* Steven Levithan © 2008-2012 MIT License
* Uses Unicode 6.2.0 <http://unicode.org/>
* Unicode data generated by Mathias Bynens <http://mathiasbynens.be/>
*/
/**
* Adds support for the `\p{L}` or `\p{Letter}` Unicode category. Addon packages for other Unicode
* categories, scripts, blocks, and properties are available separately. All Unicode tokens can be
* inverted using `\P{..}` or `\p{^..}`. Token names are case insensitive, and any spaces, hyphens,
* and underscores are ignored.
* @requires XRegExp
*/
(function(XRegExp) {
'use strict';
// Storage for Unicode data
var unicode = {};
/*--------------------------------------
* Private functions
*------------------------------------*/
// Generates a token lookup name: lowercase, with hyphens, spaces, and underscores removed
function normalize(name) {
return name.replace(/[- _]+/g, '').toLowerCase();
}
// Adds leading zeros if shorter than four characters
function pad4(str) {
while (str.length < 4) {
str = '0' + str;
}
return str;
}
// Converts a hexadecimal number to decimal
function dec(hex) {
return parseInt(hex, 16);
}
// Converts a decimal number to hexadecimal
function hex(dec) {
return parseInt(dec, 10).toString(16);
}
// Gets the decimal code of a literal code unit, \xHH, \uHHHH, or a backslash-escaped literal
function charCode(chr) {
var esc = /^\\[xu](.+)/.exec(chr);
return esc ?
dec(esc[1]) :
chr.charCodeAt(chr.charAt(0) === '\\' ? 1 : 0);
}
// Inverts a list of ordered BMP characters and ranges
function invertBmp(range) {
var output = '',
lastEnd = -1,
start;
XRegExp.forEach(range, /(\\x..|\\u....|\\?[\s\S])(?:-(\\x..|\\u....|\\?[\s\S]))?/, function(m) {
start = charCode(m[1]);
if (start > (lastEnd + 1)) {
output += '\\u' + pad4(hex(lastEnd + 1));
if (start > (lastEnd + 2)) {
output += '-\\u' + pad4(hex(start - 1));
}
}
lastEnd = charCode(m[2] || m[1]);
});
if (lastEnd < 0xFFFF) {
output += '\\u' + pad4(hex(lastEnd + 1));
if (lastEnd < 0xFFFE) {
output += '-\\uFFFF';
}
}
return output;
}
// Generates an inverted BMP range on first use
function cacheInvertedBmp(slug) {
var prop = 'b!';
return unicode[slug][prop] || (
unicode[slug][prop] = invertBmp(unicode[slug].bmp)
);
}
// Combines and optionally negates BMP and astral data
function buildAstral(slug, isNegated) {
var item = unicode[slug],
combined = '';
if (item.bmp && !item.isBmpLast) {
combined = '[' + item.bmp + ']' + (item.astral ? '|' : '');
}
if (item.astral) {
combined += item.astral;
}
if (item.isBmpLast && item.bmp) {
combined += (item.astral ? '|' : '') + '[' + item.bmp + ']';
}
// Astral Unicode tokens always match a code point, never a code unit
return isNegated ?
'(?:(?!' + combined + ')(?:[\uD800-\uDBFF][\uDC00-\uDFFF]|[\0-\uFFFF]))' :
'(?:' + combined + ')';
}
// Builds a complete astral pattern on first use
function cacheAstral(slug, isNegated) {
var prop = isNegated ? 'a!' : 'a=';
return unicode[slug][prop] || (
unicode[slug][prop] = buildAstral(slug, isNegated)
);
}
/*--------------------------------------
* Core functionality
*------------------------------------*/
/* Add Unicode token syntax: \p{..}, \P{..}, \p{^..}. Also add astral mode (flag A).
*/
XRegExp.addToken(
// Use `*` instead of `+` to avoid capturing `^` as the token name in `\p{^}`
/\\([pP])(?:{(\^?)([^}]*)}|([A-Za-z]))/,
function(match, scope, flags) {
var ERR_DOUBLE_NEG = 'Invalid double negation ',
ERR_UNKNOWN_NAME = 'Unknown Unicode token ',
ERR_UNKNOWN_REF = 'Unicode token missing data ',
ERR_ASTRAL_ONLY = 'Astral mode required for Unicode token ',
ERR_ASTRAL_IN_CLASS = 'Astral mode does not support Unicode tokens within character classes',
// Negated via \P{..} or \p{^..}
isNegated = match[1] === 'P' || !!match[2],
// Switch from BMP (U+FFFF) to astral (U+10FFFF) mode via flag A or implicit opt-in
isAstralMode = flags.indexOf('A') > -1 || XRegExp.isInstalled('astral'),
// Token lookup name. Check `[4]` first to avoid passing `undefined` via `\p{}`
slug = normalize(match[4] || match[3]),
// Token data object
item = unicode[slug];
if (match[1] === 'P' && match[2]) {
throw new SyntaxError(ERR_DOUBLE_NEG + match[0]);
}
if (!unicode.hasOwnProperty(slug)) {
throw new SyntaxError(ERR_UNKNOWN_NAME + match[0]);
}
// Switch to the negated form of the referenced Unicode token
if (item.inverseOf) {
slug = normalize(item.inverseOf);
if (!unicode.hasOwnProperty(slug)) {
throw new ReferenceError(ERR_UNKNOWN_REF + match[0] + ' -> ' + item.inverseOf);
}
item = unicode[slug];
isNegated = !isNegated;
}
if (!(item.bmp || isAstralMode)) {
throw new SyntaxError(ERR_ASTRAL_ONLY + match[0]);
}
if (isAstralMode) {
if (scope === 'class') {
throw new SyntaxError(ERR_ASTRAL_IN_CLASS);
}
return cacheAstral(slug, isNegated);
}
return scope === 'class' ?
(isNegated ? cacheInvertedBmp(slug) : item.bmp) :
(isNegated ? '[^' : '[') + item.bmp + ']';
},
{
scope: 'all',
optionalFlags: 'A'
}
);
/**
* Adds to the list of Unicode tokens that XRegExp regexes can match via `\p` or `\P`.
* @memberOf XRegExp
* @param {Array} data Objects with named character ranges. Each object may have properties `name`,
* `alias`, `isBmpLast`, `inverseOf`, `bmp`, and `astral`. All but `name` are optional, although
* one of `bmp` or `astral` is required (unless `inverseOf` is set). If `astral` is absent, the
* `bmp` data is used for BMP and astral modes. If `bmp` is absent, the name errors in BMP mode
* but works in astral mode. If both `bmp` and `astral` are provided, the `bmp` data only is used
* in BMP mode, and the combination of `bmp` and `astral` data is used in astral mode.
* `isBmpLast` is needed when a token matches orphan high surrogates *and* uses surrogate pairs
* to match astral code points. The `bmp` and `astral` data should be a combination of literal
* characters and `\xHH` or `\uHHHH` escape sequences, with hyphens to create ranges. Any regex
* metacharacters in the data should be escaped, apart from range-creating hyphens. The `astral`
* data can additionally use character classes and alternation, and should use surrogate pairs to
* represent astral code points. `inverseOf` can be used to avoid duplicating character data if a
* Unicode token is defined as the exact inverse of another token.
* @example
*
* // Basic use
* XRegExp.addUnicodeData([{
* name: 'XDigit',
* alias: 'Hexadecimal',
* bmp: '0-9A-Fa-f'
* }]);
* XRegExp('\\p{XDigit}:\\p{Hexadecimal}+').test('0:3D'); // -> true
*/
XRegExp.addUnicodeData = function(data) {
var ERR_NO_NAME = 'Unicode token requires name',
ERR_NO_DATA = 'Unicode token has no character data ',
item,
i;
for (i = 0; i < data.length; ++i) {
item = data[i];
if (!item.name) {
throw new Error(ERR_NO_NAME);
}
if (!(item.inverseOf || item.bmp || item.astral)) {
throw new Error(ERR_NO_DATA + item.name);
}
unicode[normalize(item.name)] = item;
if (item.alias) {
unicode[normalize(item.alias)] = item;
}
}
};
/* Add data for the Unicode `L` or `Letter` category. Separate addons are available that add other
* categories, scripts, blocks, and properties.
*/
XRegExp.addUnicodeData([{
name: 'L',
alias: 'Letter',
bmp: 'A-Za-z\xAA\xB5\xBA\xC0-\xD6\xD8-\xF6\xF8-\u02C1\u02C6-\u02D1\u02E0-\u02E4\u02EC\u02EE\u0370-\u0374\u0376\u0377\u037A-\u037D\u0386\u0388-\u038A\u038C\u038E-\u03A1\u03A3-\u03F5\u03F7-\u0481\u048A-\u0527\u0531-\u0556\u0559\u0561-\u0587\u05D0-\u05EA\u05F0-\u05F2\u0620-\u064A\u066E\u066F\u0671-\u06D3\u06D5\u06E5\u06E6\u06EE\u06EF\u06FA-\u06FC\u06FF\u0710\u0712-\u072F\u074D-\u07A5\u07B1\u07CA-\u07EA\u07F4\u07F5\u07FA\u0800-\u0815\u081A\u0824\u0828\u0840-\u0858\u08A0\u08A2-\u08AC\u0904-\u0939\u093D\u0950\u0958-\u0961\u0971-\u0977\u0979-\u097F\u0985-\u098C\u098F\u0990\u0993-\u09A8\u09AA-\u09B0\u09B2\u09B6-\u09B9\u09BD\u09CE\u09DC\u09DD\u09DF-\u09E1\u09F0\u09F1\u0A05-\u0A0A\u0A0F\u0A10\u0A13-\u0A28\u0A2A-\u0A30\u0A32\u0A33\u0A35\u0A36\u0A38\u0A39\u0A59-\u0A5C\u0A5E\u0A72-\u0A74\u0A85-\u0A8D\u0A8F-\u0A91\u0A93-\u0AA8\u0AAA-\u0AB0\u0AB2\u0AB3\u0AB5-\u0AB9\u0ABD\u0AD0\u0AE0\u0AE1\u0B05-\u0B0C\u0B0F\u0B10\u0B13-\u0B28\u0B2A-\u0B30\u0B32\u0B33\u0B35-\u0B39\u0B3D\u0B5C\u0B5D\u0B5F-\u0B61\u0B71\u0B83\u0B85-\u0B8A\u0B8E-\u0B90\u0B92-\u0B95\u0B99\u0B9A\u0B9C\u0B9E\u0B9F\u0BA3\u0BA4\u0BA8-\u0BAA\u0BAE-\u0BB9\u0BD0\u0C05-\u0C0C\u0C0E-\u0C10\u0C12-\u0C28\u0C2A-\u0C33\u0C35-\u0C39\u0C3D\u0C58\u0C59\u0C60\u0C61\u0C85-\u0C8C\u0C8E-\u0C90\u0C92-\u0CA8\u0CAA-\u0CB3\u0CB5-\u0CB9\u0CBD\u0CDE\u0CE0\u0CE1\u0CF1\u0CF2\u0D05-\u0D0C\u0D0E-\u0D10\u0D12-\u0D3A\u0D3D\u0D4E\u0D60\u0D61\u0D7A-\u0D7F\u0D85-\u0D96\u0D9A-\u0DB1\u0DB3-\u0DBB\u0DBD\u0DC0-\u0DC6\u0E01-\u0E30\u0E32\u0E33\u0E40-\u0E46\u0E81\u0E82\u0E84\u0E87\u0E88\u0E8A\u0E8D\u0E94-\u0E97\u0E99-\u0E9F\u0EA1-\u0EA3\u0EA5\u0EA7\u0EAA\u0EAB\u0EAD-\u0EB0\u0EB2\u0EB3\u0EBD\u0EC0-\u0EC4\u0EC6\u0EDC-\u0EDF\u0F00\u0F40-\u0F47\u0F49-\u0F6C\u0F88-\u0F8C\u1000-\u102A\u103F\u1050-\u1055\u105A-\u105D\u1061\u1065\u1066\u106E-\u1070\u1075-\u1081\u108E\u10A0-\u10C5\u10C7\u10CD\u10D0-\u10FA\u10FC-\u1248\u124A-\u124D\u1250-\u1256\u1258\u125A-\u125D\u1260-\u1288\u128A-\u128D\u1290-\u12B0\u12B2-\u12B5\u12B8-\u12BE\u12C0\u12C2-\u12C5\u12C8-\u12D6\u12D8-\u1310\u1312-\u1315\u1318-\u135A\u1380-\u138F\u13A0-\u13F4\u1401-\u166C\u166F-\u167F\u1681-\u169A\u16A0-\u16EA\u1700-\u170C\u170E-\u1711\u1720-\u1731\u1740-\u1751\u1760-\u176C\u176E-\u1770\u1780-\u17B3\u17D7\u17DC\u1820-\u1877\u1880-\u18A8\u18AA\u18B0-\u18F5\u1900-\u191C\u1950-\u196D\u1970-\u1974\u1980-\u19AB\u19C1-\u19C7\u1A00-\u1A16\u1A20-\u1A54\u1AA7\u1B05-\u1B33\u1B45-\u1B4B\u1B83-\u1BA0\u1BAE\u1BAF\u1BBA-\u1BE5\u1C00-\u1C23\u1C4D-\u1C4F\u1C5A-\u1C7D\u1CE9-\u1CEC\u1CEE-\u1CF1\u1CF5\u1CF6\u1D00-\u1DBF\u1E00-\u1F15\u1F18-\u1F1D\u1F20-\u1F45\u1F48-\u1F4D\u1F50-\u1F57\u1F59\u1F5B\u1F5D\u1F5F-\u1F7D\u1F80-\u1FB4\u1FB6-\u1FBC\u1FBE\u1FC2-\u1FC4\u1FC6-\u1FCC\u1FD0-\u1FD3\u1FD6-\u1FDB\u1FE0-\u1FEC\u1FF2-\u1FF4\u1FF6-\u1FFC\u2071\u207F\u2090-\u209C\u2102\u2107\u210A-\u2113\u2115\u2119-\u211D\u2124\u2126\u2128\u212A-\u212D\u212F-\u2139\u213C-\u213F\u2145-\u2149\u214E\u2183\u2184\u2C00-\u2C2E\u2C30-\u2C5E\u2C60-\u2CE4\u2CEB-\u2CEE\u2CF2\u2CF3\u2D00-\u2D25\u2D27\u2D2D\u2D30-\u2D67\u2D6F\u2D80-\u2D96\u2DA0-\u2DA6\u2DA8-\u2DAE\u2DB0-\u2DB6\u2DB8-\u2DBE\u2DC0-\u2DC6\u2DC8-\u2DCE\u2DD0-\u2DD6\u2DD8-\u2DDE\u2E2F\u3005\u3006\u3031-\u3035\u303B\u303C\u3041-\u3096\u309D-\u309F\u30A1-\u30FA\u30FC-\u30FF\u3105-\u312D\u3131-\u318E\u31A0-\u31BA\u31F0-\u31FF\u3400-\u4DB5\u4E00-\u9FCC\uA000-\uA48C\uA4D0-\uA4FD\uA500-\uA60C\uA610-\uA61F\uA62A\uA62B\uA640-\uA66E\uA67F-\uA697\uA6A0-\uA6E5\uA717-\uA71F\uA722-\uA788\uA78B-\uA78E\uA790-\uA793\uA7A0-\uA7AA\uA7F8-\uA801\uA803-\uA805\uA807-\uA80A\uA80C-\uA822\uA840-\uA873\uA882-\uA8B3\uA8F2-\uA8F7\uA8FB\uA90A-\uA925\uA930-\uA946\uA960-\uA97C\uA984-\uA9B2\uA9CF\uAA00-\uAA28\uAA40-\uAA42\uAA44-\uAA4B\uAA60-\uAA76\uAA7A\uAA80-\uAAAF\uAAB1\uAAB5\uAAB6\uAAB9-\uAABD\uAAC0\uAAC2\uAADB-\uAADD\uAAE0-\uAAEA\uAAF2-\uAAF4\uAB01-\uAB06\uAB09-\uAB0E\uAB11-\uAB16\uAB20-\uAB26\uAB28-\uAB2E\uABC0-\uABE2\uAC00-\uD7A3\uD7B0-\uD7C6\uD7CB-\uD7FB\uF900-\uFA6D\uFA70-\uFAD9\uFB00-\uFB06\uFB13-\uFB17\uFB1D\uFB1F-\uFB28\uFB2A-\uFB36\uFB38-\uFB3C\uFB3E\uFB40\uFB41\uFB43\uFB44\uFB46-\uFBB1\uFBD3-\uFD3D\uFD50-\uFD8F\uFD92-\uFDC7\uFDF0-\uFDFB\uFE70-\uFE74\uFE76-\uFEFC\uFF21-\uFF3A\uFF41-\uFF5A\uFF66-\uFFBE\uFFC2-\uFFC7\uFFCA-\uFFCF\uFFD2-\uFFD7\uFFDA-\uFFDC',
astral: '\uD802[\uDC00-\uDC05\uDC08\uDC0A-\uDC35\uDC37\uDC38\uDC3C\uDC3F-\uDC55\uDD00-\uDD15\uDD20-\uDD39\uDD80-\uDDB7\uDDBE\uDDBF\uDE00\uDE10-\uDE13\uDE15-\uDE17\uDE19-\uDE33\uDE60-\uDE7C\uDF00-\uDF35\uDF40-\uDF55\uDF60-\uDF72]|\uD801[\uDC00-\uDC9D]|\uD800[\uDC00-\uDC0B\uDC0D-\uDC26\uDC28-\uDC3A\uDC3C\uDC3D\uDC3F-\uDC4D\uDC50-\uDC5D\uDC80-\uDCFA\uDE80-\uDE9C\uDEA0-\uDED0\uDF00-\uDF1E\uDF30-\uDF40\uDF42-\uDF49\uDF80-\uDF9D\uDFA0-\uDFC3\uDFC8-\uDFCF]|\uD81A[\uDC00-\uDE38]|\uD804[\uDC03-\uDC37\uDC83-\uDCAF\uDCD0-\uDCE8\uDD03-\uDD26\uDD83-\uDDB2\uDDC1-\uDDC4]|\uD86E[\uDC00-\uDC1D]|\uD86D[\uDC00-\uDF34\uDF40-\uDFFF]|\uD803[\uDC00-\uDC48]|\uD80D[\uDC00-\uDC2E]|\uD805[\uDE80-\uDEAA]|\uD87E[\uDC00-\uDE1D]|\uD81B[\uDF00-\uDF44\uDF50\uDF93-\uDF9F]|\uD869[\uDC00-\uDED6\uDF00-\uDFFF]|\uD82C[\uDC00\uDC01]|[\uD80C\uD840-\uD868\uD86A-\uD86C][\uDC00-\uDFFF]|\uD83B[\uDE00-\uDE03\uDE05-\uDE1F\uDE21\uDE22\uDE24\uDE27\uDE29-\uDE32\uDE34-\uDE37\uDE39\uDE3B\uDE42\uDE47\uDE49\uDE4B\uDE4D-\uDE4F\uDE51\uDE52\uDE54\uDE57\uDE59\uDE5B\uDE5D\uDE5F\uDE61\uDE62\uDE64\uDE67-\uDE6A\uDE6C-\uDE72\uDE74-\uDE77\uDE79-\uDE7C\uDE7E\uDE80-\uDE89\uDE8B-\uDE9B\uDEA1-\uDEA3\uDEA5-\uDEA9\uDEAB-\uDEBB]|\uD835[\uDC00-\uDC54\uDC56-\uDC9C\uDC9E\uDC9F\uDCA2\uDCA5\uDCA6\uDCA9-\uDCAC\uDCAE-\uDCB9\uDCBB\uDCBD-\uDCC3\uDCC5-\uDD05\uDD07-\uDD0A\uDD0D-\uDD14\uDD16-\uDD1C\uDD1E-\uDD39\uDD3B-\uDD3E\uDD40-\uDD44\uDD46\uDD4A-\uDD50\uDD52-\uDEA5\uDEA8-\uDEC0\uDEC2-\uDEDA\uDEDC-\uDEFA\uDEFC-\uDF14\uDF16-\uDF34\uDF36-\uDF4E\uDF50-\uDF6E\uDF70-\uDF88\uDF8A-\uDFA8\uDFAA-\uDFC2\uDFC4-\uDFCB]|\uD808[\uDC00-\uDF6E]'
}]);
}(XRegExp));