Skip to content

Commit

Permalink
Support Unicode (#151)
Browse files Browse the repository at this point in the history
* Support Unicode international characters
  - Punycode domains
  - utilize .codePointAt instead of .charCodeAt
  - tighten restricted codes to control characters instead of non-latin ranges
* Expand test coverage of Unicode cases
* Add normalization to email addresses
  - adds UTF-16 surrogate pair characters to tests.json
  - adds exported `.normalize` method and associated test
  - adds test in `tmp.js` for punycoded normalized characters
* Monkey-patch V8 NUL normalize bug for Node 4.x
* Add unicode length tests
* Handle for surrogate pairs in token iteration
* Check Buffer.byteLength instead of string.length
  • Loading branch information
WesTyler authored and skeggse committed Feb 15, 2017
1 parent 29e61e1 commit 0f795c5
Show file tree
Hide file tree
Showing 5 changed files with 223 additions and 47 deletions.
147 changes: 103 additions & 44 deletions lib/index.js
Expand Up @@ -3,7 +3,7 @@
// Load modules

const Dns = require('dns');

const Punycode = require('punycode');

// Declare internals

Expand Down Expand Up @@ -127,12 +127,27 @@ internals.specials = function () {

const specials = '()<>[]:;@\\,."'; // US-ASCII visible characters not valid for atext (http://tools.ietf.org/html/rfc5322#section-3.2.3)
const lookup = new Array(0x100);
for (let i = 0xff; i >= 0; --i) {
lookup[i] = false;
}
lookup.fill(false);

for (let i = 0; i < specials.length; ++i) {
lookup[specials.charCodeAt(i)] = true;
lookup[specials.codePointAt(i)] = true;
}

return function (code) {

return lookup[code];
};
}();

internals.c0Controls = function () {

const lookup = new Array(0x100);
lookup.fill(false);

// add C0 control characters

for (let i = 0; i < 33; ++i) {
lookup[i] = true;
}

return function (code) {
Expand All @@ -141,12 +156,41 @@ internals.specials = function () {
};
}();

internals.c1Controls = function () {

    // Membership table for code points 0x00-0xFF. Entries 127 through 159 are
    // flagged: DEL (127) followed by the C1 control range (128-159). Every
    // other entry is false; codes beyond the table yield undefined (falsy).

    const flagged = Array.from({ length: 0x100 }, (unused, index) => index >= 127 && index < 160);

    return function (code) {

        return flagged[code];
    };
}();

// Precompiled patterns shared by the validator
internals.regex = {
// ipV4: a dotted quad (each octet 0-255, leading zeros tolerated) that begins on a word
// boundary and is anchored to the END of the input only, so it locates an IPv4 address
// at the tail of a longer string
ipV4: /\b(?:(?:25[0-5]|2[0-4]\d|[01]?\d\d?)\.){3}(?:25[0-5]|2[0-4]\d|[01]?\d\d?)$/,
// ipV6: an entire string made of zero to four hexadecimal digits
// NOTE(review): presumably applied to each colon-separated group of an IPv6 literal - confirm against internals.checkIpV6
ipV6: /^[a-fA-F\d]{0,4}$/
};

// $lab:coverage:off$
internals.nulNormalize = function (email) {

    // NFC-normalize every NUL-delimited segment independently so that
    // String.prototype.normalize never receives a string containing U+0000,
    // then restore the NUL separators between the normalized segments

    return email
        .split('\u0000')
        .map((segment) => segment.normalize('NFC'))
        .join('\u0000');
};
// $lab:coverage:on$


internals.checkIpV6 = function (items) {

Expand All @@ -173,7 +217,7 @@ internals.validDomain = function (tldAtom, options) {


/**
* Check that an email address conforms to RFCs 5321, 5322 and others
* Check that an email address conforms to RFCs 5321, 5322, 6530 and others
*
* We distinguish clearly between a Mailbox as defined by RFC 5321 and an
* addr-spec as defined by RFC 5322. Depending on the context, either can be
Expand All @@ -197,6 +241,7 @@ internals.validDomain = function (tldAtom, options) {
exports.validate = internals.validate = function (email, options, callback) {

options = options || {};
email = internals.normalize(email);

if (typeof options === 'function') {
callback = options;
Expand Down Expand Up @@ -281,8 +326,9 @@ exports.validate = internals.validate = function (email, options, callback) {
const emailLength = email.length;

let token; // Token is used outside the loop, must declare similarly
for (let i = 0; i < emailLength; ++i) {
token = email[i];
for (let i = 0; i < emailLength; i += token.length) {
// Utilize codepoints to account for Unicode surrogate pairs
token = String.fromCodePoint(email.codePointAt(i));

switch (context.now) {
// Local-part
Expand Down Expand Up @@ -350,7 +396,7 @@ exports.validate = internals.validate = function (email, options, callback) {

parseData.local += token;
atomData.locals[elementCount] += token;
++elementLength;
elementLength += Buffer.byteLength(token, 'utf8');

// Quoted string must be the entire element
assertEnd = true;
Expand Down Expand Up @@ -406,7 +452,7 @@ exports.validate = internals.validate = function (email, options, callback) {
}
// http://tools.ietf.org/html/rfc5321#section-4.5.3.1.1 the maximum total length of a user name or other local-part is 64
// octets
else if (parseData.local.length > 64) {
else if (Buffer.byteLength(parseData.local, 'utf8') > 64) {
updateResult(internals.diagnoses.rfc5322LocalTooLong);
}
// http://tools.ietf.org/html/rfc5322#section-3.4.1 comments and folding white space SHOULD NOT be used around "@" in the
Expand Down Expand Up @@ -462,18 +508,18 @@ exports.validate = internals.validate = function (email, options, callback) {
}
else {
context.prev = context.now;
charCode = token.charCodeAt(0);
charCode = token.codePointAt(0);

// Especially if charCode == 10
if (charCode < 33 || charCode > 126 || internals.specials(charCode)) {
if (internals.specials(charCode) || internals.c0Controls(charCode) || internals.c1Controls(charCode)) {

// Fatal error
updateResult(internals.diagnoses.errExpectingATEXT);
}

parseData.local += token;
atomData.locals[elementCount] += token;
++elementLength;
elementLength += Buffer.byteLength(token, 'utf8');
}
}

Expand Down Expand Up @@ -576,7 +622,7 @@ exports.validate = internals.validate = function (email, options, callback) {
if (parseData.domain.length === 0) {
// Domain literal must be the only component
assertEnd = true;
++elementLength;
elementLength += Buffer.byteLength(token, 'utf8');
context.stack.push(context.now);
context.now = internals.components.literal;
parseData.domain += token;
Expand Down Expand Up @@ -660,11 +706,11 @@ exports.validate = internals.validate = function (email, options, callback) {
}
}

charCode = token.charCodeAt(0);
charCode = token.codePointAt(0);
// Assume this token isn't a hyphen unless we discover it is
hyphenFlag = false;

if (charCode < 33 || charCode > 126 || internals.specials(charCode)) {
if (internals.specials(charCode) || internals.c0Controls(charCode) || internals.c1Controls(charCode)) {
// Fatal error
updateResult(internals.diagnoses.errExpectingATEXT);
}
Expand All @@ -676,15 +722,15 @@ exports.validate = internals.validate = function (email, options, callback) {

hyphenFlag = true;
}
// Check if it's a neither a number nor a latin letter
else if (charCode < 48 || charCode > 122 || (charCode > 57 && charCode < 65) || (charCode > 90 && charCode < 97)) {
// Check if it's neither a number nor a latin/unicode letter
else if (charCode < 48 || (charCode > 122 && charCode < 192) || (charCode > 57 && charCode < 65) || (charCode > 90 && charCode < 97)) {
// This is not an RFC 5321 subdomain, but still OK by RFC 5322
updateResult(internals.diagnoses.rfc5322Domain);
}

parseData.domain += token;
atomData.domains[elementCount] += token;
++elementLength;
elementLength += Buffer.byteLength(token, 'utf8');
}

break;
Expand Down Expand Up @@ -821,7 +867,7 @@ exports.validate = internals.validate = function (email, options, callback) {

parseData.domain += token;
atomData.domains[elementCount] += token;
++elementLength;
elementLength += Buffer.byteLength(token, 'utf8');
context.prev = context.now;
context.now = context.stack.pop();
break;
Expand Down Expand Up @@ -864,22 +910,22 @@ exports.validate = internals.validate = function (email, options, callback) {
// %d12 / ; include the carriage
// %d14-31 / ; return, line feed, and
// %d127 ; white space characters
charCode = token.charCodeAt(0);
charCode = token.codePointAt(0);

// '\r', '\n', ' ', and '\t' have already been parsed above
if (charCode > 127 || charCode === 0 || token === '[') {
if ((charCode !== 127 && internals.c1Controls(charCode)) || charCode === 0 || token === '[') {
// Fatal error
updateResult(internals.diagnoses.errExpectingDTEXT);
break;
}
else if (charCode < 33 || charCode === 127) {
else if (internals.c0Controls(charCode) || charCode === 127) {
updateResult(internals.diagnoses.rfc5322DomainLiteralOBSDText);
}

parseData.literal += token;
parseData.domain += token;
atomData.domains[elementCount] += token;
++elementLength;
elementLength += Buffer.byteLength(token, 'utf8');
}

break;
Expand Down Expand Up @@ -922,7 +968,7 @@ exports.validate = internals.validate = function (email, options, callback) {

parseData.local += ' ';
atomData.locals[elementCount] += ' ';
++elementLength;
elementLength += Buffer.byteLength(token, 'utf8');

updateResult(internals.diagnoses.cfwsFWS);
context.stack.push(context.now);
Expand All @@ -934,7 +980,7 @@ exports.validate = internals.validate = function (email, options, callback) {
case '"':
parseData.local += token;
atomData.locals[elementCount] += token;
++elementLength;
elementLength += Buffer.byteLength(token, 'utf8');
context.prev = context.now;
context.now = context.stack.pop();
break;
Expand All @@ -954,18 +1000,18 @@ exports.validate = internals.validate = function (email, options, callback) {
// %d12 / ; include the carriage
// %d14-31 / ; return, line feed, and
// %d127 ; white space characters
charCode = token.charCodeAt(0);
charCode = token.codePointAt(0);

if (charCode > 127 || charCode === 0 || charCode === 10) {
if ((charCode !== 127 && internals.c1Controls(charCode)) || charCode === 0 || charCode === 10) {
updateResult(internals.diagnoses.errExpectingQTEXT);
}
else if (charCode < 32 || charCode === 127) {
else if (internals.c0Controls(charCode) || charCode === 127) {
updateResult(internals.diagnoses.deprecatedQTEXT);
}

parseData.local += token;
atomData.locals[elementCount] += token;
++elementLength;
elementLength += Buffer.byteLength(token, 'utf8');
}

// http://tools.ietf.org/html/rfc5322#section-3.4.1
Expand All @@ -992,9 +1038,9 @@ exports.validate = internals.validate = function (email, options, callback) {
// %d127 ; white space characters
//
// i.e. obs-qp = "\" (%d0-8, %d10-31 / %d127)
charCode = token.charCodeAt(0);
charCode = token.codePointAt(0);

if (charCode > 127) {
if (charCode !== 127 && internals.c1Controls(charCode)) {
// Fatal error
updateResult(internals.diagnoses.errExpectingQPair);
}
Expand All @@ -1010,23 +1056,23 @@ exports.validate = internals.validate = function (email, options, callback) {
context.prev = context.now;
// End of qpair
context.now = context.stack.pop();
token = '\\' + token;
const escapeToken = '\\' + token;

switch (context.now) {
case internals.components.contextComment:
break;

case internals.components.contextQuotedString:
parseData.local += token;
atomData.locals[elementCount] += token;
parseData.local += escapeToken;
atomData.locals[elementCount] += escapeToken;

// The maximum sizes specified by RFC 5321 are octet counts, so we must include the backslash
elementLength += 2;
break;

case internals.components.literal:
parseData.domain += token;
atomData.domains[elementCount] += token;
parseData.domain += escapeToken;
atomData.domains[elementCount] += escapeToken;

// The maximum sizes specified by RFC 5321 are octet counts, so we must include the backslash
elementLength += 2;
Expand Down Expand Up @@ -1099,14 +1145,14 @@ exports.validate = internals.validate = function (email, options, callback) {
// %d12 / ; include the carriage
// %d14-31 / ; return, line feed, and
// %d127 ; white space characters
charCode = token.charCodeAt(0);
charCode = token.codePointAt(0);

if (charCode > 127 || charCode === 0 || charCode === 10) {
if (charCode === 0 || charCode === 10 || (charCode !== 127 && internals.c1Controls(charCode))) {
// Fatal error
updateResult(internals.diagnoses.errExpectingCTEXT);
break;
}
else if (charCode < 32 || charCode === 127) {
else if (internals.c0Controls(charCode) || charCode === 127) {
updateResult(internals.diagnoses.deprecatedCTEXT);
}
}
Expand Down Expand Up @@ -1219,12 +1265,12 @@ exports.validate = internals.validate = function (email, options, callback) {
}

// Other errors
else if (parseData.domain.length > 255) {
else if (Buffer.byteLength(parseData.domain, 'utf8') > 255) {
// http://tools.ietf.org/html/rfc5321#section-4.5.3.1.2
// The maximum total length of a domain name or number is 255 octets.
updateResult(internals.diagnoses.rfc5322DomainTooLong);
}
else if (parseData.local.length + parseData.domain.length + /* '@' */ 1 > 254) {
else if (Buffer.byteLength(parseData.local, 'utf8') + Buffer.byteLength(parseData.domain, 'utf8') + /* '@' */ 1 > 254) {
// http://tools.ietf.org/html/rfc5321#section-4.1.2
// Forward-path = Path
//
Expand Down Expand Up @@ -1266,7 +1312,7 @@ exports.validate = internals.validate = function (email, options, callback) {

if (!dnsPositive && maxResult < internals.categories.dnsWarn) {
// Per RFC 5321, domain atoms are limited to letter-digit-hyphen, so we only need to check code <= 57 to check for a digit
const code = atomData.domains[elementCount].charCodeAt(0);
const code = atomData.domains[elementCount].codePointAt(0);
if (code <= 57) {
updateResult(internals.diagnoses.rfc5321TLDNumeric);
}
Expand Down Expand Up @@ -1311,7 +1357,7 @@ exports.validate = internals.validate = function (email, options, callback) {
parseData.domain += '.';
}

const dnsDomain = parseData.domain;
const dnsDomain = Punycode.toASCII(parseData.domain);
Dns.resolveMx(dnsDomain, (err, mxRecords) => {

// If we have a fatal error, then we must assume that there are no records
Expand Down Expand Up @@ -1376,3 +1422,16 @@ exports.diagnoses = internals.validate.diagnoses = (function () {

return diag;
})();


exports.normalize = internals.normalize = function (email) {

// $lab:coverage:off$
if (process.version[1] === '4' && email.indexOf('\u0000') >= 0) {
return internals.nulNormalize(email);
}
// $lab:coverage:on$


return email.normalize('NFC');
};
4 changes: 3 additions & 1 deletion package.json
Expand Up @@ -18,10 +18,12 @@
"node": ">=4.0.0"
},
"dependencies": {
"punycode": "2.1.x"
},
"devDependencies": {
"code": "3.x.x",
"lab": "10.x.x"
"lab": "10.x.x",
"proxyquire": "1.x.x"
},
"scripts": {
"test": "lab -a code -t 100 -L -m 5000",
Expand Down

0 comments on commit 0f795c5

Please sign in to comment.