Skip to content

Commit

Permalink
Merge pull request #308 from stanford-oval/wip/fix-tokenizer
Browse files Browse the repository at this point in the history
Tokenizer bug fixes
  • Loading branch information
gcampax committed Sep 4, 2020
2 parents 0c43ea9 + 4424b7e commit ea4e2c7
Show file tree
Hide file tree
Showing 6 changed files with 662 additions and 570 deletions.
2 changes: 1 addition & 1 deletion lib/i18n/american-english.js
Original file line number Diff line number Diff line change
Expand Up @@ -316,7 +316,7 @@ class EnglishLanguagePack extends DefaultLanguagePack {
// filter out words that cannot be in the dataset,
// because they would be either tokenized/preprocessed out or
// they are unlikely to be used with voice
return /^([a-zA-Z0-9-][a-zA-Z0-9.-]*|'s|,|\?)$/.test(word);
return /^([a-zA-Z0-9-][a-zA-Z0-9.&'-]+|,|\?)$/.test(word);
}

isGoodSentence(sentence) {
Expand Down
6 changes: 4 additions & 2 deletions lib/i18n/tokenizer/base.js
Original file line number Diff line number Diff line change
Expand Up @@ -132,14 +132,16 @@ module.exports = class BaseTokenizer {
// XXX: we might want to extend this to all of Unicode Alphabetic characters, which includes all languages
// and then exclude ideographic characters separately

this._addDefinition('LETTER', /[a-z\u00C0-\u00D6\u00D8\u00F6\u00F8-\u01BA\u01BB\u01BC-\u01BF\u01C0-\u01C3\u01C4-\u0293\u0294\u0295\u02AF\u02EE\u0300-\u036f]/);
this._addDefinition('LETTER', /[_a-z\u00C0-\u00D6\u00D8\u00F6\u00F8-\u01BA\u01BB\u01BC-\u01BF\u01C0-\u01C3\u01C4-\u0293\u0294\u0295\u02AF\u02EE\u0300-\u036f]/);

// words
// note that we do not split hyphens ever
// hyphens are considered part of a word if at the beginning of a word or in-between two letters
// (that is, at the end of a word, or when followed by another hyphen, a hyphen becomes a token by itself)
// numbers are considered part of a word if preceded by a letter
this._addDefinition('WORD', /(-{LETTER})?(?:{LETTER}[0-9]*(-{LETTER})?)+/);
// if a word ends with a hyphen followed by a number, the hyphen is considered part of the word: e.g., top-50 -> top- 50
// if a word ends with a hyphen that is not followed by a number, the hyphen is not considered part of the word: e.g., Twitter- -> Twitter -
this._addDefinition('WORD', /(?:-?{LETTER}[0-9]*)+(?:-?(?=[0-9]))?/);
// identifiers (tokens with at least one ASCII letter, which may also contain -, _ or digits)
this._addDefinition('IDENT', /[a-z][a-z0-9_-]+|[0-9_-]+[a-z][a-z0-9_-]+/);
}
Expand Down
Loading

0 comments on commit ea4e2c7

Please sign in to comment.