Merge pull request #308 from stanford-oval/wip/fix-tokenizer

Tokenizer bug fixes
stanford-oval · Sep 4, 2020 · ea4e2c7 · ea4e2c7
2 parents 0c43ea9 + 4424b7e
commit ea4e2c7
Show file tree

Hide file tree

Showing 6 changed files with 662 additions and 570 deletions.
diff --git a/lib/i18n/american-english.js b/lib/i18n/american-english.js
@@ -316,7 +316,7 @@ class EnglishLanguagePack extends DefaultLanguagePack {
         // filter out words that cannot be in the dataset,
         // because they would be either tokenized/preprocessed out or
         // they are unlikely to be used with voice
-        return /^([a-zA-Z0-9-][a-zA-Z0-9.-]*|'s|,|\?)$/.test(word);
+        return /^([a-zA-Z0-9-][a-zA-Z0-9.&'-]+|,|\?)$/.test(word);
     }
 
     isGoodSentence(sentence) {

diff --git a/lib/i18n/tokenizer/base.js b/lib/i18n/tokenizer/base.js
@@ -132,14 +132,16 @@ module.exports = class BaseTokenizer {
         // XXX: we might want to extend this to all of Unicode Alphabetic characters, which includes all languages
         // and then exclude ideographic characters separately
 
-        this._addDefinition('LETTER', /[a-z\u00C0-\u00D6\u00D8\u00F6\u00F8-\u01BA\u01BB\u01BC-\u01BF\u01C0-\u01C3\u01C4-\u0293\u0294\u0295\u02AF\u02EE\u0300-\u036f]/);
+        this._addDefinition('LETTER', /[_a-z\u00C0-\u00D6\u00D8\u00F6\u00F8-\u01BA\u01BB\u01BC-\u01BF\u01C0-\u01C3\u01C4-\u0293\u0294\u0295\u02AF\u02EE\u0300-\u036f]/);
 
         // words
         // note that we do not split hyphens ever
         // hyphens are considered part of a word if at the beginning of a word or in-between two letters
         // (that is, at the end of a word, or when followed by another hyphen, they would become a token by itself)
         // numbers are considered part of a word if preceded by a letter
-        this._addDefinition('WORD', /(-{LETTER})?(?:{LETTER}[0-9]*(-{LETTER})?)+/);
+        // if a word ends with a hyphen followed by a number, the hyphen is considered part of the word: e.g., top-50 -> top- 50
+        // if a word ends with a hyphen that is not followed by a number, the hyphen is not considered part of the word: e.g., Twitter- -> Twitter -
+        this._addDefinition('WORD', /(?:-?{LETTER}[0-9]*)+(?:-?(?=[0-9]))?/);
         // identifiers (tokens with at least one ASCII letter, but also - _ or a number)
         this._addDefinition('IDENT', /[a-z][a-z0-9_-]+|[0-9_-]+[a-z][a-z0-9_-]+/);
     }