Make JFlex-based tokenizers share more and be more consistent.

- Everything uses AbstractTokenizer.NEWLINE_TOKEN - French and Spanish add PTBLexer enum for dashes option/treatments, and delete ptb3Dashes options - ellipsis and dashes style "ptb3" renamed to "ascii" - extract out and unify more token regex specifications in LexCommon.tokens (e.g., PHONE, EMOJI) - add FILENAME rule to Spanish lexer
stanfordnlp · Jul 17, 2022 · 8b97d64 · 8b97d64
1 parent 0d9e9c8
commit 8b97d64
Show file tree

Hide file tree

Showing 12 changed files with 92,240 additions and 116,309 deletions.
diff --git a/src/edu/stanford/nlp/international/french/process/FrenchLexer.flex b/src/edu/stanford/nlp/international/french/process/FrenchLexer.flex
diff --git a/src/edu/stanford/nlp/international/french/process/FrenchLexer.java b/src/edu/stanford/nlp/international/french/process/FrenchLexer.java
diff --git a/src/edu/stanford/nlp/international/french/process/FrenchTokenizer.java b/src/edu/stanford/nlp/international/french/process/FrenchTokenizer.java
@@ -63,7 +63,7 @@ public class FrenchTokenizer<T extends HasWord> extends AbstractTokenizer<T>  {
   private List<CoreLabel> compoundBuffer;
 
   // Produces the tokenization for parsing used by Green, de Marneffe, and Manning (2011)
-  public static final String FTB_OPTIONS = "ellipses=ptb3,normalizeParentheses=true,ptb3Dashes=false," +
+  public static final String FTB_OPTIONS = "ellipses=ascii,normalizeParentheses=true,dashes=not_cp1252," +
     SPLIT_CONTRACTIONS_OPTION + "=true," + SPLIT_COMPOUNDS_OPTION + "=true";
 
   // Official pipeline default settings for French
@@ -381,7 +381,7 @@ public static void main(String[] args) {
       while (tokenizer.hasNext()) {
         ++nTokens;
         String word = tokenizer.next().word();
-        if (word.equals(FrenchLexer.NEWLINE_TOKEN)) {
+        if (word.equals(AbstractTokenizer.NEWLINE_TOKEN)) {
           ++nLines;
           printSpace = false;
           System.out.println();

diff --git a/src/edu/stanford/nlp/international/spanish/process/SpanishLexer.flex b/src/edu/stanford/nlp/international/spanish/process/SpanishLexer.flex
diff --git a/src/edu/stanford/nlp/international/spanish/process/SpanishLexer.java b/src/edu/stanford/nlp/international/spanish/process/SpanishLexer.java
diff --git a/src/edu/stanford/nlp/international/spanish/process/SpanishTokenizer.java b/src/edu/stanford/nlp/international/spanish/process/SpanishTokenizer.java
@@ -64,9 +64,9 @@ public class SpanishTokenizer<T extends HasWord> extends AbstractTokenizer<T>  {
   private SpanishVerbStripper verbStripper;
 
   // Produces the tokenization for parsing used by AnCora (fixed)
-  public static final String ANCORA_OPTIONS = "ellipses=ptb3,normalizeParentheses=true,splitAll=true";
+  public static final String ANCORA_OPTIONS = "ellipses=ascii,normalizeParentheses=true,splitAll=true";
 
-  public static final String DEFAULT_OPTIONS = "invertible,ellipses=ptb3,splitAll=false";
+  public static final String DEFAULT_OPTIONS = "invertible,ellipses=ascii,splitAll=false";
 
   /**
    * Constructor.

diff --git a/src/edu/stanford/nlp/process/LexCommon.tokens b/src/edu/stanford/nlp/process/LexCommon.tokens
@@ -1,3 +1,26 @@
+/* Defines common token types for our JFlex-based tokenizers */
+
+/* Todo: Really SGML shouldn't be here at all, it's kind of legacy. But we continue to tokenize
+   some simple standard forms of concrete SGML syntax, since it tends to give robustness.          */
+/* ---
+( +([A-Za-z][A-Za-z0-9:.-]*( *= *['\"][^\r\n'\"]*['\"])?|['\"][^\r\n'\"]*['\"]| *\/))*
+SGML = <([!?][A-Za-z-][^>\r\n]*|\/?[A-Za-z][A-Za-z0-9:.-]*([ ]+([A-Za-z][A-Za-z0-9:.-]*([ ]*=[ ]*['\"][^\r\n'\"]*['\"])?|['\"][^\r\n'\"]*['\"]|[ ]*\/))*[ ]*)>
+( +[A-Za-z][A-Za-z0-9:.-]*)*
+FOO = ([ ]+[A-Za-z][A-Za-z0-9:.-]*)*
+SGML = <([!?][A-Za-z-][^>\r\n]*|\/?[A-Za-z][A-Za-z0-9:.-]* *)>
+SGML = \<([!\?][A-Za-z\-][^>\r\n]*|\/?[A-Za-z][A-Za-z0-9:\.\-]*([ ]+([A-Za-z][A-Za-z0-9_:\.\-]*|[A-Za-z][A-Za-z0-9_:\.\-]*[ ]*=[ ]*['\"][^\r\n'\"]*['\"]|['\"][^\r\n'\"]*['\"]|[ ]*\/))*[ ]*)\>
+   --- */
+
+/* <STORYID cat=w pri=u> */
+/* SGML1 allows attribute value match over newline; SGML2 does not. */
+SGML1 = \<([!\?][A-Za-z\-][^>\r\n]*|[A-Za-z][A-Za-z0-9_:\.\-]*([ \r\n]+([A-Za-z][A-Za-z0-9_:\.\-]*|[A-Za-z][A-Za-z0-9_:\.\-]*[ \r\n]*=[ \r\n]*('[^']*'|\"[^\"]*\"|[A-Za-z_][A-Za-z0-9_:\.\-]*)))*[ \r\n]*\/?|\/[A-Za-z][A-Za-z0-9_:\.\-]*)[ \r\n]*\>
+SGML2 = \<([!\?][A-Za-z\-][^>\r\n]*|[A-Za-z][A-Za-z0-9_:\.\-]*([ ]+([A-Za-z][A-Za-z0-9_:\.\-]*|[A-Za-z][A-Za-z0-9_:\.\-]*[ ]*=[ ]*('[^'\r\n]*'|\"[^\"\r\n]*\"|[A-Za-z_][A-Za-z0-9_:\.\-]*)))*[ ]*\/?|\/[A-Za-z][A-Za-z0-9_:\.\-]*)[ ]*\>
+SPMDASH = &(MD|mdash|ndash);|[\u0096\u0097\u2013\u2014\u2015]
+SPAMP = &amp;
+SPPUNC = &(HT|TL|UR|LR|QC|QL|QR|odq|cdq|#[0-9]+);
+SPLET = &[aeiouAEIOU](acute|grave|uml);
+
+
 /* \u3000 is ideographic space; \u205F is medium math space */
 /* \u2063 is an invisible separator */
 SPACE = [ \t\u00A0\u2000-\u200A\u202F\u205F\u2063\u3000]
@@ -12,3 +35,33 @@ NOT_SPACENL_ONE_CHAR = [^ \t\u00A0\u2000-\u200A\u202F\u205F\u2063\u3000\r\n\u202
 
 FILENAME_EXT = 3gp|aac|aspx|avi|bat|bmp|bz2|c|class|cgi|cpp|csv|dll|doc|docx|exe|flv|gif|gz|h|hei[cf]|htm|html|jar|java|jpeg|jpg|m4a|m4v|mov|mp[34g]|mpeg|o|pdf|php|pl|png|ppt|pptx|ps|psd|py|rtf|sql|tar|tgz|tif|tiff|tmp|txt|wav|wm[va]|x|xls|xlsx|xml|zip
 FILENAME = [\p{Alpha}\p{Digit}]+([-~.!_/#][\p{Alpha}\p{Digit}]+)*\.{FILENAME_EXT}
+
+
+/* Slightly generous but generally reasonably good emoji parsing. These patterns handle correctly 100% of emoji through Unicode 14.0 (Sept 2021). */
+/* These are emoji that can be followed by a zwj (U+200D) and then gender or similar things (as well as skin color). Mainly humans but certain others like bears, hearts */
+EMOJI_GENDERED = [\u26F9\u2764\u{01F3C3}-\u{01F3C4}\u{01F3CA}-\u{01F3CC}\u{01F408}\u{01F415}\u{01F43B}\u{01F466}-\u{01F469}\u{01F46E}-\u{01F477}\u{01F481}-\u{01F482}\u{01F486}-\u{01F487}\u{01F575}\u{01F62E}\u{1F635}\u{01F636}\u{01F645}-\u{01F647}\u{01F64B}\u{01F64D}-\u{01F64E}\u{01F6A3}\u{01F6B4}-\u{01F6B6}\u{01F926}\u{01F934}-\u{01F93E}\u{01F9B8}-\u{01F9B9}\u{01F9CD}-\u{01F9DF}\u{01FAF1}-\u{01FAF2}]
+/* Emoji follow is variation selector (emoji/non-emoji rendering) or Fitzpatrick skin tone */
+EMOJI_FOLLOW = [\uFE0E\uFE0F\u{01F3FB}-\u{01F3FF}]
+/* Just things followed by the keycap surrounding char - note that if not separated by space beforehand, may be mistokenized */
+EMOJI_KEYCAPS = [\u0023\u002A\u0030-\u0039]\uFE0F?\u20E3
+/* Flags (changed to use \U to avoid bug in IntelliJ JFlex plugin).
+ * 1st disjunct: Two regional indicator characters as a flag
+ * 2nd disjunct: Emoji tag sequence (ETS) of tag digits and small letters, currently used only for GB region flags (Scotland, Wales, England)
+ * Certain additional flags (rainbow/gay, transgender, pirate) are zwj sequences and are matched by EMOJI_MISC, not by this pattern.
+ */
+EMOJI_FLAG = [\U01F1E6-\U01F1FF]{2,2}|\U01F3F4[\u{E0030}-\u{E0039}\u{E0061}-\u{E007A}]+\U0E007F
+/* Rainbow flag, transgender flag, etc. */
+EMOJI_MISC = [\u{01F3F3}\u{01F3F4}\u{01F441}][\uFE0E\uFE0F]?\u200D[\u2620\u26A7\u{01F308}\u{01F5E8}][\uFE0E\uFE0F]?|{EMOJI_KEYCAPS}
+/* Things that have an emoji presentation form. This is where the general single character emoji appear */
+EMOJI_PRESENTATION = [\u00A9\u00AE\u203C\u2049\u2122\u2139\u2194-\u2199\u21A9-\u21AA\u231A-\u231B\u2328\u23CF\u23E9-\u23F3\u23F8-\u23FA\u24C2\u25AA-\u25AB\u25B6\u25C0\u25FB-\u27BF\u2934-\u2935\u2B05-\u2B07\u2B1B-\u2B1C\u2B50\u2B55\u3030\u303D\u3297\u3299\u{01F000}-\u{01FAFF}]
+/* Emoji modifier is something that appears after a zero-width joiner (zwj) U+200D */
+EMOJI_MODIFIER = [\u2640\u2642\u2695-\u2696\u2708\u2744\u2764\u2B1B\u{01F32B}\u{01F33E}\u{01F373}\u{01F37C}\u{01F384}\u{01F393}\u{01F3A4}\u{01F3A8}\u{01F3EB}\u{01F3ED}\u{01F466}-\u{01F469}\u{01F468}-\u{01F469}\u{01F48B}\u{01F4A8}\u{01F4AB}\u{01F4BB}-\u{01F4BC}\u{01F525}\u{01F527}\u{01F52C}\u{01F5E8}\u{01F680}\u{01F692}\u{01F91D}\u{01F9AF}\u{01F9B0}-\u{01F9B3}\u{01F9BA}-\u{01F9BD}\u{01F9D1}\u{01FA79}\u{01FAF2}]
+/* flag | emoji optionally with follower | precomposed gendered/family consisting of human followed by one or more of zero width joiner then another human/profession | Misc */
+EMOJI = {EMOJI_FLAG}|{EMOJI_PRESENTATION}{EMOJI_FOLLOW}?|{EMOJI_GENDERED}{EMOJI_FOLLOW}?(\u200D{EMOJI_MODIFIER}{EMOJI_FOLLOW}?){1,3}|{EMOJI_MISC}
+
+/* Allow N'Ko numerals */
+DIGIT = [:digit:]|[\u07C0-\u07C9]
+
+/* phone numbers. keep multi dots pattern separate, so not confused with decimal numbers. And for new treebank tokenization 346-8792. 1st digit can't be 0 or 1 in NANP. */
+/* 2022: Also allow hyphen between area code and number; allow French number like 47-42-17-11 */
+PHONE = (\([0-9]{2,3}\)[- \u00A0\u2007]?|(\+\+?)?([0-9]{1,4}[- \u00A0\u2007\u2012])?[0-9]{2,4}[- \u00A0\u2007\u2012/])[0-9]{3,4}[- \u00A0\u2007\u2012]?[0-9]{3,5}|((\+\+?)?[0-9]{1,4}\.)?[0-9]{2,4}\.[0-9]{2,4}\.[0-9]{2,5}|((\+\+?)?[0-9]{1,4}-)?[0-9]{2,4}-[0-9]{2,4}-[0-9]{2,5}|[2-9][0-9]{2}[-\u2012][0-9]{4}
diff --git a/src/edu/stanford/nlp/process/LexerUtils.java b/src/edu/stanford/nlp/process/LexerUtils.java
@@ -32,9 +32,9 @@ private LexerUtils() {} // static methods
 
   public enum QuotesEnum { UNICODE, LATEX, ASCII, NOT_CP1252, ORIGINAL }
 
-  public enum EllipsesEnum { UNICODE, PTB3, NOT_CP1252, ORIGINAL }
+  public enum EllipsesEnum { UNICODE, ASCII, NOT_CP1252, ORIGINAL }
 
-  public enum DashesEnum { UNICODE, PTB3, NOT_CP1252, ORIGINAL }
+  public enum DashesEnum { UNICODE, ASCII, NOT_CP1252, ORIGINAL }
 
 
   /** Change precomposed fraction characters to spelled out letter forms.
@@ -241,7 +241,7 @@ public static String handleEllipsis(final String tok, EllipsesEnum ellipsesStyle
     switch (ellipsesStyle) {
       case UNICODE:
         return unicodeEllipsisStr;
-      case PTB3:
+      case ASCII:
         return ptb3EllipsisStr;
       case NOT_CP1252:
         if (tok.equals("\u0085")) {
@@ -257,7 +257,7 @@ public static String handleEllipsis(final String tok, EllipsesEnum ellipsesStyle
     }
   }
 
-  // Other things to consider handling: [_\u058A\u2010\u2011\u2012]
+
   public static String handleDashes(final String tok, DashesEnum dashesStyle) {
     switch (dashesStyle) {
       case UNICODE:
@@ -266,12 +266,16 @@ public static String handleDashes(final String tok, DashesEnum dashesStyle) {
         } else {
           return "—"; // em dash
         }
-      case PTB3:
-        if ("-".equals(tok)) {
-          return "-"; // keep an ASCII hyphen-minus as hyphen-minus
-        } else {
-          return "--"; // two hyphen-minus ascii dashes
+      case ASCII:
+        // Map similar things to one or two ASCII hyphen-minus characters
+        // hyphen-minus, underscore, Armenian hyphen, hyphen, non-breaking hyphen, figure dash
+        String mid = tok.replaceAll("[-_\u058A\u2010\u2011\u2012]","-");
+        // cp1252 en dash, cp1252 em dash, en dash, em dash, horizontal bar
+        mid = mid.replaceAll("[\u0096\u0097\u2013\u2014\u2015]", "--");
+        if ("---".equals(mid)) {
+          mid = "--";
         }
+        return mid;
       case NOT_CP1252:
         if (tok.equals("\u0096")) {
           return "–"; // en dash