Skip to content

Commit

Permalink
Make JFlex-based tokenizers share more and be more consistent.
Browse files Browse the repository at this point in the history
 - Everything uses AbstractTokenizer.NEWLINE_TOKEN
 - French and Spanish add PTBLexer enum for dashes option/treatments, and delete ptb3Dashes options
 - ellipsis and dashes style "ptb3" renamed to "ascii"
 - extract out and unify more token regex specifications in LexCommon.tokens (e.g., PHONE, EMOJI)
 - add FILENAME rule to Spanish lexer
  • Loading branch information
manning committed Jul 17, 2022
1 parent 0d9e9c8 commit 8b97d64
Show file tree
Hide file tree
Showing 12 changed files with 92,240 additions and 116,309 deletions.
568 changes: 275 additions & 293 deletions src/edu/stanford/nlp/international/french/process/FrenchLexer.flex

Large diffs are not rendered by default.

22,780 changes: 12,250 additions & 10,530 deletions src/edu/stanford/nlp/international/french/process/FrenchLexer.java

Large diffs are not rendered by default.

Expand Up @@ -63,7 +63,7 @@ public class FrenchTokenizer<T extends HasWord> extends AbstractTokenizer<T> {
private List<CoreLabel> compoundBuffer;

// Produces the tokenization for parsing used by Green, de Marneffe, and Manning (2011)
public static final String FTB_OPTIONS = "ellipses=ptb3,normalizeParentheses=true,ptb3Dashes=false," +
public static final String FTB_OPTIONS = "ellipses=ascii,normalizeParentheses=true,dashes=not_cp1252," +
SPLIT_CONTRACTIONS_OPTION + "=true," + SPLIT_COMPOUNDS_OPTION + "=true";

// Official pipeline default settings for French
Expand Down Expand Up @@ -381,7 +381,7 @@ public static void main(String[] args) {
while (tokenizer.hasNext()) {
++nTokens;
String word = tokenizer.next().word();
if (word.equals(FrenchLexer.NEWLINE_TOKEN)) {
if (word.equals(AbstractTokenizer.NEWLINE_TOKEN)) {
++nLines;
printSpace = false;
System.out.println();
Expand Down
564 changes: 267 additions & 297 deletions src/edu/stanford/nlp/international/spanish/process/SpanishLexer.flex

Large diffs are not rendered by default.

21,303 changes: 13,829 additions & 7,474 deletions src/edu/stanford/nlp/international/spanish/process/SpanishLexer.java

Large diffs are not rendered by default.

Expand Up @@ -64,9 +64,9 @@ public class SpanishTokenizer<T extends HasWord> extends AbstractTokenizer<T> {
private SpanishVerbStripper verbStripper;

// Produces the tokenization for parsing used by AnCora (fixed)
public static final String ANCORA_OPTIONS = "ellipses=ptb3,normalizeParentheses=true,splitAll=true";
public static final String ANCORA_OPTIONS = "ellipses=ascii,normalizeParentheses=true,splitAll=true";

public static final String DEFAULT_OPTIONS = "invertible,ellipses=ptb3,splitAll=false";
public static final String DEFAULT_OPTIONS = "invertible,ellipses=ascii,splitAll=false";

/**
* Constructor.
Expand Down
53 changes: 53 additions & 0 deletions src/edu/stanford/nlp/process/LexCommon.tokens
@@ -1,3 +1,26 @@
/* Defines common token types for our JFlex-based tokenizers */

/* Todo: Really SGML shouldn't be here at all, it's kind of legacy. But we continue to tokenize
some simple standard forms of concrete SGML syntax, since it tends to give robustness. */
/* ---
( +([A-Za-z][A-Za-z0-9:.-]*( *= *['\"][^\r\n'\"]*['\"])?|['\"][^\r\n'\"]*['\"]| *\/))*
SGML = <([!?][A-Za-z-][^>\r\n]*|\/?[A-Za-z][A-Za-z0-9:.-]*([ ]+([A-Za-z][A-Za-z0-9:.-]*([ ]*=[ ]*['\"][^\r\n'\"]*['\"])?|['\"][^\r\n'\"]*['\"]|[ ]*\/))*[ ]*)>
( +[A-Za-z][A-Za-z0-9:.-]*)*
FOO = ([ ]+[A-Za-z][A-Za-z0-9:.-]*)*
SGML = <([!?][A-Za-z-][^>\r\n]*|\/?[A-Za-z][A-Za-z0-9:.-]* *)>
SGML = \<([!\?][A-Za-z\-][^>\r\n]*|\/?[A-Za-z][A-Za-z0-9:\.\-]*([ ]+([A-Za-z][A-Za-z0-9_:\.\-]*|[A-Za-z][A-Za-z0-9_:\.\-]*[ ]*=[ ]*['\"][^\r\n'\"]*['\"]|['\"][^\r\n'\"]*['\"]|[ ]*\/))*[ ]*)\>
--- */

/* <STORYID cat=w pri=u> */
/* SGML1 allows attribute value match over newline; SGML2 does not. */
SGML1 = \<([!\?][A-Za-z\-][^>\r\n]*|[A-Za-z][A-Za-z0-9_:\.\-]*([ \r\n]+([A-Za-z][A-Za-z0-9_:\.\-]*|[A-Za-z][A-Za-z0-9_:\.\-]*[ \r\n]*=[ \r\n]*('[^']*'|\"[^\"]*\"|[A-Za-z_][A-Za-z0-9_:\.\-]*)))*[ \r\n]*\/?|\/[A-Za-z][A-Za-z0-9_:\.\-]*)[ \r\n]*\>
SGML2 = \<([!\?][A-Za-z\-][^>\r\n]*|[A-Za-z][A-Za-z0-9_:\.\-]*([ ]+([A-Za-z][A-Za-z0-9_:\.\-]*|[A-Za-z][A-Za-z0-9_:\.\-]*[ ]*=[ ]*('[^'\r\n]*'|\"[^\"\r\n]*\"|[A-Za-z_][A-Za-z0-9_:\.\-]*)))*[ ]*\/?|\/[A-Za-z][A-Za-z0-9_:\.\-]*)[ ]*\>
SPMDASH = &(MD|mdash|ndash);|[\u0096\u0097\u2013\u2014\u2015]
SPAMP = &amp;
SPPUNC = &(HT|TL|UR|LR|QC|QL|QR|odq|cdq|#[0-9]+);
SPLET = &[aeiouAEIOU](acute|grave|uml);


/* \u3000 is ideographic space; \u205F is medium math space */
/* \u2063 is an invisible separator */
SPACE = [ \t\u00A0\u2000-\u200A\u202F\u205F\u2063\u3000]
Expand All @@ -12,3 +35,33 @@ NOT_SPACENL_ONE_CHAR = [^ \t\u00A0\u2000-\u200A\u202F\u205F\u2063\u3000\r\n\u202

FILENAME_EXT = 3gp|aac|aspx|avi|bat|bmp|bz2|c|class|cgi|cpp|csv|dll|doc|docx|exe|flv|gif|gz|h|hei[cf]|htm|html|jar|java|jpeg|jpg|m4a|m4v|mov|mp[34g]|mpeg|o|pdf|php|pl|png|ppt|pptx|ps|psd|py|rtf|sql|tar|tgz|tif|tiff|tmp|txt|wav|wm[va]|x|xls|xlsx|xml|zip
FILENAME = [\p{Alpha}\p{Digit}]+([-~.!_/#][\p{Alpha}\p{Digit}]+)*\.{FILENAME_EXT}


/* Slightly generous but generally reasonably good emoji parsing. These patterns handle correctly 100% of emoji through Unicode 14.0 (Sept 2021). */
/* These are emoji that can be followed by a zwj (U+200D) and then gender or similar things (as well as skin color). Mainly humans but certain others like bears, hearts */
EMOJI_GENDERED = [\u26F9\u2764\u{01F3C3}-\u{01F3C4}\u{01F3CA}-\u{01F3CC}\u{01F408}\u{01F415}\u{01F43B}\u{01F466}-\u{01F469}\u{01F46E}-\u{01F477}\u{01F481}-\u{01F482}\u{01F486}-\u{01F487}\u{01F575}\u{01F62E}\u{1F635}\u{01F636}\u{01F645}-\u{01F647}\u{01F64B}\u{01F64D}-\u{01F64E}\u{01F6A3}\u{01F6B4}-\u{01F6B6}\u{01F926}\u{01F934}-\u{01F93E}\u{01F9B8}-\u{01F9B9}\u{01F9CD}-\u{01F9DF}\u{01FAF1}-\u{01FAF2}]
/* Emoji follow is variation selector (emoji/non-emoji rendering) or Fitzpatrick skin tone */
EMOJI_FOLLOW = [\uFE0E\uFE0F\u{01F3FB}-\u{01F3FF}]
/* Just things followed by the keycap surrounding char - note that if not separated by space beforehand, may be mistokenized */
EMOJI_KEYCAPS = [\u0023\u002A\u0030-\u0039]\uFE0F?\u20E3
/* Flags (changed to use \U to avoid bug in IntelliJ JFlex plugin).
* 1st disjunct: Two geographic characters as a flag
* 2nd disjunct: Tag digits and small letters, currently used only for GB regions flags (Scotland, Wales, England)
* Certain additional flags (gay/rainbow, transgender, pirate) are zero-width-joiner sequences; they are matched by EMOJI_MISC, not by this pattern.
*/
EMOJI_FLAG = [\U01F1E6-\U01F1FF]{2,2}|\U01F3F4[\u{E0030}-\u{E0039}\u{E0061}-\u{E007A}]+\U0E007F
/* Rainbow flag, transgender flag, etc. */
EMOJI_MISC = [\u{01F3F3}\u{01F3F4}\u{01F441}][\uFE0E\uFE0F]?\u200D[\u2620\u26A7\u{01F308}\u{01F5E8}][\uFE0E\uFE0F]?|{EMOJI_KEYCAPS}
/* Things that have an emoji presentation form. This is where the general single character emoji appear */
EMOJI_PRESENTATION = [\u00A9\u00AE\u203C\u2049\u2122\u2139\u2194-\u2199\u21A9-\u21AA\u231A-\u231B\u2328\u23CF\u23E9-\u23F3\u23F8-\u23FA\u24C2\u25AA-\u25AB\u25B6\u25C0\u25FB-\u27BF\u2934-\u2935\u2B05-\u2B07\u2B1B-\u2B1C\u2B50\u2B55\u3030\u303D\u3297\u3299\u{01F000}-\u{01FAFF}]
/* Emoji modifier is something that appears after a zero-width joiner (zwj) U+200D */
EMOJI_MODIFIER = [\u2640\u2642\u2695-\u2696\u2708\u2744\u2764\u2B1B\u{01F32B}\u{01F33E}\u{01F373}\u{01F37C}\u{01F384}\u{01F393}\u{01F3A4}\u{01F3A8}\u{01F3EB}\u{01F3ED}\u{01F466}-\u{01F469}\u{01F468}-\u{01F469}\u{01F48B}\u{01F4A8}\u{01F4AB}\u{01F4BB}-\u{01F4BC}\u{01F525}\u{01F527}\u{01F52C}\u{01F5E8}\u{01F680}\u{01F692}\u{01F91D}\u{01F9AF}\u{01F9B0}-\u{01F9B3}\u{01F9BA}-\u{01F9BD}\u{01F9D1}\u{01FA79}\u{01FAF2}]
/* flag | emoji optionally with follower | precomposed gendered/family consisting of human followed by one or more of zero width joiner then another human/profession | Misc */
EMOJI = {EMOJI_FLAG}|{EMOJI_PRESENTATION}{EMOJI_FOLLOW}?|{EMOJI_GENDERED}{EMOJI_FOLLOW}?(\u200D{EMOJI_MODIFIER}{EMOJI_FOLLOW}?){1,3}|{EMOJI_MISC}

/* Allow N'Ko numerals */
DIGIT = [:digit:]|[\u07C0-\u07C9]

/* Phone numbers. Keep the multi-dot pattern separate, so that it is not confused with decimal numbers. For the new treebank tokenization, also match bare local numbers such as 346-8792; the 1st digit can't be 0 or 1 in NANP. */
/* 2022: Also allow hyphen between area code and number; allow French number like 47-42-17-11 */
PHONE = (\([0-9]{2,3}\)[- \u00A0\u2007]?|(\+\+?)?([0-9]{1,4}[- \u00A0\u2007\u2012])?[0-9]{2,4}[- \u00A0\u2007\u2012/])[0-9]{3,4}[- \u00A0\u2007\u2012]?[0-9]{3,5}|((\+\+?)?[0-9]{1,4}\.)?[0-9]{2,4}\.[0-9]{2,4}\.[0-9]{2,5}|((\+\+?)?[0-9]{1,4}-)?[0-9]{2,4}-[0-9]{2,4}-[0-9]{2,5}|[2-9][0-9]{2}[-\u2012][0-9]{4}
22 changes: 13 additions & 9 deletions src/edu/stanford/nlp/process/LexerUtils.java
Expand Up @@ -32,9 +32,9 @@ private LexerUtils() {} // static methods

public enum QuotesEnum { UNICODE, LATEX, ASCII, NOT_CP1252, ORIGINAL }

public enum EllipsesEnum { UNICODE, PTB3, NOT_CP1252, ORIGINAL }
public enum EllipsesEnum { UNICODE, ASCII, NOT_CP1252, ORIGINAL }

public enum DashesEnum { UNICODE, PTB3, NOT_CP1252, ORIGINAL }
public enum DashesEnum { UNICODE, ASCII, NOT_CP1252, ORIGINAL }


/** Change precomposed fraction characters to spelled out letter forms.
Expand Down Expand Up @@ -241,7 +241,7 @@ public static String handleEllipsis(final String tok, EllipsesEnum ellipsesStyle
switch (ellipsesStyle) {
case UNICODE:
return unicodeEllipsisStr;
case PTB3:
case ASCII:
return ptb3EllipsisStr;
case NOT_CP1252:
if (tok.equals("\u0085")) {
Expand All @@ -257,7 +257,7 @@ public static String handleEllipsis(final String tok, EllipsesEnum ellipsesStyle
}
}

// Other things to consider handling: [_\u058A\u2010\u2011\u2012]

public static String handleDashes(final String tok, DashesEnum dashesStyle) {
switch (dashesStyle) {
case UNICODE:
Expand All @@ -266,12 +266,16 @@ public static String handleDashes(final String tok, DashesEnum dashesStyle) {
} else {
return "—"; // em dash
}
case PTB3:
if ("-".equals(tok)) {
return "-"; // keep an ASCII hyphen-minus as hyphen-minus
} else {
return "--"; // two hyphen-minus ascii dashes
case ASCII:
// Map similar things to one or two ASCII hyphen-minus characters
// hyphen-minus, underscore, Armenian hyphen, hyphen, non-break hyphen, figure dash
String mid = tok.replaceAll("[-_\u058A\u2010\u2011\u2012]","-");
// cp1252 en dash, cp1252 em dash, en dash, em dash, horizontal bar
mid = mid.replaceAll("[\u0096\u0097\u2013\u2014\u2015]", "--");
if ("---".equals(mid)) {
mid = "--";
}
return mid;
case NOT_CP1252:
if (tok.equals("\u0096")) {
return "–"; // en dash
Expand Down

0 comments on commit 8b97d64

Please sign in to comment.