Skip to content

Commit

Permalink
Start refactoring a couple of things which should be common to all language tokenizers, such as space characters and filenames
Browse files Browse the repository at this point in the history
  • Loading branch information
AngledLuffa committed Apr 14, 2022
1 parent 613887a commit 3c40ba3
Show file tree
Hide file tree
Showing 3 changed files with 20 additions and 8 deletions.
8 changes: 8 additions & 0 deletions src/edu/stanford/nlp/process/LexCommon.tokens
@@ -0,0 +1,8 @@
/* Common whitespace and filename macros shared by the language tokenizers
 * (pulled out of PTBLexer.flex so other lexers can %include them). */
/* \u3000 is ideographic space; \u205F is medium math space */
/* NOTE(review): the original class listed \u20F5 — an unassigned code point and
 * apparent transposition typo for the \u205F described in the comment above.
 * Corrected to \u205F (MEDIUM MATHEMATICAL SPACE) here. */
SPACE = [ \t\u00A0\u2000-\u200A\u202F\u205F\u3000]
SPACES = {SPACE}+
/* Line terminators: CR, LF, CRLF, LS (\u2028), PS (\u2029), VT, FF, NEL (\u0085) */
NEWLINE = \r|\r?\n|\u2028|\u2029|\u000B|\u000C|\u0085
SPACENL = ({SPACE}|{NEWLINE})

/* Extensions that mark a dotted token as a filename (kept as a single word). */
FILENAME_EXT = 3gp|avi|bat|bmp|bz2|c|class|cgi|cpp|dll|doc|docx|exe|flv|gif|gz|h|hei[cf]|htm|html|jar|java|jpeg|jpg|mov|mp[34g]|mpeg|o|pdf|php|pl|png|ppt|ps|py|sql|tar|txt|wav|x|xml|zip|wm[va]
FILENAME = [\p{Alpha}\p{Digit}]+([-~.!_/#][\p{Alpha}\p{Digit}]+)*\.{FILENAME_EXT}
12 changes: 5 additions & 7 deletions src/edu/stanford/nlp/process/PTBLexer.flex
Expand Up @@ -577,11 +577,9 @@ SPMDASH = &(MD|mdash|ndash);|[\u0096\u0097\u2013\u2014\u2015]
SPAMP = &
SPPUNC = &(HT|TL|UR|LR|QC|QL|QR|odq|cdq|#[0-9]+);
SPLET = &[aeiouAEIOU](acute|grave|uml);
/* \u3000 is ideographic space; \u205F is medium math space */
SPACE = [ \t\u00A0\u2000-\u200A\u202F\u20F5\u3000]
SPACES = {SPACE}+
NEWLINE = \r|\r?\n|\u2028|\u2029|\u000B|\u000C|\u0085
SPACENL = ({SPACE}|{NEWLINE})

%include LexCommon.tokens

SPACENLS = {SPACENL}+
/* These next ones are useful to get a fixed length trailing context. */
SPACENL_ONE_CHAR = [ \t\u00A0\u2000-\u200A\u202F\u3000\r\n\u2028\u2029\u000B\u000C\u0085]
Expand All @@ -608,8 +606,6 @@ DOLSIGN = ([A-Z]*\$|#)
DOLSIGN2 = [\u00A2-\u00A5\u0080\u20A0-\u20BF\u058F\u060B\u09F2\u09F3\u0AF1\u0BF9\u0E3F\u17DB\uFF04\uFFE0\uFFE1\uFFE5\uFFE6]
/* not used DOLLAR {DOLSIGN}[ \t]*{NUMBER} */
/* |\( ?{NUMBER} ?\)) # is for pound signs */
FILENAME_EXT = 3gp|avi|bat|bmp|bz2|c|class|cgi|cpp|dll|doc|docx|exe|flv|gif|gz|h|hei[cf]|htm|html|jar|java|jpeg|jpg|mov|mp[34g]|mpeg|o|pdf|php|pl|png|ppt|ps|py|sql|tar|txt|wav|x|xml|zip|wm[va]
FILENAME = [\p{Alpha}\p{Digit}]+([-~.!_/#][\p{Alpha}\p{Digit}]+)*\.{FILENAME_EXT}
/* Curse of intelligent tokenization, here we come. To model what LDC does, we separate out some \p{Digit}+\p{Alpha}+ tokens as 2 words */
/* Go with just the top 20 currencies. */
SEP_CURRENCY = (USD|EUR|JPY|GBP|AUD|CAD|CHF|CNY|SEK|NZD|MXN|SGD|HKD|NOK|KRW|TRY|RUB|INR|BRL|ZAR)
Expand Down Expand Up @@ -1118,6 +1114,8 @@ RM/{NUM} { String txt = yytext();
{ISO8601DATETIME} { return getNext(); }
//{ISO8601DATE} { return getNext(); }
{DEGREES} { return getNext(); }
/* Ideally would factor this out for use in other tokenizers,
* but the other tokenizers don't have TokenizerPerLine options */
<YyNotTokenizePerLine>{FILENAME}/({SPACENL}|[.?!,\"'<()]) { return getNext(); }
<YyTokenizePerLine>{FILENAME}/({SPACE}|[.?!,\"'<()]) { return getNext(); }
{WORD}\./{INSENTP} { String origTok = yytext();
Expand Down
8 changes: 7 additions & 1 deletion test/src/edu/stanford/nlp/process/PTBTokenizerTest.java
Expand Up @@ -96,6 +96,8 @@ public class PTBTokenizerTest {
"i got (89.2%) in my exams",
"Dial 908-333-4444 to unban mox opal",
"The jerk who banned mox opal has social security number 555-55-5555.",
"What do you suppose is in the file thicc_antennae.jpg?",
"What do you suppose is in the file thicc_antennae.asdf?",
};

private final String[][] ptbGold = {
Expand Down Expand Up @@ -188,6 +190,9 @@ public class PTBTokenizerTest {
{ "i", "got", "-LRB-", "89.2", "%", "-RRB-", "in", "my", "exams" },
{ "Dial", "908-333-4444", "to", "unban", "mox", "opal" },
{ "The", "jerk", "who", "banned", "mox", "opal", "has", "social", "security", "number", "555-55-5555", "." },
// test that filename extensions trigger something being a single word
{ "What", "do", "you", "suppose", "is", "in", "the", "file", "thicc_antennae.jpg", "?" },
{ "What", "do", "you", "suppose", "is", "in", "the", "file", "thicc_antennae", ".", "asdf", "?" },
};

private final String[][] ptbGoldSplitHyphenated = {
Expand Down Expand Up @@ -289,7 +294,8 @@ public class PTBTokenizerTest {
{ "i", "got", "(", "89.2", "%", ")", "in", "my", "exams" },
{ "Dial", "908-333-4444", "to", "unban", "mox", "opal" },
{ "The", "jerk", "who", "banned", "mox", "opal", "has", "social", "security", "number", "555-55-5555", "." },

{ "What", "do", "you", "suppose", "is", "in", "the", "file", "thicc_antennae.jpg", "?" },
{ "What", "do", "you", "suppose", "is", "in", "the", "file", "thicc_antennae", ".", "asdf", "?" },
};

@Test
Expand Down

0 comments on commit 3c40ba3

Please sign in to comment.