Skip to content

Commit

Permalink
Add a fake XSL node when converting constituency trees to dependencie…
Browse files Browse the repository at this point in the history
…s for SD. This makes it easy to treat 'up to' as an MWE. #1363
  • Loading branch information
AngledLuffa committed Jul 8, 2023
1 parent 8c46648 commit 9a86ece
Show file tree
Hide file tree
Showing 6 changed files with 27 additions and 13 deletions.
2 changes: 2 additions & 0 deletions src/edu/stanford/nlp/trees/CollinsHeadFinder.java
Expand Up @@ -72,6 +72,8 @@ public CollinsHeadFinder(TreebankLanguagePack tlp, String... categoriesToAvoid)
nonTerminalInfo.put("TYPO", new String[][] {{"left"}}); // another crap rule, for Brown (Roger)
nonTerminalInfo.put("EDITED", new String[][] {{"left"}}); // crap rule for Switchboard (if don't delete EDITED nodes)
nonTerminalInfo.put("XS", new String[][] {{"right", "IN"}}); // rule for new structure in QP
// XSL is similar to XS, but is specifically for left-headed phrases
nonTerminalInfo.put("XSL", new String[][]{{"left"}});
}

@Override
Expand Down
20 changes: 8 additions & 12 deletions src/edu/stanford/nlp/trees/EnglishGrammaticalRelations.java
Expand Up @@ -980,16 +980,10 @@ private EnglishGrammaticalRelations() {}
public static final GrammaticalRelation QUANTIFIER_MODIFIER =
new GrammaticalRelation(Language.English, "quantmod", "quantifier modifier",
MODIFIER, "QP", tregexCompiler,
// RP is because sometimes "up" in "up to ___" gets tagged RP in PTB
// this is probably a mistake - generally it is tagged IN
// but sometimes the tagger follows suit
// there are no conflicts elsewhere in the targets of a QP,
// so there should be no need to specifically check for the phrase "up to" for `up_RP`
"QP < IN|RB|RBR|RBS|PDT|DT|JJ|JJR|JJS|XS|RP=target",
// TO is for the "to" in "up to ___"
// TODO: but currently not working for up_IN to_IN foo_CD, since it wants to make TO the head of IN!
"(QP < (TO=target < /^(?i:to)$/) < (__=up < /^(?i:up)$/)) : (=up $++ =target)");

// XS and XSL are here to match "up to" or similar phrases
// after the QPTreeTransformer's operation
"QP < IN|RB|RBR|RBS|PDT|DT|JJ|JJR|JJS|XS|XSL|RP=target"
);

/**
* The "noun compound modifier" grammatical relation. A noun compound
Expand Down Expand Up @@ -1275,15 +1269,17 @@ private EnglishGrammaticalRelations() {}
*/
public static final GrammaticalRelation MULTI_WORD_EXPRESSION =
new GrammaticalRelation(Language.English, "mwe", "multi-word expression",
MODIFIER, "PP|XS|ADVP|CONJP", tregexCompiler,
MODIFIER, "PP|XS|XSL|ADVP|CONJP", tregexCompiler,
"PP|XS < (IN|TO < as|of|at|to|in) < (JJ|IN|JJR|JJS|NN=target < such|because|Because|least|instead|due|Due|addition|to)",
"ADVP < (RB|IN < well) < (IN|RB|JJS=target < as)",
// TODO: perhaps the phrase "all but" is more like "all" and should have that as the head
"ADVP < (DT=target < all) < (CC < but)",
"CONJP < (RB < rather|well|instead) < (RB|IN=target < as|than|of)",
"CONJP < (IN < in) < (NN|TO=target < addition|to)",
// todo: note inconsistent head finding for "rather than"!
"XS < JJR|JJS=target" // more than, fewer than, well over -- maybe change some of these?
"XS < JJR|JJS=target", // more than, fewer than, well over -- maybe change some of these?
// currently only "up to"
"XSL < __=target"
);

/* mihai: this block needs to be uncommented to get the KBP 2010 system to work (due to the cached sentences using old code)
Expand Down
2 changes: 2 additions & 0 deletions src/edu/stanford/nlp/trees/ModCollinsHeadFinder.java
Expand Up @@ -138,6 +138,8 @@ public ModCollinsHeadFinder(TreebankLanguagePack tlp) {

nonTerminalInfo.put("META", new String[][] {{"left"}}); // rule for OntoNotes, but maybe should just be deleted in TreeReader??
nonTerminalInfo.put("XS", new String[][] {{"right", "IN"}}); // rule for new structure in QP, introduced by Stanford in QPTreeTransformer
// XSL is similar to XS, but is specifically for left-headed phrases
nonTerminalInfo.put("XSL", new String[][]{{"left"}});
// nonTerminalInfo.put(null, new String[][] {{"left"}}); // rule for OntoNotes from Michel, but it would be better to fix this in TreeReader or to use a default rule?

// todo: Uncomment this line if we always want to take the leftmost if no head rule is defined for the mother category.
Expand Down
12 changes: 11 additions & 1 deletion src/edu/stanford/nlp/trees/QPTreeTransformer.java
Expand Up @@ -67,6 +67,14 @@ public Tree transformTree(Tree t) {
// Tsurgeon operation applied together with flattenNPoverQPTregex in QPtransform.
// It runs three steps over the nodes that the tregex names "left" and "right":
// createSubtree groups the span from left through right under a new QP node,
// and the two excise steps then remove the left and right nodes themselves.
private static TsurgeonPattern flattenNPoverQPTsurgeon = Tsurgeon.parseOperation(
  "[createSubtree QP left right] [excise left left] [excise right right]");

/**
 * Captures the multi-word expression "up to" inside a QP so that it can be
 * wrapped in a left-headed XSL node.
 * <br>
 * The node named {@code left} is a child of the QP whose tag starts with
 * RB, IN or RP and which immediately dominates the word "up".  The node
 * named {@code right} is a child of the QP whose tag starts with IN or TO,
 * immediately dominates the word "to", and is the immediate right sister
 * ({@code $-}) of {@code left}.
 * <br>
 * The tag alternatives are wrapped in a non-capturing group so the {@code ^}
 * anchor applies to every alternative; written as {@code /^RB|IN|RP/} the
 * anchor only covers RB and the other tags match anywhere in the label.
 * The word tests are case insensitive, as the "up to" rule in
 * EnglishGrammaticalRelations used to be, so a sentence-initial "Up to"
 * is captured as well.
 */
private static TregexPattern multiwordXSLTregex =
  // once "up to" is captured in the XSL, the following XS operation won't accidentally grab it
  TregexPattern.compile("QP < ( /^(?:RB|IN|RP)/=left < /^(?i:up)$/ ) < ( /^(?:IN|TO)/=right < /^(?i:to)$/ $- =left )");

// Tsurgeon operation applied together with multiwordXSLTregex in QPtransform:
// groups the span from the node named "left" through the node named "right"
// under a newly created XSL node.
private static TsurgeonPattern multiwordXSLTsurgeon = Tsurgeon.parseOperation("createSubtree XSL left right");

private static TregexPattern multiwordXSTregex =
// TODO: should add NN and $ to the numeric expressions captured
// NN is for words such as "half" which are probably misparsed
Expand Down Expand Up @@ -109,8 +117,10 @@ public Tree transformTree(Tree t) {
*/
public Tree QPtransform(Tree t) {
t = Tsurgeon.processPattern(flattenNPoverQPTregex, flattenNPoverQPTsurgeon, t);
if ( ! universalDependencies)
if (!universalDependencies) {
t = Tsurgeon.processPattern(multiwordXSLTregex, multiwordXSLTsurgeon, t);
t = Tsurgeon.processPattern(multiwordXSTregex, multiwordXSTsurgeon, t);
}
t = Tsurgeon.processPattern(splitCCTregex, splitCCTsurgeon, t);
t = Tsurgeon.processPattern(splitMoneyTregex, splitMoneyTsurgeon, t);
return t;
Expand Down
2 changes: 2 additions & 0 deletions src/edu/stanford/nlp/trees/SemanticHeadFinder.java
Expand Up @@ -199,6 +199,8 @@ private void ruleChanges() {

// add the constituent XS (special node to add a layer in a QP tree introduced in our QPTreeTransformer)
nonTerminalInfo.put("XS", new String[][]{{"right", "IN"}});
// XSL is similar to XS, but is specifically for left-headed phrases
nonTerminalInfo.put("XSL", new String[][]{{"left"}});

// add a rule to deal with the CoNLL data
nonTerminalInfo.put("EMBED", new String[][]{{"right", "INTJ"}});
Expand Down
2 changes: 2 additions & 0 deletions src/edu/stanford/nlp/trees/UniversalSemanticHeadFinder.java
Expand Up @@ -181,6 +181,8 @@ private void ruleChanges() {

// add the constituent XS (special node to add a layer in a QP tree introduced in our QPTreeTransformer)
nonTerminalInfo.put("XS", new String[][]{{"right", "IN"}});
// XSL is similar to XS, but is specifically for left-headed phrases
nonTerminalInfo.put("XSL", new String[][]{{"left"}});

// add a rule to deal with the CoNLL data
nonTerminalInfo.put("EMBED", new String[][]{{"right", "INTJ"}});
Expand Down

0 comments on commit 9a86ece

Please sign in to comment.