Add a fake XSL node when converting constituency trees to dependencies for SD. This makes it easy to treat 'up to' as an MWE. #1363
stanfordnlp · Jul 8, 2023 · 9a86ece · 9a86ece
1 parent 8c46648
commit 9a86ece
Show file tree

Hide file tree

Showing 6 changed files with 27 additions and 13 deletions.
diff --git a/src/edu/stanford/nlp/trees/CollinsHeadFinder.java b/src/edu/stanford/nlp/trees/CollinsHeadFinder.java
@@ -72,6 +72,8 @@ public CollinsHeadFinder(TreebankLanguagePack tlp, String... categoriesToAvoid)
     nonTerminalInfo.put("TYPO", new String[][] {{"left"}}); // another crap rule, for Brown (Roger)
     nonTerminalInfo.put("EDITED", new String[][] {{"left"}});  // crap rule for Switchboard (if don't delete EDITED nodes)
     nonTerminalInfo.put("XS", new String[][] {{"right", "IN"}}); // rule for new structure in QP
+    // XSL is similar to XS, but is specifically for left headed phrases
+    nonTerminalInfo.put("XSL", new String[][]{{"left"}});
   }
 
   @Override

diff --git a/src/edu/stanford/nlp/trees/EnglishGrammaticalRelations.java b/src/edu/stanford/nlp/trees/EnglishGrammaticalRelations.java
@@ -980,16 +980,10 @@ private EnglishGrammaticalRelations() {}
   public static final GrammaticalRelation QUANTIFIER_MODIFIER =
     new GrammaticalRelation(Language.English, "quantmod", "quantifier modifier",
         MODIFIER, "QP", tregexCompiler,
-            // RP is because sometimes "up" in "up to ___" gets tagged RP in PTB
-            // this is probably a mistake - generally it is tagged IN
-            // but sometimes the tagger follows suit
-            // there are no conflicts elsewhere in the targets of a QP,
-            // so there should be no need to specifically check for the phrase "up to" for `up_RP`
-            "QP < IN|RB|RBR|RBS|PDT|DT|JJ|JJR|JJS|XS|RP=target",
-            // TO is for the "to" in "up to ___"
-            // TODO: but currently not working for up_IN to_IN foo_CD, since it wants to make TO the head of IN!
-            "(QP < (TO=target < /^(?i:to)$/) < (__=up < /^(?i:up)$/)) : (=up $++ =target)");
-
+            // XS and XSL are included to match "up to" or similar phrases
+            // after the QPTreeTransformer's operation
+            "QP < IN|RB|RBR|RBS|PDT|DT|JJ|JJR|JJS|XS|XSL|RP=target"
+        );
 
   /**
    * The "noun compound modifier" grammatical relation.  A noun compound
@@ -1275,15 +1269,17 @@ private EnglishGrammaticalRelations() {}
    */
   public static final GrammaticalRelation MULTI_WORD_EXPRESSION =
     new GrammaticalRelation(Language.English, "mwe", "multi-word expression",
-        MODIFIER, "PP|XS|ADVP|CONJP", tregexCompiler,
+        MODIFIER, "PP|XS|XSL|ADVP|CONJP", tregexCompiler,
             "PP|XS < (IN|TO < as|of|at|to|in) < (JJ|IN|JJR|JJS|NN=target < such|because|Because|least|instead|due|Due|addition|to)",
             "ADVP < (RB|IN < well) < (IN|RB|JJS=target < as)",
             // TODO: perhaps the phrase "all but" is more like "all" and should have that as the head
             "ADVP < (DT=target < all) < (CC < but)",
             "CONJP < (RB < rather|well|instead) < (RB|IN=target < as|than|of)",
             "CONJP < (IN < in) < (NN|TO=target < addition|to)",
             // todo: note inconsistent head finding for "rather than"!
-            "XS < JJR|JJS=target" // more than, fewer than, well over -- maybe change some of these?
+            "XS < JJR|JJS=target", // more than, fewer than, well over -- maybe change some of these?
+            // currently only "up to"
+            "XSL < __=target"
     );
 
   /* mihai: this block needs to be uncommented to get the KBP 2010 system to work (due to the cached sentences using old code)

diff --git a/src/edu/stanford/nlp/trees/ModCollinsHeadFinder.java b/src/edu/stanford/nlp/trees/ModCollinsHeadFinder.java
@@ -138,6 +138,8 @@ public ModCollinsHeadFinder(TreebankLanguagePack tlp) {
 
     nonTerminalInfo.put("META", new String[][] {{"left"}});  // rule for OntoNotes, but maybe should just be deleted in TreeReader??
     nonTerminalInfo.put("XS", new String[][] {{"right", "IN"}}); // rule for new structure in QP, introduced by Stanford in QPTreeTransformer
+    // XSL is similar to XS, but is specifically for left headed phrases
+    nonTerminalInfo.put("XSL", new String[][]{{"left"}});
     // nonTerminalInfo.put(null, new String[][] {{"left"}});  // rule for OntoNotes from Michel, but it would be better to fix this in TreeReader or to use a default rule?
 
     // todo: Uncomment this line if we always want to take the leftmost if no head rule is defined for the mother category.

diff --git a/src/edu/stanford/nlp/trees/QPTreeTransformer.java b/src/edu/stanford/nlp/trees/QPTreeTransformer.java
@@ -67,6 +67,14 @@ public Tree transformTree(Tree t) {
   private static TsurgeonPattern flattenNPoverQPTsurgeon =
     Tsurgeon.parseOperation("[createSubtree QP left right] [excise left left] [excise right right]");
 
+  private static TregexPattern multiwordXSLTregex =
+    // captures "up to" in any casing (e.g. sentence-initial "Up to") so the following XS operation won't accidentally grab it
+    // tag alternations are grouped so ^ anchors every alternative; a bare /^RB|IN|RP/ also matches any label merely containing IN or RP
+    TregexPattern.compile("QP < ( /^(?:RB|IN|RP)/=left < /^(?i:up)$/ ) < ( /^(?:IN|TO)/=right < /^(?i:to)$/ $- =left )");
+
+  private static TsurgeonPattern multiwordXSLTsurgeon =
+    Tsurgeon.parseOperation("createSubtree XSL left right");
+
   private static TregexPattern multiwordXSTregex =
     // TODO: should add NN and $ to the numeric expressions captured
     //   NN is for words such as "half" which are probably misparsed
@@ -109,8 +117,10 @@ public Tree transformTree(Tree t) {
    */
   public Tree QPtransform(Tree t) {
     t = Tsurgeon.processPattern(flattenNPoverQPTregex, flattenNPoverQPTsurgeon, t);
-    if ( ! universalDependencies)
+    if (!universalDependencies) {
+      t = Tsurgeon.processPattern(multiwordXSLTregex, multiwordXSLTsurgeon, t);
       t = Tsurgeon.processPattern(multiwordXSTregex, multiwordXSTsurgeon, t);
+    }
     t = Tsurgeon.processPattern(splitCCTregex, splitCCTsurgeon, t);
     t = Tsurgeon.processPattern(splitMoneyTregex, splitMoneyTsurgeon, t);
     return t;

diff --git a/src/edu/stanford/nlp/trees/SemanticHeadFinder.java b/src/edu/stanford/nlp/trees/SemanticHeadFinder.java
@@ -199,6 +199,8 @@ private void ruleChanges() {
 
     // add the constituent XS (special node to add a layer in a QP tree introduced in our QPTreeTransformer)
     nonTerminalInfo.put("XS", new String[][]{{"right", "IN"}});
+    // XSL is similar to XS, but is specifically for left headed phrases
+    nonTerminalInfo.put("XSL", new String[][]{{"left"}});
 
     // add a rule to deal with the CoNLL data
     nonTerminalInfo.put("EMBED", new String[][]{{"right", "INTJ"}});

diff --git a/src/edu/stanford/nlp/trees/UniversalSemanticHeadFinder.java b/src/edu/stanford/nlp/trees/UniversalSemanticHeadFinder.java
@@ -181,6 +181,8 @@ private void ruleChanges() {
 
     // add the constituent XS (special node to add a layer in a QP tree introduced in our QPTreeTransformer)
     nonTerminalInfo.put("XS", new String[][]{{"right", "IN"}});
+    // XSL is similar to XS, but is specifically for left headed phrases
+    nonTerminalInfo.put("XSL", new String[][]{{"left"}});
 
     // add a rule to deal with the CoNLL data
     nonTerminalInfo.put("EMBED", new String[][]{{"right", "INTJ"}});