The punchline to the previous few commits - now the words from the go…

…ld tree can be used to determine whether or not to eliminate the words in the guess tree. This will make it so the test & gold trees are the same length, hopefully eliminating most or all of the 'Unable to evaluate...' messages that happen after retagging trees with the POS tagger. Also do the same for the ChineseCollinizer and the NegraPennCollinizer. Both are tested using derivatives of the English test (using English trees, but with the tags specific to the other treebank).
stanfordnlp · Feb 24, 2023 · e433ca8 · e433ca8
1 parent 846a31b
commit e433ca8
Show file tree

Hide file tree

Showing 6 changed files with 171 additions and 32 deletions.
diff --git a/src/edu/stanford/nlp/parser/lexparser/NegraPennCollinizer.java b/src/edu/stanford/nlp/parser/lexparser/NegraPennCollinizer.java
@@ -48,12 +48,13 @@ private Tree transformTree(Tree guess, Iterator<Tree> goldPreterminals) {
     }
     String s = l.value();
     s = tlpp.treebankLanguagePack().basicCategory(s);
-    if (deletePunct) {
-      // this is broken as it's not the right thing to do when there
-      // is any tag ambiguity -- and there is for ' (POS/'').  Sentences
-      // can then have more or less words.  It's also unnecessary for EVALB,
-      // since it ignores punctuation anyway
-      if (guess.isPreTerminal() && tlpp.treebankLanguagePack().isEvalBIgnoredPunctuationTag(s)) {
+    if (deletePunct && guess.isPreTerminal()) {
+      // Eliminate unwanted (in terms of evaluation) punctuation
+      // by comparing the gold punctuation, not the guess tree
+      // This way, retagging does not change the results
+      Tree goldPT = goldPreterminals.next();
+      String goldTag = tlpp.treebankLanguagePack().basicCategory(goldPT.value());
+      if (tlpp.treebankLanguagePack().isEvalBIgnoredPunctuationTag(goldTag)) {
         return null;
       }
     }

diff --git a/src/edu/stanford/nlp/parser/lexparser/TreeCollinizer.java b/src/edu/stanford/nlp/parser/lexparser/TreeCollinizer.java
@@ -59,17 +59,7 @@ public Tree transformTree(Tree guess, Tree gold) {
     return transformTree(guess, Trees.preTerminals(gold).iterator());
   }
 
-  private Tree transformTree(Tree guess, Iterator<Tree> goldPreterminals) {
-    if (guess == null) return null;
-    TreeFactory tf = guess.treeFactory();
-
-    String s = guess.value();
-    if (tlp.isStartSymbol(s))
-      return transformTree(guess.firstChild(), goldPreterminals);
-
-    if (guess.isLeaf()) {
-      return tf.newLeaf(guess.label());
-    }
+  private String simplifyCategory(String s) {
     s = tlp.basicCategory(s);
     if (((whOption & 1) != 0) && s.startsWith("WH")) {
       s = s.substring(2);
@@ -82,14 +72,35 @@ private Tree transformTree(Tree guess, Iterator<Tree> goldPreterminals) {
     if (((whOption & 4) != 0) && s.startsWith("WH")) {
       s = s.substring(2);
     }
+    return s;
+  }
 
-    // wsg2010: Might need a better way to deal with tag ambiguity. This still doesn't handle the
-    // case where the GOLD tree does not label a punctuation mark as such (common in French), and
-    // the guess tree does.
-    if (deletePunct && guess.isPreTerminal() &&
-        (tlp.isEvalBIgnoredPunctuationTag(s) ||
-         tlp.isPunctuationWord(guess.firstChild().value()))) {
-      return null;
+  private Tree transformTree(Tree guess, Iterator<Tree> goldPreterminals) {
+    if (guess == null) return null;
+    TreeFactory tf = guess.treeFactory();
+
+    String s = guess.value();
+    if (tlp.isStartSymbol(s))
+      return transformTree(guess.firstChild(), goldPreterminals);
+
+    if (guess.isLeaf()) {
+      return tf.newLeaf(guess.label());
+    }
+    s = simplifyCategory(s);
+
+    // Using the gold tag (and gold word, just in case things are
+    // really weird) avoids a problem where the tagger might have used
+    // a punct tag when the gold tag is not punct, or vice versa.
+    // Otherwise, the transformed trees will be of different length,
+    // which makes scoring difficult if not impossible
+    if (deletePunct && guess.isPreTerminal()) {
+      Tree goldPT = goldPreterminals.next();
+      String goldCategory = goldPT.value();
+      goldCategory = simplifyCategory(goldCategory);
+      if (tlp.isEvalBIgnoredPunctuationTag(goldCategory) ||
+          tlp.isPunctuationWord(goldPT.firstChild().value())) {
+        return null;
+      }
     }
 
     // remove the extra NPs inserted in the collinsBaseNP option

diff --git a/src/edu/stanford/nlp/trees/international/pennchinese/ChineseCollinizer.java b/src/edu/stanford/nlp/trees/international/pennchinese/ChineseCollinizer.java
@@ -66,18 +66,22 @@ private Tree transformTree(Tree guess, Iterator<Tree> goldPreterminals, boolean
 
     // log.info("ChineseCollinizer: Node label is " + label);
 
-    // TODO: use the gold tree to delete the same punct from both trees
-    if (guess.isLeaf()) {
-      if (deletePunct && ctlp.isPunctuationWord(label)) {
+    // Eliminate unwanted (in terms of evaluation) punctuation
+    // by comparing the gold punctuation, not the guess tree
+    // This way, retagging does not change the results
+    if (guess.isPreTerminal() && deletePunct) {
+      Tree goldPT = goldPreterminals.next();
+      if (ctlp.isPunctuationTag(goldPT.label().value()) ||
+          ctlp.isPunctuationWord(goldPT.firstChild().label().value())) {
+        // System.out.println("Deleting punctuation");
         return null;
-      } else {
-        return tf.newLeaf(new StringLabel(label));
       }
     }
-    if (guess.isPreTerminal() && deletePunct && ctlp.isPunctuationTag(label)) {
-      // System.out.println("Deleting punctuation");
-      return null;
+
+    if (guess.isLeaf()) {
+      return tf.newLeaf(new StringLabel(label));
     }
+
     List<Tree> children = new ArrayList<>();
 
     if (label.matches("ROOT.*") && guess.numChildren() == 1) { // keep non-unary roots for now

diff --git a/test/src/edu/stanford/nlp/parser/lexparser/NegraPennCollinizerTest.java b/test/src/edu/stanford/nlp/parser/lexparser/NegraPennCollinizerTest.java
@@ -0,0 +1,42 @@
+package edu.stanford.nlp.parser.lexparser;
+
+import org.junit.Assert;
+import org.junit.Test;
+
+import edu.stanford.nlp.trees.Tree;
+
+public class NegraPennCollinizerTest {
+  @Test
+  public void testRemovePunct() {
+    NegraPennTreebankParserParams tlpp = new NegraPennTreebankParserParams();
+    NegraPennCollinizer collinizer = new NegraPennCollinizer(tlpp);
+
+    // Test that the collinizer removes a comma
+    // Lazy test writing: just use the English version, updated to work with the German tags
+    Tree gold = Tree.valueOf("(ROOT (S (S (NP (PRP I)) (VP (VBP like) (NP (JJ blue) (NN skin)))) ($, ,) (CC and) (S (NP (PRP I)) (VP (MD cannot) (VP (VB lie))))))");
+    Tree goldT = collinizer.transformTree(gold, gold);
+    Tree goldExpected = Tree.valueOf("(S (S (NP (PRP I)) (VP (VBP like) (NP (JJ blue) (NN skin)))) (CC and) (S (NP (PRP I)) (VP (MD cannot) (VP (VB lie)))))");
+    Assert.assertEquals(goldExpected, goldT);
+
+    // Same test, but it should pick up the comma just based on the tag
+    gold = Tree.valueOf("(ROOT (S (S (NP (PRP I)) (VP (VBP like) (NP (JJ blue) (NN skin)))) ($, zzzzz) (CC and) (S (NP (PRP I)) (VP (MD cannot) (VP (VB lie))))))");
+    goldT = collinizer.transformTree(gold, gold);
+    Assert.assertEquals(goldExpected, goldT);
+
+    // Difference with the English: the Negra collinizer does not look at punct words
+    // Perhaps that was a mistake?
+    gold = Tree.valueOf("(S (S (NP (PRP I)) (VP (VBP like) (NP (JJ blue) (NN skin)))) (CC ,) (CC and) (S (NP (PRP I)) (VP (MD cannot) (VP (VB lie)))))");
+    goldT = collinizer.transformTree(gold, gold);
+    Assert.assertEquals(gold, goldT);
+
+    // Double check that (CC zzzzz) is not deleted by default
+    Tree guess = Tree.valueOf("(S (S (NP (PRP I)) (VP (VBP like) (NP (JJ blue) (NN skin)))) (CC zzzzz) (CC and) (S (NP (PRP I)) (VP (MD cannot) (VP (VB lie)))))");
+    Tree guessT = collinizer.transformTree(guess, guess);
+    Assert.assertEquals(guess, guessT);
+
+    // Check that the guess tree has the non-punct word removed if it is a punct in the gold tree
+    gold = Tree.valueOf("(ROOT (S (S (NP (PRP I)) (VP (VBP like) (NP (JJ blue) (NN skin)))) ($, zzzzz) (CC and) (S (NP (PRP I)) (VP (MD cannot) (VP (VB lie))))))");
+    guessT = collinizer.transformTree(guess, gold);
+    Assert.assertEquals(goldExpected, guessT);
+  }
+}
diff --git a/test/src/edu/stanford/nlp/parser/lexparser/TreeCollinizerTest.java b/test/src/edu/stanford/nlp/parser/lexparser/TreeCollinizerTest.java
@@ -0,0 +1,40 @@
+package edu.stanford.nlp.parser.lexparser;
+
+import org.junit.Assert;
+import org.junit.Test;
+
+import edu.stanford.nlp.trees.PennTreebankLanguagePack;
+import edu.stanford.nlp.trees.Tree;
+
+public class TreeCollinizerTest {
+  @Test
+  public void testRemovePunct() {
+    PennTreebankLanguagePack tlp = new PennTreebankLanguagePack();
+    TreeCollinizer collinizer = new TreeCollinizer(tlp);
+
+    // Test that the collinizer removes a comma
+    Tree gold = Tree.valueOf("(ROOT (S (S (NP (PRP I)) (VP (VBP like) (NP (JJ blue) (NN skin)))) (, ,) (CC and) (S (NP (PRP I)) (VP (MD cannot) (VP (VB lie))))))");
+    Tree goldT = collinizer.transformTree(gold, gold);
+    Tree goldExpected = Tree.valueOf("(S (S (NP (PRP I)) (VP (VBP like) (NP (JJ blue) (NN skin)))) (CC and) (S (NP (PRP I)) (VP (MD cannot) (VP (VB lie)))))");
+    Assert.assertEquals(goldExpected, goldT);
+
+    // Same test, but it should pick up the comma just based on the tag
+    gold = Tree.valueOf("(ROOT (S (S (NP (PRP I)) (VP (VBP like) (NP (JJ blue) (NN skin)))) (, zzzzz) (CC and) (S (NP (PRP I)) (VP (MD cannot) (VP (VB lie))))))");
+    goldT = collinizer.transformTree(gold, gold);
+    Assert.assertEquals(goldExpected, goldT);
+
+    // It should also pick up the comma based on the word
+    gold = Tree.valueOf("(S (S (NP (PRP I)) (VP (VBP like) (NP (JJ blue) (NN skin)))) (CC ,) (CC and) (S (NP (PRP I)) (VP (MD cannot) (VP (VB lie)))))");
+    goldT = collinizer.transformTree(gold, gold);
+    Assert.assertEquals(goldExpected, goldT);
+
+    // Double check that (CC zzzzz) is not deleted by default
+    Tree guess = Tree.valueOf("(S (S (NP (PRP I)) (VP (VBP like) (NP (JJ blue) (NN skin)))) (CC zzzzz) (CC and) (S (NP (PRP I)) (VP (MD cannot) (VP (VB lie)))))");
+    Tree guessT = collinizer.transformTree(guess, guess);
+    Assert.assertEquals(guess, guessT);
+
+    // Check that the guess tree has the non-punct word removed if it is a punct in the gold tree
+    guessT = collinizer.transformTree(guess, gold);
+    Assert.assertEquals(goldExpected, guessT);
+  }
+}
diff --git a/test/src/edu/stanford/nlp/trees/international/pennchinese/ChineseCollinizerTest.java b/test/src/edu/stanford/nlp/trees/international/pennchinese/ChineseCollinizerTest.java
@@ -0,0 +1,41 @@
+package edu.stanford.nlp.trees.international.pennchinese;
+
+import org.junit.Assert;
+import org.junit.Test;
+
+import edu.stanford.nlp.trees.Tree;
+import edu.stanford.nlp.trees.international.pennchinese.ChineseTreebankLanguagePack;
+
+public class ChineseCollinizerTest {
+  @Test
+  public void testRemovePunct() {
+    ChineseTreebankLanguagePack tlp = new ChineseTreebankLanguagePack();
+    ChineseCollinizer collinizer = new ChineseCollinizer(tlp);
+
+    // Test that the collinizer removes a comma
+    // Lazy test writing: just use the English version, updated to work with the Chinese tags
+    Tree gold = Tree.valueOf("(ROOT (S (S (NP (PRP I)) (VP (VBP like) (NP (JJ blue) (NN skin)))) (PU ,) (CC and) (S (NP (PRP I)) (VP (MD cannot) (VP (VB lie))))))");
+    Tree goldT = collinizer.transformTree(gold, gold);
+    Tree goldExpected = Tree.valueOf("(S (S (NP (PRP I)) (VP (VBP like) (NP (JJ blue) (NN skin)))) (CC and) (S (NP (PRP I)) (VP (MD cannot) (VP (VB lie)))))");
+    Assert.assertEquals(goldExpected, goldT);
+
+    // Same test, but it should pick up the comma just based on the tag
+    gold = Tree.valueOf("(ROOT (S (S (NP (PRP I)) (VP (VBP like) (NP (JJ blue) (NN skin)))) (PU zzzzz) (CC and) (S (NP (PRP I)) (VP (MD cannot) (VP (VB lie))))))");
+    goldT = collinizer.transformTree(gold, gold);
+    Assert.assertEquals(goldExpected, goldT);
+
+    // It should also pick up the comma based on the word
+    gold = Tree.valueOf("(S (S (NP (PRP I)) (VP (VBP like) (NP (JJ blue) (NN skin)))) (CC ,) (CC and) (S (NP (PRP I)) (VP (MD cannot) (VP (VB lie)))))");
+    goldT = collinizer.transformTree(gold, gold);
+    Assert.assertEquals(goldExpected, goldT);
+
+    // Double check that (CC zzzzz) is not deleted by default
+    Tree guess = Tree.valueOf("(S (S (NP (PRP I)) (VP (VBP like) (NP (JJ blue) (NN skin)))) (CC zzzzz) (CC and) (S (NP (PRP I)) (VP (MD cannot) (VP (VB lie)))))");
+    Tree guessT = collinizer.transformTree(guess, guess);
+    Assert.assertEquals(guess, guessT);
+
+    // Check that the guess tree has the non-punct word removed if it is a punct in the gold tree
+    guessT = collinizer.transformTree(guess, gold);
+    Assert.assertEquals(goldExpected, guessT);
+  }
+}