Skip to content

Commit

Permalink
The punchline to the previous few commits - now the words from the go…
Browse files Browse the repository at this point in the history
…ld tree can be used to determine whether or not to eliminate the words in the guess tree. This will make it so the test & gold trees are the same, hopefully eliminating most or all of the 'Unable to evaluate...' that happens after retagging trees with the POS tagger

Also do the ChineseCollinizer and the NegraPennCollinizer.
Both are tested using derivatives of the English test
(using English trees, but with the tags specific for the other treebank)
  • Loading branch information
AngledLuffa committed Feb 24, 2023
1 parent 846a31b commit e433ca8
Show file tree
Hide file tree
Showing 6 changed files with 171 additions and 32 deletions.
13 changes: 7 additions & 6 deletions src/edu/stanford/nlp/parser/lexparser/NegraPennCollinizer.java
Original file line number Diff line number Diff line change
Expand Up @@ -48,12 +48,13 @@ private Tree transformTree(Tree guess, Iterator<Tree> goldPreterminals) {
}
String s = l.value();
s = tlpp.treebankLanguagePack().basicCategory(s);
if (deletePunct) {
// this is broken as it's not the right thing to do when there
// is any tag ambiguity -- and there is for ' (POS/''). Sentences
// can then have more or less words. It's also unnecessary for EVALB,
// since it ignores punctuation anyway
if (guess.isPreTerminal() && tlpp.treebankLanguagePack().isEvalBIgnoredPunctuationTag(s)) {
if (deletePunct && guess.isPreTerminal()) {
// Eliminate unwanted (in terms of evaluation) punctuation
// by comparing the gold punctuation, not the guess tree
// This way, retagging does not change the results
Tree goldPT = goldPreterminals.next();
String goldTag = tlpp.treebankLanguagePack().basicCategory(goldPT.value());
if (tlpp.treebankLanguagePack().isEvalBIgnoredPunctuationTag(goldTag)) {
return null;
}
}
Expand Down
47 changes: 29 additions & 18 deletions src/edu/stanford/nlp/parser/lexparser/TreeCollinizer.java
Original file line number Diff line number Diff line change
Expand Up @@ -59,17 +59,7 @@ public Tree transformTree(Tree guess, Tree gold) {
return transformTree(guess, Trees.preTerminals(gold).iterator());
}

private Tree transformTree(Tree guess, Iterator<Tree> goldPreterminals) {
if (guess == null) return null;
TreeFactory tf = guess.treeFactory();

String s = guess.value();
if (tlp.isStartSymbol(s))
return transformTree(guess.firstChild(), goldPreterminals);

if (guess.isLeaf()) {
return tf.newLeaf(guess.label());
}
private String simplifyCategory(String s) {
s = tlp.basicCategory(s);
if (((whOption & 1) != 0) && s.startsWith("WH")) {
s = s.substring(2);
Expand All @@ -82,14 +72,35 @@ private Tree transformTree(Tree guess, Iterator<Tree> goldPreterminals) {
if (((whOption & 4) != 0) && s.startsWith("WH")) {
s = s.substring(2);
}
return s;
}

// wsg2010: Might need a better way to deal with tag ambiguity. This still doesn't handle the
// case where the GOLD tree does not label a punctuation mark as such (common in French), and
// the guess tree does.
if (deletePunct && guess.isPreTerminal() &&
(tlp.isEvalBIgnoredPunctuationTag(s) ||
tlp.isPunctuationWord(guess.firstChild().value()))) {
return null;
private Tree transformTree(Tree guess, Iterator<Tree> goldPreterminals) {
if (guess == null) return null;
TreeFactory tf = guess.treeFactory();

String s = guess.value();
if (tlp.isStartSymbol(s))
return transformTree(guess.firstChild(), goldPreterminals);

if (guess.isLeaf()) {
return tf.newLeaf(guess.label());
}
s = simplifyCategory(s);

// Using the gold tag (and gold word, just in case things are
// really weird) avoids a problem where the tagger might have used
// a punct tag when the gold tag is not punct, or vice versa.
// Otherwise, the transformed trees will be of different length,
// which makes scoring difficult if not impossible
if (deletePunct && guess.isPreTerminal()) {
Tree goldPT = goldPreterminals.next();
String goldCategory = goldPT.value();
goldCategory = simplifyCategory(goldCategory);
if (tlp.isEvalBIgnoredPunctuationTag(goldCategory) ||
tlp.isPunctuationWord(goldPT.firstChild().value())) {
return null;
}
}

// remove the extra NPs inserted in the collinsBaseNP option
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -66,18 +66,22 @@ private Tree transformTree(Tree guess, Iterator<Tree> goldPreterminals, boolean

// log.info("ChineseCollinizer: Node label is " + label);

// TODO: use the gold tree to delete the same punct from both trees
if (guess.isLeaf()) {
if (deletePunct && ctlp.isPunctuationWord(label)) {
// Eliminate unwanted (in terms of evaluation) punctuation
// by comparing the gold punctuation, not the guess tree
// This way, retagging does not change the results
if (guess.isPreTerminal() && deletePunct) {
Tree goldPT = goldPreterminals.next();
if (ctlp.isPunctuationTag(goldPT.label().value()) ||
ctlp.isPunctuationWord(goldPT.firstChild().label().value())) {
// System.out.println("Deleting punctuation");
return null;
} else {
return tf.newLeaf(new StringLabel(label));
}
}
if (guess.isPreTerminal() && deletePunct && ctlp.isPunctuationTag(label)) {
// System.out.println("Deleting punctuation");
return null;

if (guess.isLeaf()) {
return tf.newLeaf(new StringLabel(label));
}

List<Tree> children = new ArrayList<>();

if (label.matches("ROOT.*") && guess.numChildren() == 1) { // keep non-unary roots for now
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
package edu.stanford.nlp.parser.lexparser;

import org.junit.Assert;
import org.junit.Test;

import edu.stanford.nlp.trees.Tree;

/**
 * Checks that {@link NegraPennCollinizer} drops punctuation preterminals
 * according to the tags of the gold tree rather than the guess tree.
 *
 * Lazy test writing: the trees are the English ones, updated to work with
 * the German tags.
 */
public class NegraPennCollinizerTest {
  /** Gold tree where the comma has both a punct tag and a punct word. */
  private static final String ROOTED_PUNCT_TAG_PUNCT_WORD =
      "(ROOT (S (S (NP (PRP I)) (VP (VBP like) (NP (JJ blue) (NN skin)))) ($, ,) (CC and) (S (NP (PRP I)) (VP (MD cannot) (VP (VB lie))))))";

  /** Gold tree where only the tag marks the token as punctuation. */
  private static final String ROOTED_PUNCT_TAG_PLAIN_WORD =
      "(ROOT (S (S (NP (PRP I)) (VP (VBP like) (NP (JJ blue) (NN skin)))) ($, zzzzz) (CC and) (S (NP (PRP I)) (VP (MD cannot) (VP (VB lie))))))";

  /** Tree where only the word looks like punctuation; the tag is CC. */
  private static final String PLAIN_TAG_PUNCT_WORD =
      "(S (S (NP (PRP I)) (VP (VBP like) (NP (JJ blue) (NN skin)))) (CC ,) (CC and) (S (NP (PRP I)) (VP (MD cannot) (VP (VB lie)))))";

  /** Tree with neither a punct tag nor a punct word in the middle slot. */
  private static final String PLAIN_TAG_PLAIN_WORD =
      "(S (S (NP (PRP I)) (VP (VBP like) (NP (JJ blue) (NN skin)))) (CC zzzzz) (CC and) (S (NP (PRP I)) (VP (MD cannot) (VP (VB lie)))))";

  /** The tree expected once the middle token has been deleted. */
  private static final String PUNCT_REMOVED =
      "(S (S (NP (PRP I)) (VP (VBP like) (NP (JJ blue) (NN skin)))) (CC and) (S (NP (PRP I)) (VP (MD cannot) (VP (VB lie)))))";

  /** Transforms {@code guess} against {@code gold} and compares the result with {@code expected}. */
  private static void assertCollinized(NegraPennCollinizer collinizer, Tree guess, Tree gold, Tree expected) {
    Tree transformed = collinizer.transformTree(guess, gold);
    Assert.assertEquals(expected, transformed);
  }

  @Test
  public void testRemovePunct() {
    NegraPennCollinizer collinizer = new NegraPennCollinizer(new NegraPennTreebankParserParams());
    Tree punctRemoved = Tree.valueOf(PUNCT_REMOVED);

    // Test that the collinizer removes a comma
    Tree tagAndWord = Tree.valueOf(ROOTED_PUNCT_TAG_PUNCT_WORD);
    assertCollinized(collinizer, tagAndWord, tagAndWord, punctRemoved);

    // Same test, but it should pick up the comma just based on the tag
    Tree tagOnly = Tree.valueOf(ROOTED_PUNCT_TAG_PLAIN_WORD);
    assertCollinized(collinizer, tagOnly, tagOnly, punctRemoved);

    // Difference with the English: the Negra collinizer does not look at punct words
    // Perhaps that was a mistake?
    Tree wordOnly = Tree.valueOf(PLAIN_TAG_PUNCT_WORD);
    assertCollinized(collinizer, wordOnly, wordOnly, wordOnly);

    // Double check that (CC zzzzz) is not deleted by default
    Tree guess = Tree.valueOf(PLAIN_TAG_PLAIN_WORD);
    assertCollinized(collinizer, guess, guess, guess);

    // Check that the guess tree has the non-punct word removed if it is a punct in the gold tree
    Tree goldWithPunctTag = Tree.valueOf(ROOTED_PUNCT_TAG_PLAIN_WORD);
    assertCollinized(collinizer, guess, goldWithPunctTag, punctRemoved);
  }
}
40 changes: 40 additions & 0 deletions test/src/edu/stanford/nlp/parser/lexparser/TreeCollinizerTest.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
package edu.stanford.nlp.parser.lexparser;

import org.junit.Assert;
import org.junit.Test;

import edu.stanford.nlp.trees.PennTreebankLanguagePack;
import edu.stanford.nlp.trees.Tree;

/**
 * Checks that {@link TreeCollinizer} drops punctuation preterminals based on
 * the gold tree's tag or word rather than on the guess tree.
 */
public class TreeCollinizerTest {
  /** Gold tree where the comma has both a punct tag and a punct word. */
  private static final String ROOTED_PUNCT_TAG_PUNCT_WORD =
      "(ROOT (S (S (NP (PRP I)) (VP (VBP like) (NP (JJ blue) (NN skin)))) (, ,) (CC and) (S (NP (PRP I)) (VP (MD cannot) (VP (VB lie))))))";

  /** Gold tree where only the tag marks the token as punctuation. */
  private static final String ROOTED_PUNCT_TAG_PLAIN_WORD =
      "(ROOT (S (S (NP (PRP I)) (VP (VBP like) (NP (JJ blue) (NN skin)))) (, zzzzz) (CC and) (S (NP (PRP I)) (VP (MD cannot) (VP (VB lie))))))";

  /** Tree where only the word marks the token as punctuation; the tag is CC. */
  private static final String PLAIN_TAG_PUNCT_WORD =
      "(S (S (NP (PRP I)) (VP (VBP like) (NP (JJ blue) (NN skin)))) (CC ,) (CC and) (S (NP (PRP I)) (VP (MD cannot) (VP (VB lie)))))";

  /** Tree with neither a punct tag nor a punct word in the middle slot. */
  private static final String PLAIN_TAG_PLAIN_WORD =
      "(S (S (NP (PRP I)) (VP (VBP like) (NP (JJ blue) (NN skin)))) (CC zzzzz) (CC and) (S (NP (PRP I)) (VP (MD cannot) (VP (VB lie)))))";

  /** The tree expected once the middle token has been deleted. */
  private static final String PUNCT_REMOVED =
      "(S (S (NP (PRP I)) (VP (VBP like) (NP (JJ blue) (NN skin)))) (CC and) (S (NP (PRP I)) (VP (MD cannot) (VP (VB lie)))))";

  /** Transforms {@code guess} against {@code gold} and compares the result with {@code expected}. */
  private static void assertCollinized(TreeCollinizer collinizer, Tree guess, Tree gold, Tree expected) {
    Tree transformed = collinizer.transformTree(guess, gold);
    Assert.assertEquals(expected, transformed);
  }

  @Test
  public void testRemovePunct() {
    TreeCollinizer collinizer = new TreeCollinizer(new PennTreebankLanguagePack());
    Tree punctRemoved = Tree.valueOf(PUNCT_REMOVED);

    // Test that the collinizer removes a comma
    Tree tagAndWord = Tree.valueOf(ROOTED_PUNCT_TAG_PUNCT_WORD);
    assertCollinized(collinizer, tagAndWord, tagAndWord, punctRemoved);

    // Same test, but it should pick up the comma just based on the tag
    Tree tagOnly = Tree.valueOf(ROOTED_PUNCT_TAG_PLAIN_WORD);
    assertCollinized(collinizer, tagOnly, tagOnly, punctRemoved);

    // It should also pick up the comma based on the word
    Tree wordOnly = Tree.valueOf(PLAIN_TAG_PUNCT_WORD);
    assertCollinized(collinizer, wordOnly, wordOnly, punctRemoved);

    // Double check that (CC zzzzz) is not deleted by default
    Tree guess = Tree.valueOf(PLAIN_TAG_PLAIN_WORD);
    assertCollinized(collinizer, guess, guess, guess);

    // Check that the guess tree has the non-punct word removed if it is a punct in the gold tree
    // (the gold tree here is the one whose middle token is a punct word)
    assertCollinized(collinizer, guess, wordOnly, punctRemoved);
  }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
package edu.stanford.nlp.trees.international.pennchinese;

import org.junit.Assert;
import org.junit.Test;

import edu.stanford.nlp.trees.Tree;
import edu.stanford.nlp.trees.international.pennchinese.ChineseTreebankLanguagePack;

/**
 * Checks that {@link ChineseCollinizer} drops punctuation preterminals based
 * on the gold tree's tag or word rather than on the guess tree.
 *
 * Lazy test writing: the trees are the English ones, updated to work with
 * the Chinese tags.
 */
public class ChineseCollinizerTest {
  /** Gold tree where the comma has both a punct tag and a punct word. */
  private static final String ROOTED_PUNCT_TAG_PUNCT_WORD =
      "(ROOT (S (S (NP (PRP I)) (VP (VBP like) (NP (JJ blue) (NN skin)))) (PU ,) (CC and) (S (NP (PRP I)) (VP (MD cannot) (VP (VB lie))))))";

  /** Gold tree where only the tag marks the token as punctuation. */
  private static final String ROOTED_PUNCT_TAG_PLAIN_WORD =
      "(ROOT (S (S (NP (PRP I)) (VP (VBP like) (NP (JJ blue) (NN skin)))) (PU zzzzz) (CC and) (S (NP (PRP I)) (VP (MD cannot) (VP (VB lie))))))";

  /** Tree where only the word marks the token as punctuation; the tag is CC. */
  private static final String PLAIN_TAG_PUNCT_WORD =
      "(S (S (NP (PRP I)) (VP (VBP like) (NP (JJ blue) (NN skin)))) (CC ,) (CC and) (S (NP (PRP I)) (VP (MD cannot) (VP (VB lie)))))";

  /** Tree with neither a punct tag nor a punct word in the middle slot. */
  private static final String PLAIN_TAG_PLAIN_WORD =
      "(S (S (NP (PRP I)) (VP (VBP like) (NP (JJ blue) (NN skin)))) (CC zzzzz) (CC and) (S (NP (PRP I)) (VP (MD cannot) (VP (VB lie)))))";

  /** The tree expected once the middle token has been deleted. */
  private static final String PUNCT_REMOVED =
      "(S (S (NP (PRP I)) (VP (VBP like) (NP (JJ blue) (NN skin)))) (CC and) (S (NP (PRP I)) (VP (MD cannot) (VP (VB lie)))))";

  /** Transforms {@code guess} against {@code gold} and compares the result with {@code expected}. */
  private static void assertCollinized(ChineseCollinizer collinizer, Tree guess, Tree gold, Tree expected) {
    Tree transformed = collinizer.transformTree(guess, gold);
    Assert.assertEquals(expected, transformed);
  }

  @Test
  public void testRemovePunct() {
    ChineseCollinizer collinizer = new ChineseCollinizer(new ChineseTreebankLanguagePack());
    Tree punctRemoved = Tree.valueOf(PUNCT_REMOVED);

    // Test that the collinizer removes a comma
    Tree tagAndWord = Tree.valueOf(ROOTED_PUNCT_TAG_PUNCT_WORD);
    assertCollinized(collinizer, tagAndWord, tagAndWord, punctRemoved);

    // Same test, but it should pick up the comma just based on the tag
    Tree tagOnly = Tree.valueOf(ROOTED_PUNCT_TAG_PLAIN_WORD);
    assertCollinized(collinizer, tagOnly, tagOnly, punctRemoved);

    // It should also pick up the comma based on the word
    Tree wordOnly = Tree.valueOf(PLAIN_TAG_PUNCT_WORD);
    assertCollinized(collinizer, wordOnly, wordOnly, punctRemoved);

    // Double check that (CC zzzzz) is not deleted by default
    Tree guess = Tree.valueOf(PLAIN_TAG_PLAIN_WORD);
    assertCollinized(collinizer, guess, guess, guess);

    // Check that the guess tree has the non-punct word removed if it is a punct in the gold tree
    // (the gold tree here is the one whose middle token is a punct word)
    assertCollinized(collinizer, guess, wordOnly, punctRemoved);
  }
}

0 comments on commit e433ca8

Please sign in to comment.