-
Notifications
You must be signed in to change notification settings - Fork 2.7k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
The punchline to the previous few commits - now the words from the gold tree can be used to determine whether or not to eliminate the words in the guess tree. This will make it so the test & gold trees are the same, hopefully eliminating most or all of the 'Unable to evaluate...' errors that happen after retagging trees with the POS tagger. Also do the ChineseCollinizer and the NegraPennCollinizer. Both are tested using derivatives of the English test (using English trees, but with the tags specific to the other treebank).
- Loading branch information
1 parent
846a31b
commit e433ca8
Showing
6 changed files
with
171 additions
and
32 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
42 changes: 42 additions & 0 deletions
42
test/src/edu/stanford/nlp/parser/lexparser/NegraPennCollinizerTest.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,42 @@ | ||
package edu.stanford.nlp.parser.lexparser; | ||
|
||
import org.junit.Assert; | ||
import org.junit.Test; | ||
|
||
import edu.stanford.nlp.trees.Tree; | ||
|
||
public class NegraPennCollinizerTest { | ||
@Test | ||
public void testRemovePunct() { | ||
NegraPennTreebankParserParams tlpp = new NegraPennTreebankParserParams(); | ||
NegraPennCollinizer collinizer = new NegraPennCollinizer(tlpp); | ||
|
||
// Test that the collinizer removes a comma | ||
// Lazy test writing: just use the English version, updated to work with the German tags | ||
Tree gold = Tree.valueOf("(ROOT (S (S (NP (PRP I)) (VP (VBP like) (NP (JJ blue) (NN skin)))) ($, ,) (CC and) (S (NP (PRP I)) (VP (MD cannot) (VP (VB lie))))))"); | ||
Tree goldT = collinizer.transformTree(gold, gold); | ||
Tree goldExpected = Tree.valueOf("(S (S (NP (PRP I)) (VP (VBP like) (NP (JJ blue) (NN skin)))) (CC and) (S (NP (PRP I)) (VP (MD cannot) (VP (VB lie)))))"); | ||
Assert.assertEquals(goldExpected, goldT); | ||
|
||
// Same test, but it should pick up the comma just based on the tag | ||
gold = Tree.valueOf("(ROOT (S (S (NP (PRP I)) (VP (VBP like) (NP (JJ blue) (NN skin)))) ($, zzzzz) (CC and) (S (NP (PRP I)) (VP (MD cannot) (VP (VB lie))))))"); | ||
goldT = collinizer.transformTree(gold, gold); | ||
Assert.assertEquals(goldExpected, goldT); | ||
|
||
// Difference with the English: the Negra collinizer does not look at punct words | ||
// Perhaps that was a mistake? | ||
gold = Tree.valueOf("(S (S (NP (PRP I)) (VP (VBP like) (NP (JJ blue) (NN skin)))) (CC ,) (CC and) (S (NP (PRP I)) (VP (MD cannot) (VP (VB lie)))))"); | ||
goldT = collinizer.transformTree(gold, gold); | ||
Assert.assertEquals(gold, goldT); | ||
|
||
// Double check that (CC zzzzz) is not deleted by default | ||
Tree guess = Tree.valueOf("(S (S (NP (PRP I)) (VP (VBP like) (NP (JJ blue) (NN skin)))) (CC zzzzz) (CC and) (S (NP (PRP I)) (VP (MD cannot) (VP (VB lie)))))"); | ||
Tree guessT = collinizer.transformTree(guess, guess); | ||
Assert.assertEquals(guess, guessT); | ||
|
||
// Check that the guess tree has the non-punct word removed if it is a punct in the gold tree | ||
gold = Tree.valueOf("(ROOT (S (S (NP (PRP I)) (VP (VBP like) (NP (JJ blue) (NN skin)))) ($, zzzzz) (CC and) (S (NP (PRP I)) (VP (MD cannot) (VP (VB lie))))))"); | ||
guessT = collinizer.transformTree(guess, gold); | ||
Assert.assertEquals(goldExpected, guessT); | ||
} | ||
} |
40 changes: 40 additions & 0 deletions
40
test/src/edu/stanford/nlp/parser/lexparser/TreeCollinizerTest.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,40 @@ | ||
package edu.stanford.nlp.parser.lexparser; | ||
|
||
import org.junit.Assert; | ||
import org.junit.Test; | ||
|
||
import edu.stanford.nlp.trees.PennTreebankLanguagePack; | ||
import edu.stanford.nlp.trees.Tree; | ||
|
||
public class TreeCollinizerTest { | ||
@Test | ||
public void testRemovePunct() { | ||
PennTreebankLanguagePack tlp = new PennTreebankLanguagePack(); | ||
TreeCollinizer collinizer = new TreeCollinizer(tlp); | ||
|
||
// Test that the collinizer removes a comma | ||
Tree gold = Tree.valueOf("(ROOT (S (S (NP (PRP I)) (VP (VBP like) (NP (JJ blue) (NN skin)))) (, ,) (CC and) (S (NP (PRP I)) (VP (MD cannot) (VP (VB lie))))))"); | ||
Tree goldT = collinizer.transformTree(gold, gold); | ||
Tree goldExpected = Tree.valueOf("(S (S (NP (PRP I)) (VP (VBP like) (NP (JJ blue) (NN skin)))) (CC and) (S (NP (PRP I)) (VP (MD cannot) (VP (VB lie)))))"); | ||
Assert.assertEquals(goldExpected, goldT); | ||
|
||
// Same test, but it should pick up the comma just based on the tag | ||
gold = Tree.valueOf("(ROOT (S (S (NP (PRP I)) (VP (VBP like) (NP (JJ blue) (NN skin)))) (, zzzzz) (CC and) (S (NP (PRP I)) (VP (MD cannot) (VP (VB lie))))))"); | ||
goldT = collinizer.transformTree(gold, gold); | ||
Assert.assertEquals(goldExpected, goldT); | ||
|
||
// It should also pick up the comma based on the word | ||
gold = Tree.valueOf("(S (S (NP (PRP I)) (VP (VBP like) (NP (JJ blue) (NN skin)))) (CC ,) (CC and) (S (NP (PRP I)) (VP (MD cannot) (VP (VB lie)))))"); | ||
goldT = collinizer.transformTree(gold, gold); | ||
Assert.assertEquals(goldExpected, goldT); | ||
|
||
// Double check that (CC zzzzz) is not deleted by default | ||
Tree guess = Tree.valueOf("(S (S (NP (PRP I)) (VP (VBP like) (NP (JJ blue) (NN skin)))) (CC zzzzz) (CC and) (S (NP (PRP I)) (VP (MD cannot) (VP (VB lie)))))"); | ||
Tree guessT = collinizer.transformTree(guess, guess); | ||
Assert.assertEquals(guess, guessT); | ||
|
||
// Check that the guess tree has the non-punct word removed if it is a punct in the gold tree | ||
guessT = collinizer.transformTree(guess, gold); | ||
Assert.assertEquals(goldExpected, guessT); | ||
} | ||
} |
41 changes: 41 additions & 0 deletions
41
test/src/edu/stanford/nlp/trees/international/pennchinese/ChineseCollinizerTest.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,41 @@ | ||
package edu.stanford.nlp.trees.international.pennchinese; | ||
|
||
import org.junit.Assert; | ||
import org.junit.Test; | ||
|
||
import edu.stanford.nlp.trees.Tree; | ||
import edu.stanford.nlp.trees.international.pennchinese.ChineseTreebankLanguagePack; | ||
|
||
public class ChineseCollinizerTest { | ||
@Test | ||
public void testRemovePunct() { | ||
ChineseTreebankLanguagePack tlp = new ChineseTreebankLanguagePack(); | ||
ChineseCollinizer collinizer = new ChineseCollinizer(tlp); | ||
|
||
// Test that the collinizer removes a comma | ||
// Lazy test writing: just use the English version, updated to work with the Chinese tags | ||
Tree gold = Tree.valueOf("(ROOT (S (S (NP (PRP I)) (VP (VBP like) (NP (JJ blue) (NN skin)))) (PU ,) (CC and) (S (NP (PRP I)) (VP (MD cannot) (VP (VB lie))))))"); | ||
Tree goldT = collinizer.transformTree(gold, gold); | ||
Tree goldExpected = Tree.valueOf("(S (S (NP (PRP I)) (VP (VBP like) (NP (JJ blue) (NN skin)))) (CC and) (S (NP (PRP I)) (VP (MD cannot) (VP (VB lie)))))"); | ||
Assert.assertEquals(goldExpected, goldT); | ||
|
||
// Same test, but it should pick up the comma just based on the tag | ||
gold = Tree.valueOf("(ROOT (S (S (NP (PRP I)) (VP (VBP like) (NP (JJ blue) (NN skin)))) (PU zzzzz) (CC and) (S (NP (PRP I)) (VP (MD cannot) (VP (VB lie))))))"); | ||
goldT = collinizer.transformTree(gold, gold); | ||
Assert.assertEquals(goldExpected, goldT); | ||
|
||
// It should also pick up the comma based on the word | ||
gold = Tree.valueOf("(S (S (NP (PRP I)) (VP (VBP like) (NP (JJ blue) (NN skin)))) (CC ,) (CC and) (S (NP (PRP I)) (VP (MD cannot) (VP (VB lie)))))"); | ||
goldT = collinizer.transformTree(gold, gold); | ||
Assert.assertEquals(goldExpected, goldT); | ||
|
||
// Double check that (CC zzzzz) is not deleted by default | ||
Tree guess = Tree.valueOf("(S (S (NP (PRP I)) (VP (VBP like) (NP (JJ blue) (NN skin)))) (CC zzzzz) (CC and) (S (NP (PRP I)) (VP (MD cannot) (VP (VB lie)))))"); | ||
Tree guessT = collinizer.transformTree(guess, guess); | ||
Assert.assertEquals(guess, guessT); | ||
|
||
// Check that the guess tree has the non-punct word removed if it is a punct in the gold tree | ||
guessT = collinizer.transformTree(guess, gold); | ||
Assert.assertEquals(goldExpected, guessT); | ||
} | ||
} |