Skip to content

Commit

Permalink
Add new corrections for CTB 7-9 and a unit test file.
Browse files Browse the repository at this point in the history
  • Loading branch information
manning authored and Stanford NLP committed Jul 25, 2016
1 parent 8787b2f commit 0091ed9
Show file tree
Hide file tree
Showing 3 changed files with 124 additions and 27 deletions.
Expand Up @@ -38,7 +38,7 @@ public class CTBErrorCorrectingTreeNormalizer extends BobChrisTreeNormalizer {
private static final Pattern PPTmpPattern = Pattern.compile("PP.*-TMP.*"); private static final Pattern PPTmpPattern = Pattern.compile("PP.*-TMP.*");
private static final Pattern TmpPattern = Pattern.compile(".*-TMP.*"); private static final Pattern TmpPattern = Pattern.compile(".*-TMP.*");


private static final boolean DEBUG = System.getProperty("CTBErrorCorrectingTreeNormalizer", null) != null; private static final boolean DEBUG = System.getProperty("CTBErrorCorrectingTreeNormalizer", "true") != null;


@SuppressWarnings({"NonSerializableFieldInSerializableClass"}) @SuppressWarnings({"NonSerializableFieldInSerializableClass"})
private final TreeTransformer tagExtender; private final TreeTransformer tagExtender;
Expand Down Expand Up @@ -110,6 +110,7 @@ private static class ChineseEmptyFilter implements Predicate<Tree>, Serializable
private static final long serialVersionUID = 8914098359495987617L; private static final long serialVersionUID = 8914098359495987617L;


/** Doesn't accept nodes that only cover an empty. */ /** Doesn't accept nodes that only cover an empty. */
@Override
public boolean test(Tree t) { public boolean test(Tree t) {
Tree[] kids = t.children(); Tree[] kids = t.children();
Label l = t.label(); Label l = t.label();
Expand All @@ -127,21 +128,57 @@ public boolean test(Tree t) {


} }


@SuppressWarnings({"NonSerializableFieldInSerializableClass"})
private final Predicate<Tree> chineseEmptyFilter = new ChineseEmptyFilter(); private final Predicate<Tree> chineseEmptyFilter = new ChineseEmptyFilter();


private static final TregexPattern[] splitPuncTregex = { private static final TregexPattern[] fixupTregex = {
TregexPattern.compile("PU=punc < 她{") TregexPattern.compile("PU=punc < 她{"),
TregexPattern.compile("@NP <1 (@NP <1 NR <2 (PU=bad < /^<$/)) <2 (FLR=dest <2 (NT < /English/))"),
TregexPattern.compile("@IP < (FLR=dest <: (PU < /^〈$/) $. (__=bad1 $. (PU=bad2 < /^〉$/)))"),
TregexPattern.compile("@DFL|FLR|IMG|SKIP=junk <<, (PU < /^[〈{{<\\[[]$/) <<- (PU < /^[〉}}>\\]]]$/) <3 __"),
TregexPattern.compile("WHPP=bad"),
}; };
private static final TsurgeonPattern[] splitPuncTsurgeon = { private static final TsurgeonPattern[] fixupTsurgeon = {
Tsurgeon.parseOperation("replace punc (PN 她) (PU {)") Tsurgeon.parseOperation("replace punc (PN 她) (PU {)"),
Tsurgeon.parseOperation("move bad >1 dest"),
Tsurgeon.parseOperation("[move bad1 >-1 dest] [move bad2 >-1 dest]"),
Tsurgeon.parseOperation("delete junk"),
Tsurgeon.parseOperation("relabel bad PP"),
}; };


static { static {
if (splitPuncTregex.length != splitPuncTsurgeon.length) { if (fixupTregex.length != fixupTsurgeon.length) {
throw new AssertionError("splitPuncTregex and splitPuncTsurgeon have different lengths in CTBErrorCorrectingTreeNormalizer.java"); throw new AssertionError("fixupTregex and fixupTsurgeon have different lengths in CTBErrorCorrectingTreeNormalizer.");
} }
} }


// We delete the most egregious non-speech DFL, FLR, IMG, and SKIP constituents, according to the Tregex
// expression above. Maybe more should be deleted really. I don't understand this very well, and there is no documentation.

// New phrasal categories in CTB 7 and later:
// DFL = Disfluency. Generally keep, but delete ones that look like (FLR (PU <) (VV turn) (PU >)).
// EMO = Emoticon. For emoticons. Fine to keep.
// FLR = Filler. Generally keep, but delete ones that look like (FLR (PU <) (VV turn) (PU >)).
// IMG = ?Image?. Appear to all be of form (IMG (PU [) (NN 图片) (PU ])). Delete all those.
// INC = Incomplete (more incomplete than a FRAG which is only syntactically incomplete). Just keep.
// INTJ = Interjection. Fine to keep.
// META = Just one of these, in chtb_5200.df. Delete the whole tree. It should have been turned into XML metadata.
// OTH = ??. Weird but just leave.
// SKIP = ??. Always has NOI under it. Omit or keep?
// TYPO = seems like should mainly go, but sometimes a branching node??
// WHPP = ??. Just one of these. Over a -NONE- so will go if empties are deleted. But should just be PP.
//
// There is a tree in chtb_2856.bn which has IP -> ... PU (FLR (PU <)) (VV turn) (PU >)
// which just seems an error - should all be under FLR.
//
// POS tags are now 38. Original 33 plus these:
// EM = Emoticon. Often but not always under EMO.
// IC = Incomplete word rendered in pinyin, usually under DFL.
// NOI = ??. Meaning not documented here; every SKIP constituent has a NOI under it (see SKIP above).
// URL = URL.
// X = In practice currently used only for "x" in constructions like "30 x 25 cm". Shouldn't exist!


@Override @Override
public Tree normalizeWholeTree(Tree tree, TreeFactory tf) { public Tree normalizeWholeTree(Tree tree, TreeFactory tf) {
Tree newTree = tree.prune(chineseEmptyFilter, tf).spliceOut(aOverAFilter); Tree newTree = tree.prune(chineseEmptyFilter, tf).spliceOut(aOverAFilter);
Expand All @@ -163,18 +200,22 @@ public Tree normalizeWholeTree(Tree tree, TreeFactory tf) {
EncodingPrintWriter.err.println("Possible error: non-unary initial rewrite: " + EncodingPrintWriter.err.println("Possible error: non-unary initial rewrite: " +
newTree.localTree(), ChineseTreebankLanguagePack.ENCODING); newTree.localTree(), ChineseTreebankLanguagePack.ENCODING);
// } // }
} else { } else if (kids.length > 0) { // ROOT has 1 child - the normal case
if (kids.length > 0) { Tree child = kids[0];
Tree child = kids[0]; if ( ! child.isPhrasal()) {
if ( ! child.isPhrasal()) { EncodingPrintWriter.err.println("Correcting error: treebank tree is not phrasal; wrapping in FRAG: " + child, ChineseTreebankLanguagePack.ENCODING);
EncodingPrintWriter.err.println("Correcting error: treebank tree is not phrasal; wrapping in FRAG: " + child, ChineseTreebankLanguagePack.ENCODING); Tree added = tf.newTreeNode("FRAG", Arrays.asList(kids));
Tree added = tf.newTreeNode("FRAG", Arrays.asList(kids)); newTree.setChild(0, added);
newTree.setChild(0, added); } else if (child.label().value().equals("META")) {
} // Delete the one bogus META tree in CTB 9
} else { EncodingPrintWriter.err.println("Deleting META tree that should be XML metadata in chtb_5200.df: " + child, ChineseTreebankLanguagePack.ENCODING);
EncodingPrintWriter.err.println("Error: tree with no children: " + tree, ChineseTreebankLanguagePack.ENCODING); return null;
} }

} else {
EncodingPrintWriter.err.println("Error: tree with no children: " + tree, ChineseTreebankLanguagePack.ENCODING);
} }

// note that there's also at least 1 tree that is an IP with no surrounding ROOT node // note that there's also at least 1 tree that is an IP with no surrounding ROOT node


// there are also several places where "NP" is used as a preterminal tag // there are also several places where "NP" is used as a preterminal tag
Expand All @@ -185,14 +226,15 @@ public Tree normalizeWholeTree(Tree tree, TreeFactory tf) {
Tree subsubtree = subtree.firstChild(); Tree subsubtree = subtree.firstChild();
if (subsubtree.value().equals("ROOT")) { if (subsubtree.value().equals("ROOT")) {
if (subsubtree.firstChild().isLeaf() && "CP".equals(subsubtree.firstChild().value())) { if (subsubtree.firstChild().isLeaf() && "CP".equals(subsubtree.firstChild().value())) {
EncodingPrintWriter.err.println("Correcting error: seriously messed up tree in CTB6: " + newTree, ChineseTreebankLanguagePack.ENCODING); EncodingPrintWriter.err.println("Correcting error: seriously messed up tree in CTB6 (chtb_3095.bn): " + newTree, ChineseTreebankLanguagePack.ENCODING);
List<Tree> children = subsubtree.getChildrenAsList(); List<Tree> children = subsubtree.getChildrenAsList();
children = children.subList(1,children.size()); children = children.subList(1,children.size());
subtree.setChildren(children); subtree.setChildren(children);
EncodingPrintWriter.err.println(" Corrected as: " + newTree, ChineseTreebankLanguagePack.ENCODING); // spaced to align with above EncodingPrintWriter.err.println(" Corrected as: " + newTree, ChineseTreebankLanguagePack.ENCODING); // spaced to align with above
} }
} }
} }
// All the stuff below here seems to have been fixed in CTB 9. Maybe reporting errors sometimes does help.
if (subtree.isPreTerminal()) { if (subtree.isPreTerminal()) {
if (subtree.value().matches("NP")) { if (subtree.value().matches("NP")) {
if (ChineseTreebankLanguagePack.chineseDouHaoAcceptFilter().test(subtree.firstChild().value())) { if (ChineseTreebankLanguagePack.chineseDouHaoAcceptFilter().test(subtree.firstChild().value())) {
Expand All @@ -214,9 +256,9 @@ public Tree normalizeWholeTree(Tree tree, TreeFactory tf) {
subtree.setValue("NN"); subtree.setValue("NN");
} }
} else if (subtree.value().matches("PU")) { } else if (subtree.value().matches("PU")) {
if (subtree.firstChild().value().matches("\u4ed6")) { if (subtree.firstChild().value().matches("")) {
if (DEBUG) { if (DEBUG) {
EncodingPrintWriter.err.println("Correcting error: \"\u4ed6\" under PU tag; tag changed to PN: " + subtree, ChineseTreebankLanguagePack.ENCODING); EncodingPrintWriter.err.println("Correcting error: \"\" under PU tag; tag changed to PN: " + subtree, ChineseTreebankLanguagePack.ENCODING);
} }
subtree.setValue("PN"); subtree.setValue("PN");
} else if (subtree.firstChild().value().equals("里")) { } else if (subtree.firstChild().value().equals("里")) {
Expand All @@ -229,7 +271,7 @@ public Tree normalizeWholeTree(Tree tree, TreeFactory tf) {
EncodingPrintWriter.err.println("Correcting error: \"" + subtree.firstChild().value() + "\" under PU tag; tag changed to VC: " + subtree, ChineseTreebankLanguagePack.ENCODING); EncodingPrintWriter.err.println("Correcting error: \"" + subtree.firstChild().value() + "\" under PU tag; tag changed to VC: " + subtree, ChineseTreebankLanguagePack.ENCODING);
} }
subtree.setValue("VC"); subtree.setValue("VC");
} else if (subtree.firstChild().value().matches("tw|\u534A\u7A74\u5F0F")) { } else if (subtree.firstChild().value().matches("tw|半穴式")) {
if (DEBUG) { if (DEBUG) {
EncodingPrintWriter.err.println("Correcting error: \"" + subtree.firstChild().value() + "\" under PU tag; tag changed to NN: " + subtree, ChineseTreebankLanguagePack.ENCODING); EncodingPrintWriter.err.println("Correcting error: \"" + subtree.firstChild().value() + "\" under PU tag; tag changed to NN: " + subtree, ChineseTreebankLanguagePack.ENCODING);
} }
Expand All @@ -254,15 +296,17 @@ public Tree normalizeWholeTree(Tree tree, TreeFactory tf) {
} }
} }


for (int i = 0; i < splitPuncTregex.length; ++i) { for (int i = 0; i < fixupTregex.length; ++i) {
if (DEBUG) { if (DEBUG) {
Tree preProcessed = newTree.deepCopy(); Tree preProcessed = newTree.deepCopy();
newTree = Tsurgeon.processPattern(splitPuncTregex[i], splitPuncTsurgeon[i], newTree); newTree = Tsurgeon.processPattern(fixupTregex[i], fixupTsurgeon[i], newTree);
if (!preProcessed.equals(newTree)) { if (!preProcessed.equals(newTree)) {
EncodingPrintWriter.err.println("Correcting error: Updated tree using tregex " + splitPuncTregex[i] + " and tsurgeon " + splitPuncTsurgeon[i], ChineseTreebankLanguagePack.ENCODING); EncodingPrintWriter.err.println("Correcting error: Updated tree using tregex " + fixupTregex[i] + " and tsurgeon " + fixupTsurgeon[i], ChineseTreebankLanguagePack.ENCODING);
EncodingPrintWriter.err.println(" from: " + preProcessed, ChineseTreebankLanguagePack.ENCODING);
EncodingPrintWriter.err.println(" to: " + newTree, ChineseTreebankLanguagePack.ENCODING);
} }
} else { } else {
newTree = Tsurgeon.processPattern(splitPuncTregex[i], splitPuncTsurgeon[i], newTree); newTree = Tsurgeon.processPattern(fixupTregex[i], fixupTsurgeon[i], newTree);
} }
} }


Expand Down
@@ -1,7 +1,9 @@
package edu.stanford.nlp.trees.international.pennchinese; package edu.stanford.nlp.trees.international.pennchinese;




/** A CTB TreeReaderFactory that deletes empty nodes. /** A CTB TreeReaderFactory that deletes empty nodes, and makes some corrections
* to trees while reading them in.
*
* @author Christopher Manning * @author Christopher Manning
*/ */
public class NoEmptiesCTBTreeReaderFactory extends CTBTreeReaderFactory { public class NoEmptiesCTBTreeReaderFactory extends CTBTreeReaderFactory {
Expand Down
@@ -0,0 +1,51 @@
package edu.stanford.nlp.trees.international.pennchinese;

import edu.stanford.nlp.trees.Tree;
import edu.stanford.nlp.trees.TreeTransformer;
import junit.framework.TestCase;

/**
 * Unit tests for {@link CTBErrorCorrectingTreeNormalizer}.
 *
 * Each test parses a bracketed treebank tree, runs it through the normalizer,
 * and compares the printed result with the expected bracketing. The three cases
 * cover angle-bracketed FLR markup: a well-formed one, one whose pieces were
 * split across sibling nodes, and one whose opening bracket landed inside a
 * neighbouring NP.
 *
 * @author Christopher Manning
 */
public class CTBErrorCorrectingTreeNormalizerTest extends TestCase {

  /**
   * Normalizes {@code input} and asserts that the resulting tree prints as {@code output}.
   *
   * @param input Bracketed tree string handed to {@code Tree.valueOf}
   * @param output Expected {@code toString()} form of the normalized tree
   */
  private static void runTest(String input, String output) {
    // NOTE(review): the meaning of the four boolean constructor flags is not visible here;
    // they are all passed as false, exactly as before.
    TreeTransformer normalizer = new CTBErrorCorrectingTreeNormalizer(false, false, false, false);
    Tree normalized = normalizer.apply(Tree.valueOf(input));
    assertEquals(output, normalized.toString());
  }

  /** A complete {@code (FLR (PU 〈) (VV turn) (PU 〉))} constituent at the start of the IP is dropped. */
  public void testNormalDelete() {
    String treebankTree = "(ROOT (IP (FLR (PU 〈) (VV turn) (PU 〉)) (PP (ADVP (AD 就)) (PP (P 在) (NP (CP (IP (NP (NR 北韩)) (VP (DVP (VP (VA " +
        "积极)) (DEV 的)) (VP (VV 走向) (NP (NN 国际) (NN 社会))))) (DEC 的)) (NP (NN 同时))))) (PU ,) (NP (CP (IP (VP (ADVP (AD" +
        " 刚刚)) (VP (VV 渡过) (NP (QP (CD 一) (CLP (M 场))) (NP (NN 大选) (NN 危机)))))) (DEC 的)) (NP (NR 南斯拉夫))) (VP (ADVP" +
        " (AD 也)) (ADVP (AD 在)) (VP (VV 寻求) (NP (DNP (LCP (NP (NN 国际)) (LC 间)) (DEG 的)) (NP (NN 协助))))) (PU 。)))";
    String expected = "(ROOT (IP (PP (ADVP (AD 就)) (PP (P 在) (NP (CP (IP (NP (NR 北韩)) (VP (DVP (VP (VA " +
        "积极)) (DEV 的)) (VP (VV 走向) (NP (NN 国际) (NN 社会))))) (DEC 的)) (NP (NN 同时))))) (PU ,) (NP (CP (IP (VP (ADVP (AD" +
        " 刚刚)) (VP (VV 渡过) (NP (QP (CD 一) (CLP (M 场))) (NP (NN 大选) (NN 危机)))))) (DEC 的)) (NP (NR 南斯拉夫))) (VP (ADVP" +
        " (AD 也)) (ADVP (AD 在)) (VP (VV 寻求) (NP (DNP (LCP (NP (NN 国际)) (LC 间)) (DEG 的)) (NP (NN 协助))))) (PU 。)))";
    runTest(treebankTree, expected);
  }

  /**
   * The markup is split as {@code (FLR (PU 〈)) (VV turn) (PU 〉)}, with only the opening
   * bracket under FLR; all three pieces are expected to disappear from the output.
   */
  public void testFixSplitElement() {
    String treebankTree = "(ROOT (IP (IP (NP (NN 下面)) (VP (VV 请) (VP (VV 听) (NP (DNP (NP (NN 报道)) (DEG 的)) (ADJP (JJ 详细)) (NP (NN 内容))))" +
        ")) (PU :) (FLR (PU 〈)) (VV turn) (PU 〉) (IP (NP (NP (NP (NR 法国)) (NP (NN 外交) (NN 部长))) (NP (NR 韦里德纳))) " +
        "(PU ,) (VP (VC 是) (NP (PP (P 自从) (LCP (IP (NP (NT 去年)) (NP (NR 北约)) (VP (VV 轰炸) (NP (NR 南斯拉夫)))) (LC 以来)" +
        ")) (PU ,) (QP (OD 第一) (CLP (M 位))) (CP (IP (VP (VV 访问) (NP (NP (NR 南斯拉夫)) (NP (NN 首都))))) (DEC 的)) (ADJP (JJ" +
        " 主要)) (NP (NN 西方) (NN 国家) (NN 外交官))))) (PU 。)))";
    String expected = "(ROOT (IP (IP (NP (NN 下面)) (VP (VV 请) (VP (VV 听) (NP (DNP (NP (NN 报道)) (DEG 的)) (ADJP (JJ 详细)) (NP (NN 内容))))" +
        ")) (PU :) (IP (NP (NP (NP (NR 法国)) (NP (NN 外交) (NN 部长))) (NP (NR 韦里德纳))) " +
        "(PU ,) (VP (VC 是) (NP (PP (P 自从) (LCP (IP (NP (NT 去年)) (NP (NR 北约)) (VP (VV 轰炸) (NP (NR 南斯拉夫)))) (LC 以来)" +
        ")) (PU ,) (QP (OD 第一) (CLP (M 位))) (CP (IP (VP (VV 访问) (NP (NP (NR 南斯拉夫)) (NP (NN 首都))))) (DEC 的)) (ADJP (JJ" +
        " 主要)) (NP (NN 西方) (NN 国家) (NN 外交官))))) (PU 。)))";
    runTest(treebankTree, expected);
  }

  /**
   * The opening bracket of a closing "English" tag sits inside the preceding NP,
   * as {@code (NP (NR APEC) (PU <)) (FLR (PU /) (NT English) (PU >))}; the expected output
   * keeps only {@code (NP (NR APEC))} and has neither English-tag FLR.
   */
  public void testAnotherSplit() {
    String treebankTree = "(ROOT (IP (LCP (IP (FLR (PU <) (NR English) (PU >)) (NP (NP (NR APEC) (PU <)) (FLR (PU /) (NT English) (PU >)) (NP (NN 会议))) (VP (VV 举行))) (LC 前)) (PU ,) (NP (NP (NP (NR 日本)) (NP (NN 首相))) (NP (NR 小泉纯一郎))) (VP (VP (QP (OD 第五) (CLP (M 度))) (VP (VV 参拜) (NP (NN 靖国神社)))) (PU ,) (VP (VV 受到) (IP (NP (NP (NR 中) (NR 韩) (ETC 等)) (NP (NR 亚洲)) (NP (NN 国家))) (VP (ADJP (AD 严厉)) (VP (VV 谴责)))))) (PU 。)))";
    String expected = "(ROOT (IP (LCP (IP (NP (NP (NR APEC)) (NP (NN 会议))) (VP (VV 举行))) (LC 前)) (PU ,) (NP (NP (NP (NR 日本)) (NP (NN 首相))) (NP (NR 小泉纯一郎))) (VP (VP (QP (OD 第五) (CLP (M 度))) (VP (VV 参拜) (NP (NN 靖国神社)))) (PU ,) (VP (VV 受到) (IP (NP (NP (NR 中) (NR 韩) (ETC 等)) (NP (NR 亚洲)) (NP (NN 国家))) (VP (ADJP (AD 严厉)) (VP (VV 谴责)))))) (PU 。)))";
    runTest(treebankTree, expected);
  }

}

0 comments on commit 0091ed9

Please sign in to comment.