diff --git a/src/edu/stanford/nlp/trees/international/pennchinese/CTBErrorCorrectingTreeNormalizer.java b/src/edu/stanford/nlp/trees/international/pennchinese/CTBErrorCorrectingTreeNormalizer.java index b53042a9f5..8f85e01f98 100644 --- a/src/edu/stanford/nlp/trees/international/pennchinese/CTBErrorCorrectingTreeNormalizer.java +++ b/src/edu/stanford/nlp/trees/international/pennchinese/CTBErrorCorrectingTreeNormalizer.java @@ -38,7 +38,7 @@ public class CTBErrorCorrectingTreeNormalizer extends BobChrisTreeNormalizer { private static final Pattern PPTmpPattern = Pattern.compile("PP.*-TMP.*"); private static final Pattern TmpPattern = Pattern.compile(".*-TMP.*"); - private static final boolean DEBUG = System.getProperty("CTBErrorCorrectingTreeNormalizer", null) != null; + /* Debug tracing is opt-in: enabled only when the CTBErrorCorrectingTreeNormalizer system property is set (a non-null default would make this always true). */ private static final boolean DEBUG = System.getProperty("CTBErrorCorrectingTreeNormalizer") != null; @SuppressWarnings({"NonSerializableFieldInSerializableClass"}) private final TreeTransformer tagExtender; @@ -110,6 +110,7 @@ private static class ChineseEmptyFilter implements Predicate, Serializable private static final long serialVersionUID = 8914098359495987617L; /** Doesn't accept nodes that only cover an empty. */ + @Override public boolean test(Tree t) { Tree[] kids = t.children(); Label l = t.label(); @@ -127,21 +128,57 @@ public boolean test(Tree t) { } + @SuppressWarnings({"NonSerializableFieldInSerializableClass"}) private final Predicate chineseEmptyFilter = new ChineseEmptyFilter(); - private static final TregexPattern[] splitPuncTregex = { - TregexPattern.compile("PU=punc < 她{") + private static final TregexPattern[] fixupTregex = { + TregexPattern.compile("PU=punc < 她{"), + TregexPattern.compile("@NP <1 (@NP <1 NR <2 (PU=bad < /^<$/)) <2 (FLR=dest <2 (NT < /English/))"), + TregexPattern.compile("@IP < (FLR=dest <: (PU < /^〈$/) $. (__=bad1 $. 
(PU=bad2 < /^〉$/)))"), + TregexPattern.compile("@DFL|FLR|IMG|SKIP=junk <<, (PU < /^[〈{{<\\[[]$/) <<- (PU < /^[〉}}>\\]]]$/) <3 __"), + TregexPattern.compile("WHPP=bad"), }; - private static final TsurgeonPattern[] splitPuncTsurgeon = { - Tsurgeon.parseOperation("replace punc (PN 她) (PU {)") + private static final TsurgeonPattern[] fixupTsurgeon = { + Tsurgeon.parseOperation("replace punc (PN 她) (PU {)"), + Tsurgeon.parseOperation("move bad >1 dest"), + Tsurgeon.parseOperation("[move bad1 >-1 dest] [move bad2 >-1 dest]"), + Tsurgeon.parseOperation("delete junk"), + Tsurgeon.parseOperation("relabel bad PP"), }; static { - if (splitPuncTregex.length != splitPuncTsurgeon.length) { - throw new AssertionError("splitPuncTregex and splitPuncTsurgeon have different lengths in CTBErrorCorrectingTreeNormalizer.java"); + if (fixupTregex.length != fixupTsurgeon.length) { + throw new AssertionError("fixupTregex and fixupTsurgeon have different lengths in CTBErrorCorrectingTreeNormalizer."); } } + // We delete the most egregious non-speech DFL, FLR, IMG, and SKIP constituents, according to the Tregex + // expression above. Maybe more should be deleted really. I don't understand this very well, and there is no documentation. + + // New phrasal categories in CTB 7 and later: + // DFL = Disfluency. Generally keep but delete for ones that are things like (FLR (PU <) (VV turn) (PU >)). + // EMO = Emoticon. For emoticons. Fine to keep. + // FLR = Filler. Generally keep but delete for ones that are things like (FLR (PU <) (VV turn) (PU >)). + // IMG = ?Image?. Appear to all be of form (IMG (PU [) (NN 图片) (PU ])). Delete all those. + // INC = Incomplete (more incomplete than a FRAG which is only syntactically incomplete). Just keep. + // INTJ = Interjection. Fine to keep. + // META = Just one of these in chtb_5200.df. Delete whole tree. Should have been turned into XML metadata + // OTH = ??. Weird but just leave. + // SKIP = ??. Always has NOI under it. Omit or keep? 
+ // TYPO = seems like should mainly go, but sometimes a branching node?? + // WHPP = ??. Just one of these. Over a -NONE- so will go if empties are deleted. But should just be PP. + // + // There is a tree in chtb_2856.bn which has IP -> ... PU (FLR (PU <)) (VV turn) (PU >) + // which just seems an error - should all be under FLR. + // + // POS tags are now 38. Original 33 plus these: + // EM = Emoticon. Often but not always under EMO. + // IC = Incomplete word rendered in pinyin, usually under DFL. + // NOI = + // URL = URL. + // X = In practice currently used only for "x" in constructions like "30 x 25 cm". Shouldn't exist! + + @Override public Tree normalizeWholeTree(Tree tree, TreeFactory tf) { Tree newTree = tree.prune(chineseEmptyFilter, tf).spliceOut(aOverAFilter); @@ -163,18 +200,22 @@ public Tree normalizeWholeTree(Tree tree, TreeFactory tf) { EncodingPrintWriter.err.println("Possible error: non-unary initial rewrite: " + newTree.localTree(), ChineseTreebankLanguagePack.ENCODING); // } - } else { - if (kids.length > 0) { - Tree child = kids[0]; - if ( ! child.isPhrasal()) { - EncodingPrintWriter.err.println("Correcting error: treebank tree is not phrasal; wrapping in FRAG: " + child, ChineseTreebankLanguagePack.ENCODING); - Tree added = tf.newTreeNode("FRAG", Arrays.asList(kids)); - newTree.setChild(0, added); - } - } else { - EncodingPrintWriter.err.println("Error: tree with no children: " + tree, ChineseTreebankLanguagePack.ENCODING); + } else if (kids.length > 0) { // ROOT has 1 child - the normal case + Tree child = kids[0]; + if ( ! 
child.isPhrasal()) { + EncodingPrintWriter.err.println("Correcting error: treebank tree is not phrasal; wrapping in FRAG: " + child, ChineseTreebankLanguagePack.ENCODING); + Tree added = tf.newTreeNode("FRAG", Arrays.asList(kids)); + newTree.setChild(0, added); + } else if (child.label().value().equals("META")) { + // Delete the one bogus META tree in CTB 9 + EncodingPrintWriter.err.println("Deleting META tree that should be XML metadata in chtb_5200.df: " + child, ChineseTreebankLanguagePack.ENCODING); + return null; } + + } else { + EncodingPrintWriter.err.println("Error: tree with no children: " + tree, ChineseTreebankLanguagePack.ENCODING); } + // note that there's also at least 1 tree that is an IP with no surrounding ROOT node // there are also several places where "NP" is used as a preterminal tag @@ -185,14 +226,15 @@ public Tree normalizeWholeTree(Tree tree, TreeFactory tf) { Tree subsubtree = subtree.firstChild(); if (subsubtree.value().equals("ROOT")) { if (subsubtree.firstChild().isLeaf() && "CP".equals(subsubtree.firstChild().value())) { - EncodingPrintWriter.err.println("Correcting error: seriously messed up tree in CTB6: " + newTree, ChineseTreebankLanguagePack.ENCODING); + EncodingPrintWriter.err.println("Correcting error: seriously messed up tree in CTB6 (chtb_3095.bn): " + newTree, ChineseTreebankLanguagePack.ENCODING); List children = subsubtree.getChildrenAsList(); children = children.subList(1,children.size()); subtree.setChildren(children); - EncodingPrintWriter.err.println(" Corrected as: " + newTree, ChineseTreebankLanguagePack.ENCODING); // spaced to align with above + EncodingPrintWriter.err.println(" Corrected as: " + newTree, ChineseTreebankLanguagePack.ENCODING); // spaced to align with above } } } + // All the stuff below here seems to have been fixed in CTB 9. Maybe reporting errors sometimes does help. 
if (subtree.isPreTerminal()) { if (subtree.value().matches("NP")) { if (ChineseTreebankLanguagePack.chineseDouHaoAcceptFilter().test(subtree.firstChild().value())) { @@ -214,9 +256,9 @@ public Tree normalizeWholeTree(Tree tree, TreeFactory tf) { subtree.setValue("NN"); } } else if (subtree.value().matches("PU")) { - if (subtree.firstChild().value().matches("\u4ed6")) { + if (subtree.firstChild().value().matches("他")) { if (DEBUG) { - EncodingPrintWriter.err.println("Correcting error: \"\u4ed6\" under PU tag; tag changed to PN: " + subtree, ChineseTreebankLanguagePack.ENCODING); + EncodingPrintWriter.err.println("Correcting error: \"他\" under PU tag; tag changed to PN: " + subtree, ChineseTreebankLanguagePack.ENCODING); } subtree.setValue("PN"); } else if (subtree.firstChild().value().equals("里")) { @@ -229,7 +271,7 @@ public Tree normalizeWholeTree(Tree tree, TreeFactory tf) { EncodingPrintWriter.err.println("Correcting error: \"" + subtree.firstChild().value() + "\" under PU tag; tag changed to VC: " + subtree, ChineseTreebankLanguagePack.ENCODING); } subtree.setValue("VC"); - } else if (subtree.firstChild().value().matches("tw|\u534A\u7A74\u5F0F")) { + } else if (subtree.firstChild().value().matches("tw|半穴式")) { if (DEBUG) { EncodingPrintWriter.err.println("Correcting error: \"" + subtree.firstChild().value() + "\" under PU tag; tag changed to NN: " + subtree, ChineseTreebankLanguagePack.ENCODING); } @@ -254,15 +296,17 @@ public Tree normalizeWholeTree(Tree tree, TreeFactory tf) { } } - for (int i = 0; i < splitPuncTregex.length; ++i) { + for (int i = 0; i < fixupTregex.length; ++i) { if (DEBUG) { Tree preProcessed = newTree.deepCopy(); - newTree = Tsurgeon.processPattern(splitPuncTregex[i], splitPuncTsurgeon[i], newTree); + newTree = Tsurgeon.processPattern(fixupTregex[i], fixupTsurgeon[i], newTree); if (!preProcessed.equals(newTree)) { - EncodingPrintWriter.err.println("Correcting error: Updated tree using tregex " + splitPuncTregex[i] + " and tsurgeon " + 
splitPuncTsurgeon[i], ChineseTreebankLanguagePack.ENCODING); + EncodingPrintWriter.err.println("Correcting error: Updated tree using tregex " + fixupTregex[i] + " and tsurgeon " + fixupTsurgeon[i], ChineseTreebankLanguagePack.ENCODING); + EncodingPrintWriter.err.println(" from: " + preProcessed, ChineseTreebankLanguagePack.ENCODING); + EncodingPrintWriter.err.println(" to: " + newTree, ChineseTreebankLanguagePack.ENCODING); } } else { - newTree = Tsurgeon.processPattern(splitPuncTregex[i], splitPuncTsurgeon[i], newTree); + newTree = Tsurgeon.processPattern(fixupTregex[i], fixupTsurgeon[i], newTree); } } diff --git a/src/edu/stanford/nlp/trees/international/pennchinese/NoEmptiesCTBTreeReaderFactory.java b/src/edu/stanford/nlp/trees/international/pennchinese/NoEmptiesCTBTreeReaderFactory.java index 72b2cea097..71c27af66e 100644 --- a/src/edu/stanford/nlp/trees/international/pennchinese/NoEmptiesCTBTreeReaderFactory.java +++ b/src/edu/stanford/nlp/trees/international/pennchinese/NoEmptiesCTBTreeReaderFactory.java @@ -1,7 +1,9 @@ package edu.stanford.nlp.trees.international.pennchinese; -/** A CTB TreeReaderFactory that deletes empty nodes. +/** A CTB TreeReaderFactory that deletes empty nodes, and makes some corrections + * to trees while reading them in. 
+ * * @author Christopher Manning */ public class NoEmptiesCTBTreeReaderFactory extends CTBTreeReaderFactory { diff --git a/test/src/edu/stanford/nlp/trees/international/pennchinese/CTBErrorCorrectingTreeNormalizerTest.java b/test/src/edu/stanford/nlp/trees/international/pennchinese/CTBErrorCorrectingTreeNormalizerTest.java new file mode 100644 index 0000000000..695fc316f4 --- /dev/null +++ b/test/src/edu/stanford/nlp/trees/international/pennchinese/CTBErrorCorrectingTreeNormalizerTest.java @@ -0,0 +1,51 @@ +package edu.stanford.nlp.trees.international.pennchinese; + +import edu.stanford.nlp.trees.Tree; +import edu.stanford.nlp.trees.TreeTransformer; +import junit.framework.TestCase; + +/** + * @author Christopher Manning + */ +public class CTBErrorCorrectingTreeNormalizerTest extends TestCase { + + public void testNormalDelete() { + String input = "(ROOT (IP (FLR (PU 〈) (VV turn) (PU 〉)) (PP (ADVP (AD 就)) (PP (P 在) (NP (CP (IP (NP (NR 北韩)) (VP (DVP (VP (VA " + + "积极)) (DEV 的)) (VP (VV 走向) (NP (NN 国际) (NN 社会))))) (DEC 的)) (NP (NN 同时))))) (PU ,) (NP (CP (IP (VP (ADVP (AD" + + " 刚刚)) (VP (VV 渡过) (NP (QP (CD 一) (CLP (M 场))) (NP (NN 大选) (NN 危机)))))) (DEC 的)) (NP (NR 南斯拉夫))) (VP (ADVP" + + " (AD 也)) (ADVP (AD 在)) (VP (VV 寻求) (NP (DNP (LCP (NP (NN 国际)) (LC 间)) (DEG 的)) (NP (NN 协助))))) (PU 。)))"; + String output = "(ROOT (IP (PP (ADVP (AD 就)) (PP (P 在) (NP (CP (IP (NP (NR 北韩)) (VP (DVP (VP (VA " + + "积极)) (DEV 的)) (VP (VV 走向) (NP (NN 国际) (NN 社会))))) (DEC 的)) (NP (NN 同时))))) (PU ,) (NP (CP (IP (VP (ADVP (AD" + + " 刚刚)) (VP (VV 渡过) (NP (QP (CD 一) (CLP (M 场))) (NP (NN 大选) (NN 危机)))))) (DEC 的)) (NP (NR 南斯拉夫))) (VP (ADVP" + + " (AD 也)) (ADVP (AD 在)) (VP (VV 寻求) (NP (DNP (LCP (NP (NN 国际)) (LC 间)) (DEG 的)) (NP (NN 协助))))) (PU 。)))"; + runTest(input, output); + } + + public void testFixSplitElement() { + String input = "(ROOT (IP (IP (NP (NN 下面)) (VP (VV 请) (VP (VV 听) (NP (DNP (NP (NN 报道)) (DEG 的)) (ADJP (JJ 详细)) (NP (NN 内容))))" + + ")) (PU :) (FLR (PU 〈)) (VV turn) (PU 〉) (IP 
(NP (NP (NP (NR 法国)) (NP (NN 外交) (NN 部长))) (NP (NR 韦里德纳))) " + + "(PU ,) (VP (VC 是) (NP (PP (P 自从) (LCP (IP (NP (NT 去年)) (NP (NR 北约)) (VP (VV 轰炸) (NP (NR 南斯拉夫)))) (LC 以来)" + + ")) (PU ,) (QP (OD 第一) (CLP (M 位))) (CP (IP (VP (VV 访问) (NP (NP (NR 南斯拉夫)) (NP (NN 首都))))) (DEC 的)) (ADJP (JJ" + + " 主要)) (NP (NN 西方) (NN 国家) (NN 外交官))))) (PU 。)))"; + String output = "(ROOT (IP (IP (NP (NN 下面)) (VP (VV 请) (VP (VV 听) (NP (DNP (NP (NN 报道)) (DEG 的)) (ADJP (JJ 详细)) (NP (NN 内容))))" + + ")) (PU :) (IP (NP (NP (NP (NR 法国)) (NP (NN 外交) (NN 部长))) (NP (NR 韦里德纳))) " + + "(PU ,) (VP (VC 是) (NP (PP (P 自从) (LCP (IP (NP (NT 去年)) (NP (NR 北约)) (VP (VV 轰炸) (NP (NR 南斯拉夫)))) (LC 以来)" + + ")) (PU ,) (QP (OD 第一) (CLP (M 位))) (CP (IP (VP (VV 访问) (NP (NP (NR 南斯拉夫)) (NP (NN 首都))))) (DEC 的)) (ADJP (JJ" + + " 主要)) (NP (NN 西方) (NN 国家) (NN 外交官))))) (PU 。)))"; + runTest(input, output); + } + + public void testAnotherSplit() { + String input = "(ROOT (IP (LCP (IP (FLR (PU <) (NR English) (PU >)) (NP (NP (NR APEC) (PU <)) (FLR (PU /) (NT English) (PU >)) (NP (NN 会议))) (VP (VV 举行))) (LC 前)) (PU ,) (NP (NP (NP (NR 日本)) (NP (NN 首相))) (NP (NR 小泉纯一郎))) (VP (VP (QP (OD 第五) (CLP (M 度))) (VP (VV 参拜) (NP (NN 靖国神社)))) (PU ,) (VP (VV 受到) (IP (NP (NP (NR 中) (NR 韩) (ETC 等)) (NP (NR 亚洲)) (NP (NN 国家))) (VP (ADJP (AD 严厉)) (VP (VV 谴责)))))) (PU 。)))"; + String output = "(ROOT (IP (LCP (IP (NP (NP (NR APEC)) (NP (NN 会议))) (VP (VV 举行))) (LC 前)) (PU ,) (NP (NP (NP (NR 日本)) (NP (NN 首相))) (NP (NR 小泉纯一郎))) (VP (VP (QP (OD 第五) (CLP (M 度))) (VP (VV 参拜) (NP (NN 靖国神社)))) (PU ,) (VP (VV 受到) (IP (NP (NP (NR 中) (NR 韩) (ETC 等)) (NP (NR 亚洲)) (NP (NN 国家))) (VP (ADJP (AD 严厉)) (VP (VV 谴责)))))) (PU 。)))"; + runTest(input, output); + } + + private static void runTest(String input, String output) { + Tree inputTree = Tree.valueOf(input); + TreeTransformer tt = new CTBErrorCorrectingTreeNormalizer(false, false, false, false); + Tree outputTree = tt.apply(inputTree); + assertEquals(output, outputTree.toString()); + } + +}