diff --git a/src/edu/stanford/nlp/semgraph/semgrex/ssurgeon/AddDep.java b/src/edu/stanford/nlp/semgraph/semgrex/ssurgeon/AddDep.java index 012dd9dc0f..f11ee42d9f 100644 --- a/src/edu/stanford/nlp/semgraph/semgrex/ssurgeon/AddDep.java +++ b/src/edu/stanford/nlp/semgraph/semgrex/ssurgeon/AddDep.java @@ -139,11 +139,16 @@ public static void moveNode(SemanticGraph sg, SemgrexMatcher sm, IndexedWord wor } } - public static void moveNodes(SemanticGraph sg, SemgrexMatcher sm, Function shouldMove, Function destination) { + /** + * reverse: operate in reverse order, highest index to first. You want true if moving indices up, false if moving indices down + */ + public static void moveNodes(SemanticGraph sg, SemgrexMatcher sm, Function shouldMove, Function destination, boolean reverse) { // iterate first, then move, so that we don't screw up the graph while iterating List toMove = sg.vertexSet().stream().filter(x -> shouldMove.apply(x.index())).collect(Collectors.toList()); Collections.sort(toMove); - Collections.reverse(toMove); + if (reverse) { + Collections.reverse(toMove); + } for (IndexedWord word : toMove) { moveNode(sg, sm, word, destination.apply(word.index())); } @@ -166,8 +171,8 @@ public boolean evaluate(SemanticGraph sg, SemgrexMatcher sm) { // +2 to leave room: we will increase all other nodes with the // proper index, so we need +1 of room, then another +1 for // a temp place to put this node - // TODO: when we implement updating the SemgrexMatcher, - // this won't be necessary + // TODO: we could theoretically put the new node in the right place + // immediately and move the other nodes, but this is easier tempIndex = SemanticGraphUtils.maxIndex(sg) + 2; if (position.equals("-")) { @@ -203,7 +208,7 @@ public boolean evaluate(SemanticGraph sg, SemgrexMatcher sm) { if (position != null && !position.equals("+")) { // the payoff for tempIndex == maxIndex + 2: // everything will be moved one higher, unless it's the new node - moveNodes(sg, sm, x -> (x >= newIndex && 
x != tempIndex), x -> x+1); + moveNodes(sg, sm, x -> (x >= newIndex && x != tempIndex), x -> x+1, true); moveNode(sg, sm, newNode, newIndex); } diff --git a/src/edu/stanford/nlp/semgraph/semgrex/ssurgeon/MergeNodes.java b/src/edu/stanford/nlp/semgraph/semgrex/ssurgeon/MergeNodes.java new file mode 100644 index 0000000000..f56836704b --- /dev/null +++ b/src/edu/stanford/nlp/semgraph/semgrex/ssurgeon/MergeNodes.java @@ -0,0 +1,159 @@ +package edu.stanford.nlp.semgraph.semgrex.ssurgeon; + +import java.io.StringWriter; +import java.util.ArrayList; +import java.util.List; +import java.util.Map; +import java.util.TreeMap; + +import edu.stanford.nlp.ling.CoreAnnotations; +import edu.stanford.nlp.ling.CoreLabel; +import edu.stanford.nlp.ling.IndexedWord; +import edu.stanford.nlp.semgraph.SemanticGraph; +import edu.stanford.nlp.semgraph.SemanticGraphEdge; +import edu.stanford.nlp.semgraph.semgrex.SemgrexMatcher; + +/** + * Combines two words into one word + *
+ * This requires one of the two nodes to be the head of the other, + * and the dependent word can't have any edges other than those to or from the head + *
+ * The word and lemma will be the combination of the words, squished together. + * Before and after will be updated to use the before and after of the endpoints of the subgraph + * + * @author John Bauer + */ +public class MergeNodes extends SsurgeonEdit { + public static final String LABEL = "mergeNodes"; + final String name1; + final String name2; + final Map attributes; + + public MergeNodes(String name1, String name2, Map attributes) { + this.name1 = name1; + this.name2 = name2; + this.attributes = new TreeMap<>(attributes); + } + + /** + * Emits a parseable instruction string. + */ + @Override + public String toEditString() { + StringWriter buf = new StringWriter(); + buf.write(LABEL); buf.write("\t"); + buf.write(name1); buf.write("\t"); + buf.write(name2); + + // TODO: some attributes might need to be escaped! + for (String key : attributes.keySet()) { + buf.write("\t-"); + buf.write(key); + buf.write(" "); + buf.write(attributes.get(key)); + } + + return buf.toString(); + } + + /** + * If the two named nodes are next to each other, and the edges of + * the graph allow for it, squish the two words into one word + */ + @Override + public boolean evaluate(SemanticGraph sg, SemgrexMatcher sm) { + IndexedWord node1 = sm.getNode(name1); + IndexedWord node2 = sm.getNode(name2); + + if (node1 == null || node2 == null) { + return false; + } + + List n1_to_n2 = sg.getAllEdges(node1, node2); + List n2_to_n1 = sg.getAllEdges(node2, node1); + if (n1_to_n2.size() == 0 && n2_to_n1.size() == 0) { + return false; + } + + // TODO: what about the case where the dep is or has copies? 
+ final IndexedWord head; + final IndexedWord dep; + + if (n1_to_n2.size() > 0) { + head = node1; + dep = node2; + } else { + head = node2; + dep = node1; + } + + // If the dep has any edges that aren't between dep & head, abort + // TODO: we could probably make it adjust edges with "dep" as source, instead + for (SemanticGraphEdge e : sg.outgoingEdgeIterable(dep)) { + if (e.getTarget() != head) { + return false; + } + } + for (SemanticGraphEdge e : sg.incomingEdgeIterable(dep)) { + if (e.getSource() != head) { + return false; + } + } + + IndexedWord left; + IndexedWord right; + if (node1.index() < node2.index()) { + left = node1; + right = node2; + } else { + left = node2; + right = node1; + } + + CoreLabel newLabel = AddDep.fromCheapStrings(attributes); + if (newLabel.word() == null) { + String newWord = left.word() + right.word(); + newLabel.setWord(newWord); + } + if (newLabel.value() == null) { + newLabel.setValue(newLabel.word()); + } + if (newLabel.lemma() == null) { + String newLemma = left.lemma() != null && right.lemma() != null ? 
left.lemma() + right.lemma() : null; + newLabel.setLemma(newLemma); + } + // after() and before() return "" if null, so we need to use the CoreAnnotations directly + if (newLabel.get(CoreAnnotations.AfterAnnotation.class) == null) { + newLabel.setAfter(right.after()); + } + if (newLabel.get(CoreAnnotations.BeforeAnnotation.class) == null) { + newLabel.setBefore(left.before()); + } + + for (IndexedWord vertex : sg.vertexSet()) { + if (vertex.index() == head.index()) { + for (Class key : newLabel.keySet()) { + Object value = newLabel.get(key); + vertex.set(key, value); + } + } + } + + // copy the list so that deletion doesn't hurt the iterator + // TODO: super fancy would be implementing iterator.remove() + // on the Set returned by the SemanticGraph + for (IndexedWord vertex : sg.vertexListSorted()) { + if (vertex.index() == dep.index()) { + sg.removeVertex(vertex); + } + } + + // reindex everyone + AddDep.moveNodes(sg, sm, x -> (x >= dep.index()), x -> x-1, false); + + return true; + } + +} + diff --git a/src/edu/stanford/nlp/semgraph/semgrex/ssurgeon/Ssurgeon.java b/src/edu/stanford/nlp/semgraph/semgrex/ssurgeon/Ssurgeon.java index 014f8c2233..97a785e740 100644 --- a/src/edu/stanford/nlp/semgraph/semgrex/ssurgeon/Ssurgeon.java +++ b/src/edu/stanford/nlp/semgraph/semgrex/ssurgeon/Ssurgeon.java @@ -82,6 +82,7 @@ *
  • {@code addDep -gov node1 -reln depType -position where ...attributes...} *
  • {@code editNode -node node ...attributes...} *
  • {@code setRoots n1 (n2 n3 ...)} + *
  • {@code mergeNodes n1 n2 ...attributes...} *
  • {@code killAllIncomingEdges -node node} *
  • {@code deleteGraphFromNode -node node} *
  • {@code killNonRootedNodes} @@ -134,6 +135,11 @@ * This is best done in conjunction with other operations which actually manipulate the structure * of the graph, or the new root will weirdly have dependents and the graph will be incorrect. *

    + * {@code mergeNodes} will merge n1 and n2, assuming they are mergeable; any attributes given override the merged node's defaults. + * The nodes can be merged if one of the nodes is the head of the other + * and the dependent has no other edges in or out. TODO: can make it process + * more than two nodes at once. + *

    * {@code killAllIncomingEdges} deletes all edges to a node. * {@code -node} is the node to edit. * Note that this is the same as {@code removeEdge} with only the dependent set. @@ -496,6 +502,13 @@ public static SsurgeonEdit parseEditLine(String editLine, Map at String[] names = tuples1[1].split("\\s+"); List newRoots = Arrays.asList(names); return new SetRoots(newRoots); + } else if (command.equalsIgnoreCase(MergeNodes.LABEL)) { + String[] names = tuples1[1].split("\\s+", 3); + if (names.length == 2 && attributeArgs.size() == 0) { + return new MergeNodes(names[0], names[1], Collections.emptyMap()); + } + final SsurgeonArgs argsBox = parseArgsBox(names.length == 2 ? "" : names[2], attributeArgs); + return new MergeNodes(names[0], names[1], argsBox.annotations); } else if (command.equalsIgnoreCase(KillNonRootedNodes.LABEL)) { return new KillNonRootedNodes(); } diff --git a/test/src/edu/stanford/nlp/semgraph/semgrex/ssurgeon/SsurgeonTest.java b/test/src/edu/stanford/nlp/semgraph/semgrex/ssurgeon/SsurgeonTest.java index 633d6ff573..0dfe69beb9 100644 --- a/test/src/edu/stanford/nlp/semgraph/semgrex/ssurgeon/SsurgeonTest.java +++ b/test/src/edu/stanford/nlp/semgraph/semgrex/ssurgeon/SsurgeonTest.java @@ -1063,6 +1063,147 @@ public void readXMLAddDepBrokenAnnotation() { } } + /** + * Test a basic case of two nodes that should be merged + *
    + * The indices should be changed as well + */ + @Test + public void readXMLMergeNodes() { + Ssurgeon inst = Ssurgeon.inst(); + + // Test the head word being the first word + String merge = String.join(newline, + "", + " ", + " 38", + " Merge two nodes that should not have been split", + " " + XMLUtils.escapeXML("{word:prof}=source >punct ({}=punct . {} !> {})") + "", + " mergeNodes source punct", + " ", + ""); + List patterns = inst.readFromString(merge); + assertEquals(patterns.size(), 1); + SsurgeonPattern mergeSsurgeon = patterns.get(0); + + SemanticGraph sg = SemanticGraph.valueOf("[fare-7 aux> potrebbe-6 nsubj> [prof-3 det> Il-2 punct> .-4 nmod> Fotticchia-5] obj> [gag-9 det> una-8] obl> [situazione-12 case> su-10 det> la-11]]", Language.UniversalEnglish); + SemanticGraph newSG = mergeSsurgeon.iterate(sg).first; + SemanticGraph expected = SemanticGraph.valueOf("[fare-6 aux> potrebbe-5 nsubj> [prof.-3 det> Il-2 nmod> Fotticchia-4] obj> [gag-8 det> una-7] obl> [situazione-11 case> su-9 det> la-10]]", Language.UniversalEnglish); + assertEquals(expected, newSG); + IndexedWord prof = sg.getNodeByIndexSafe(3); + assertNotNull(prof); + assertEquals("prof.", prof.word()); + assertEquals("prof.", prof.value()); + assertNull(prof.lemma()); + + // Same test, but this time test merging the lemmas + sg = SemanticGraph.valueOf("[fare-7 aux> potrebbe-6 nsubj> [prof-3 det> Il-2 punct> .-4 nmod> Fotticchia-5] obj> [gag-9 det> una-8] obl> [situazione-12 case> su-10 det> la-11]]", Language.UniversalEnglish); + sg.getNodeByIndexSafe(3).setLemma("prof"); + sg.getNodeByIndexSafe(4).setLemma("."); + newSG = mergeSsurgeon.iterate(sg).first; + assertEquals(expected, newSG); + prof = sg.getNodeByIndexSafe(3); + assertEquals("prof.", prof.lemma()); + + // Test the head word being the second word + merge = String.join(newline, + "", + " ", + " 38", + " Merge two nodes that should not have been split", + " " + XMLUtils.escapeXML("{word:prof}=source >punct ({}=punct . 
{} !> {})") + "", + " mergeNodes source punct", + " ", + ""); + patterns = inst.readFromString(merge); + assertEquals(patterns.size(), 1); + mergeSsurgeon = patterns.get(0); + + // Check what happens if the root of the phrase is on the right and the dep is on the left + // The words & lemmas should still hopefully be merged in order + sg = SemanticGraph.valueOf("[fare-7 aux> potrebbe-6 nsubj> [prof-4 det> Il-2 punct> .-3 nmod> Fotticchia-5] obj> [gag-9 det> una-8] obl> [situazione-12 case> su-10 det> la-11]]", Language.UniversalEnglish); + sg.getNodeByIndexSafe(3).setLemma("."); + assertEquals(".", sg.getNodeByIndexSafe(3).word()); + sg.getNodeByIndexSafe(4).setLemma("prof"); + newSG = mergeSsurgeon.iterate(sg).first; + expected = SemanticGraph.valueOf("[fare-6 aux> potrebbe-5 nsubj> [.prof-3 det> Il-2 nmod> Fotticchia-4] obj> [gag-8 det> una-7] obl> [situazione-11 case> su-9 det> la-10]]", Language.UniversalEnglish); + assertEquals(expected, newSG); + prof = newSG.getNodeByIndexSafe(3); + assertEquals(".prof", prof.word()); + assertEquals(".prof", prof.lemma()); + } + + + /** + * Test a basic case of two nodes that should be merged + *
    + * The indices should be changed as well + */ + @Test + public void readXMLMergeNodesAttributes() { + Ssurgeon inst = Ssurgeon.inst(); + + // Test the head word being the first word + String merge = String.join(newline, + "", + " ", + " 38", + " Merge two nodes that should not have been split", + " " + XMLUtils.escapeXML("{word:prof}=source >punct ({}=punct . {} !> {})") + "", + " mergeNodes source punct -word foo -lemma bar", + " ", + ""); + List patterns = inst.readFromString(merge); + assertEquals(patterns.size(), 1); + SsurgeonPattern mergeSsurgeon = patterns.get(0); + + SemanticGraph sg = SemanticGraph.valueOf("[fare-7 aux> potrebbe-6 nsubj> [prof-3 det> Il-2 punct> .-4 nmod> Fotticchia-5] obj> [gag-9 det> una-8] obl> [situazione-12 case> su-10 det> la-11]]", Language.UniversalEnglish); + SemanticGraph newSG = mergeSsurgeon.iterate(sg).first; + SemanticGraph expected = SemanticGraph.valueOf("[fare-6 aux> potrebbe-5 nsubj> [foo-3 det> Il-2 nmod> Fotticchia-4] obj> [gag-8 det> una-7] obl> [situazione-11 case> su-9 det> la-10]]", Language.UniversalEnglish); + assertEquals(expected, newSG); + IndexedWord prof = sg.getNodeByIndexSafe(3); + assertNotNull(prof); + assertEquals("foo", prof.word()); + assertEquals("foo", prof.value()); + assertEquals("bar", prof.lemma()); + } + + /** + * Test a basic case of two nodes that should be merged + *
    + * ...except the dependent has an extra edge, so the merge is refused and the graph is unchanged + */ + @Test + public void readXMLMergeNodesFailCases() { + Ssurgeon inst = Ssurgeon.inst(); + + // the pattern merges a word with the punct that depends on it and immediately follows it + String merge = String.join(newline, + "", + " ", + " 38", + " Merge two nodes that should not have been split", + " " + XMLUtils.escapeXML("{word:prof}=source >punct ({}=punct . {} !> {})") + "", + " mergeNodes source punct", + " ", + ""); + List patterns = inst.readFromString(merge); + assertEquals(patterns.size(), 1); + SsurgeonPattern mergeSsurgeon = patterns.get(0); + + // Add an extra edge from the punct we want to squash to somewhere else + // The graph should not be changed + SemanticGraph sg = SemanticGraph.valueOf("[fare-7 aux> potrebbe-6 nsubj> [prof-3 det> Il-2 nmod> Fotticchia-5 punct> [.-4 nmod> Fotticchia-5]] obj> [gag-9 det> una-8] obl> [situazione-12 case> su-10 det> la-11]]", Language.UniversalEnglish); + SemanticGraph newSG = mergeSsurgeon.iterate(sg).first; + SemanticGraph expected = SemanticGraph.valueOf("[fare-7 aux> potrebbe-6 nsubj> [prof-3 det> Il-2 nmod> Fotticchia-5 punct> [.-4 nmod> Fotticchia-5]] obj> [gag-9 det> una-8] obl> [situazione-12 case> su-10 det> la-11]]", Language.UniversalEnglish); + assertEquals(expected, newSG); + + sg = SemanticGraph.valueOf("[fare-7 aux> potrebbe-6 nsubj> [prof-3 det> Il-2 nmod> [Fotticchia-5 punct> .-4] punct> .-4] obj> [gag-9 det> una-8] obl> [situazione-12 case> su-10 det> la-11]]", Language.UniversalEnglish); + newSG = mergeSsurgeon.iterate(sg).first; + expected = SemanticGraph.valueOf("[fare-7 aux> potrebbe-6 nsubj> [prof-3 det> Il-2 nmod> [Fotticchia-5 punct> .-4] punct> .-4] obj> [gag-9 det> una-8] obl> [situazione-12 case> su-10 det> la-11]]", Language.UniversalEnglish); + assertEquals(expected, newSG); + } + /** * The AddDep should update the matches in the SemgrexMatcher. * If that isn't done correctly, then moving the words first