diff --git a/src/edu/stanford/nlp/semgraph/semgrex/ssurgeon/AddDep.java b/src/edu/stanford/nlp/semgraph/semgrex/ssurgeon/AddDep.java
index 012dd9dc0f..f11ee42d9f 100644
--- a/src/edu/stanford/nlp/semgraph/semgrex/ssurgeon/AddDep.java
+++ b/src/edu/stanford/nlp/semgraph/semgrex/ssurgeon/AddDep.java
@@ -139,11 +139,16 @@ public static void moveNode(SemanticGraph sg, SemgrexMatcher sm, IndexedWord wor
}
}
- public static void moveNodes(SemanticGraph sg, SemgrexMatcher sm, Function shouldMove, Function destination) {
+ /**
+ * reverse: operate in reverse order, highest index to first. You want true if moving indices up, false if moving indices down
+ */
+ public static void moveNodes(SemanticGraph sg, SemgrexMatcher sm, Function shouldMove, Function destination, boolean reverse) {
// iterate first, then move, so that we don't screw up the graph while iterating
List toMove = sg.vertexSet().stream().filter(x -> shouldMove.apply(x.index())).collect(Collectors.toList());
Collections.sort(toMove);
- Collections.reverse(toMove);
+ if (reverse) {
+ Collections.reverse(toMove);
+ }
for (IndexedWord word : toMove) {
moveNode(sg, sm, word, destination.apply(word.index()));
}
@@ -166,8 +171,8 @@ public boolean evaluate(SemanticGraph sg, SemgrexMatcher sm) {
// +2 to leave room: we will increase all other nodes with the
// proper index, so we need +1 of room, then another +1 for
// a temp place to put this node
- // TODO: when we implement updating the SemgrexMatcher,
- // this won't be necessary
+ // TODO: we could theoretically put the new node in the right place
+ // immediately and move the other nodes, but this is easier
tempIndex = SemanticGraphUtils.maxIndex(sg) + 2;
if (position.equals("-")) {
@@ -203,7 +208,7 @@ public boolean evaluate(SemanticGraph sg, SemgrexMatcher sm) {
if (position != null && !position.equals("+")) {
// the payoff for tempIndex == maxIndex + 2:
// everything will be moved one higher, unless it's the new node
- moveNodes(sg, sm, x -> (x >= newIndex && x != tempIndex), x -> x+1);
+ moveNodes(sg, sm, x -> (x >= newIndex && x != tempIndex), x -> x+1, true);
moveNode(sg, sm, newNode, newIndex);
}
diff --git a/src/edu/stanford/nlp/semgraph/semgrex/ssurgeon/MergeNodes.java b/src/edu/stanford/nlp/semgraph/semgrex/ssurgeon/MergeNodes.java
new file mode 100644
index 0000000000..f56836704b
--- /dev/null
+++ b/src/edu/stanford/nlp/semgraph/semgrex/ssurgeon/MergeNodes.java
@@ -0,0 +1,159 @@
+package edu.stanford.nlp.semgraph.semgrex.ssurgeon;
+
+import java.io.StringWriter;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Map;
+import java.util.TreeMap;
+
+import edu.stanford.nlp.ling.CoreAnnotations;
+import edu.stanford.nlp.ling.CoreLabel;
+import edu.stanford.nlp.ling.IndexedWord;
+import edu.stanford.nlp.semgraph.SemanticGraph;
+import edu.stanford.nlp.semgraph.SemanticGraphEdge;
+import edu.stanford.nlp.semgraph.semgrex.SemgrexMatcher;
+
+/**
+ * Combines two words into one word
+ *
+ * This requires one of the nodes to be the head of a phrase of the words,
+ * and the dependent words can't have any extra edges in or out of that subgraph
+ *
+ * The word and lemma will be the combination of the words, squished together.
+ * Before and after will be updated to use the before and after of the endpoints of the subgraph
+ *
+ * @author John Bauer
+ */
+public class MergeNodes extends SsurgeonEdit {
+ public static final String LABEL = "mergeNodes";
+ final String name1;
+ final String name2;
+ final Map attributes;
+
+ public MergeNodes(String name1, String name2, Map attributes) {
+ this.name1 = name1;
+ this.name2 = name2;
+ this.attributes = new TreeMap<>(attributes);
+ }
+
+ /**
+ * Emits a parseable instruction string.
+ */
+ @Override
+ public String toEditString() {
+ StringWriter buf = new StringWriter();
+ buf.write(LABEL); buf.write("\t");
+ buf.write(name1); buf.write("\t");
+ buf.write(name2);
+
+ // TODO: some attributes might need to be escaped!
+ for (String key : attributes.keySet()) {
+ buf.write("\t-");
+ buf.write(key);
+ buf.write(" ");
+ buf.write(attributes.get(key));
+ }
+
+ return buf.toString();
+ }
+
+ /**
+ * If the two named nodes are next to each other, and the edges of
+ * the graph allow for it, squish the two words into one word
+ */
+ @Override
+ public boolean evaluate(SemanticGraph sg, SemgrexMatcher sm) {
+ IndexedWord node1 = sm.getNode(name1);
+ IndexedWord node2 = sm.getNode(name2);
+
+ if (node1 == null || node2 == null) {
+ return false;
+ }
+
+ List n1_to_n2 = sg.getAllEdges(node1, node2);
+ List n2_to_n1 = sg.getAllEdges(node2, node1);
+ if (n1_to_n2.size() == 0 && n2_to_n1.size() == 0) {
+ return false;
+ }
+
+ // TODO: what about the case where the dep is or has copies?
+ final IndexedWord head;
+ final IndexedWord dep;
+
+ if (n1_to_n2.size() > 0) {
+ head = node1;
+ dep = node2;
+ } else {
+ head = node2;
+ dep = node1;
+ }
+
+ // If the dep has any edges that aren't between dep & head, abort
+ // TODO: we could probably make it adjust edges with "dep" as source, instead
+ for (SemanticGraphEdge e : sg.outgoingEdgeIterable(dep)) {
+ if (e.getTarget() != head) {
+ return false;
+ }
+ }
+ for (SemanticGraphEdge e : sg.incomingEdgeIterable(dep)) {
+ if (e.getSource() != head) {
+ return false;
+ }
+ }
+
+ IndexedWord left;
+ IndexedWord right;
+ if (node1.index() < node2.index()) {
+ left = node1;
+ right = node2;
+ } else {
+ left = node2;
+ right = node1;
+ }
+
+ CoreLabel newLabel = AddDep.fromCheapStrings(attributes);
+ if (newLabel.word() == null) {
+ String newWord = left.word() + right.word();
+ newLabel.setWord(newWord);
+ }
+ if (newLabel.value() == null) {
+ newLabel.setValue(newLabel.word());
+ }
+ if (newLabel.lemma() == null) {
+ String newLemma = left.lemma() != null && right.lemma() != null ? left.lemma() + right.lemma() : null;
+ newLabel.setLemma(newLemma);
+ }
+ // after() and before() return "" if null, so we need to use the CoreAnnotations directly
+ if (newLabel.get(CoreAnnotations.AfterAnnotation.class) == null) {
+ newLabel.setAfter(right.after());
+ }
+ if (newLabel.get(CoreAnnotations.BeforeAnnotation.class) == null) {
+ newLabel.setBefore(right.before());
+ }
+
+ for (IndexedWord vertex : sg.vertexSet()) {
+ if (vertex.index() == head.index()) {
+ for (Class key : newLabel.keySet()) {
+ Object value = newLabel.get(key);
+ vertex.set(key, value);
+ }
+ }
+ }
+
+ // copy the list so that deletion doesn't hurt the iterator
+ // TODO: super fancy would be implementing iterator.remove()
+ // on the Set returned by the SemanticGraph
+ for (IndexedWord vertex : sg.vertexListSorted()) {
+ if (vertex.index() == dep.index()) {
+ sg.removeVertex(vertex);
+ }
+ }
+
+ // reindex everyone
+ AddDep.moveNodes(sg, sm, x -> (x >= dep.index()), x -> x-1, false);
+
+ return true;
+ }
+
+}
+
diff --git a/src/edu/stanford/nlp/semgraph/semgrex/ssurgeon/Ssurgeon.java b/src/edu/stanford/nlp/semgraph/semgrex/ssurgeon/Ssurgeon.java
index 014f8c2233..97a785e740 100644
--- a/src/edu/stanford/nlp/semgraph/semgrex/ssurgeon/Ssurgeon.java
+++ b/src/edu/stanford/nlp/semgraph/semgrex/ssurgeon/Ssurgeon.java
@@ -82,6 +82,7 @@
* {@code addDep -gov node1 -reln depType -position where ...attributes...}
* {@code editNode -node node ...attributes...}
* {@code setRoots n1 (n2 n3 ...)}
+ * {@code mergeNodes n1 n2}
* {@code killAllIncomingEdges -node node}
* {@code deleteGraphFromNode -node node}
* {@code killNonRootedNodes}
@@ -134,6 +135,11 @@
* This is best done in conjunction with other operations which actually manipulate the structure
* of the graph, or the new root will weirdly have dependents and the graph will be incorrect.
*
+ * {@code mergeNodes} will merge n1 and n2, assuming they are mergeable.
+ * The nodes can be merged if one of the nodes is the head of a phrase
+ * and the other node depends on the head. TODO: can make it process
+ * more than two nodes at once.
+ *
* {@code killAllIncomingEdges} deletes all edges to a node.
* {@code -node} is the node to edit.
* Note that this is the same as {@code removeEdge} with only the dependent set.
@@ -496,6 +502,13 @@ public static SsurgeonEdit parseEditLine(String editLine, Map at
String[] names = tuples1[1].split("\\s+");
List newRoots = Arrays.asList(names);
return new SetRoots(newRoots);
+ } else if (command.equalsIgnoreCase(MergeNodes.LABEL)) {
+ String[] names = tuples1[1].split("\\s+", 3);
+ if (names.length == 2 && attributeArgs.size() == 0) {
+ return new MergeNodes(names[0], names[1], Collections.emptyMap());
+ }
+ final SsurgeonArgs argsBox = parseArgsBox(names.length == 2 ? "" : names[2], attributeArgs);
+ return new MergeNodes(names[0], names[1], argsBox.annotations);
} else if (command.equalsIgnoreCase(KillNonRootedNodes.LABEL)) {
return new KillNonRootedNodes();
}
diff --git a/test/src/edu/stanford/nlp/semgraph/semgrex/ssurgeon/SsurgeonTest.java b/test/src/edu/stanford/nlp/semgraph/semgrex/ssurgeon/SsurgeonTest.java
index 633d6ff573..0dfe69beb9 100644
--- a/test/src/edu/stanford/nlp/semgraph/semgrex/ssurgeon/SsurgeonTest.java
+++ b/test/src/edu/stanford/nlp/semgraph/semgrex/ssurgeon/SsurgeonTest.java
@@ -1063,6 +1063,147 @@ public void readXMLAddDepBrokenAnnotation() {
}
}
+ /**
+ * Test a basic case of two nodes that should be merged
+ *
+ * The indices should be changed as well
+ */
+ @Test
+ public void readXMLMergeNodes() {
+ Ssurgeon inst = Ssurgeon.inst();
+
+ // Test the head word being the first word
+ String merge = String.join(newline,
+ "",
+ " ",
+ " 38",
+ " Merge two nodes that should not have been split",
+ " " + XMLUtils.escapeXML("{word:prof}=source >punct ({}=punct . {} !> {})") + "",
+ " mergeNodes source punct",
+ " ",
+ "");
+ List patterns = inst.readFromString(merge);
+ assertEquals(patterns.size(), 1);
+ SsurgeonPattern mergeSsurgeon = patterns.get(0);
+
+ SemanticGraph sg = SemanticGraph.valueOf("[fare-7 aux> potrebbe-6 nsubj> [prof-3 det> Il-2 punct> .-4 nmod> Fotticchia-5] obj> [gag-9 det> una-8] obl> [situazione-12 case> su-10 det> la-11]]", Language.UniversalEnglish);
+ SemanticGraph newSG = mergeSsurgeon.iterate(sg).first;
+ SemanticGraph expected = SemanticGraph.valueOf("[fare-6 aux> potrebbe-5 nsubj> [prof.-3 det> Il-2 nmod> Fotticchia-4] obj> [gag-8 det> una-7] obl> [situazione-11 case> su-9 det> la-10]]", Language.UniversalEnglish);
+ assertEquals(expected, newSG);
+ IndexedWord prof = sg.getNodeByIndexSafe(3);
+ assertNotNull(prof);
+ assertEquals("prof.", prof.word());
+ assertEquals("prof.", prof.value());
+ assertNull(prof.lemma());
+
+ // Same test, but this time test merging the lemmas
+ sg = SemanticGraph.valueOf("[fare-7 aux> potrebbe-6 nsubj> [prof-3 det> Il-2 punct> .-4 nmod> Fotticchia-5] obj> [gag-9 det> una-8] obl> [situazione-12 case> su-10 det> la-11]]", Language.UniversalEnglish);
+ sg.getNodeByIndexSafe(3).setLemma("prof");
+ sg.getNodeByIndexSafe(4).setLemma(".");
+ newSG = mergeSsurgeon.iterate(sg).first;
+ assertEquals(expected, newSG);
+ prof = sg.getNodeByIndexSafe(3);
+ assertEquals("prof.", prof.lemma());
+
+ // Test the head word being the second word
+ merge = String.join(newline,
+ "",
+ " ",
+ " 38",
+ " Merge two nodes that should not have been split",
+ " " + XMLUtils.escapeXML("{word:prof}=source >punct ({}=punct . {} !> {})") + "",
+ " mergeNodes source punct",
+ " ",
+ "");
+ patterns = inst.readFromString(merge);
+ assertEquals(patterns.size(), 1);
+ mergeSsurgeon = patterns.get(0);
+
+ // Check what happens if the root of the phrase is on the right and the dep is on the left
+ // The words & lemmas should still hopefully be merged in order
+ sg = SemanticGraph.valueOf("[fare-7 aux> potrebbe-6 nsubj> [prof-4 det> Il-2 punct> .-3 nmod> Fotticchia-5] obj> [gag-9 det> una-8] obl> [situazione-12 case> su-10 det> la-11]]", Language.UniversalEnglish);
+ sg.getNodeByIndexSafe(3).setLemma(".");
+ assertEquals(".", sg.getNodeByIndexSafe(3).word());
+ sg.getNodeByIndexSafe(4).setLemma("prof");
+ newSG = mergeSsurgeon.iterate(sg).first;
+ expected = SemanticGraph.valueOf("[fare-6 aux> potrebbe-5 nsubj> [.prof-3 det> Il-2 nmod> Fotticchia-4] obj> [gag-8 det> una-7] obl> [situazione-11 case> su-9 det> la-10]]", Language.UniversalEnglish);
+ assertEquals(expected, newSG);
+ prof = newSG.getNodeByIndexSafe(3);
+ assertEquals(".prof", prof.word());
+ assertEquals(".prof", prof.lemma());
+ }
+
+
+ /**
+ * Test a basic case of two nodes that should be merged
+ *
+ * The indices should be changed as well
+ */
+ @Test
+ public void readXMLMergeNodesAttributes() {
+ Ssurgeon inst = Ssurgeon.inst();
+
+ // Test the head word being the first word
+ String merge = String.join(newline,
+ "",
+ " ",
+ " 38",
+ " Merge two nodes that should not have been split",
+ " " + XMLUtils.escapeXML("{word:prof}=source >punct ({}=punct . {} !> {})") + "",
+ " mergeNodes source punct -word foo -lemma bar",
+ " ",
+ "");
+ List patterns = inst.readFromString(merge);
+ assertEquals(patterns.size(), 1);
+ SsurgeonPattern mergeSsurgeon = patterns.get(0);
+
+ SemanticGraph sg = SemanticGraph.valueOf("[fare-7 aux> potrebbe-6 nsubj> [prof-3 det> Il-2 punct> .-4 nmod> Fotticchia-5] obj> [gag-9 det> una-8] obl> [situazione-12 case> su-10 det> la-11]]", Language.UniversalEnglish);
+ SemanticGraph newSG = mergeSsurgeon.iterate(sg).first;
+ SemanticGraph expected = SemanticGraph.valueOf("[fare-6 aux> potrebbe-5 nsubj> [foo-3 det> Il-2 nmod> Fotticchia-4] obj> [gag-8 det> una-7] obl> [situazione-11 case> su-9 det> la-10]]", Language.UniversalEnglish);
+ assertEquals(expected, newSG);
+ IndexedWord prof = sg.getNodeByIndexSafe(3);
+ assertNotNull(prof);
+ assertEquals("foo", prof.word());
+ assertEquals("foo", prof.value());
+ assertEquals("bar", prof.lemma());
+ }
+
+ /**
+ * Test a basic case of two nodes that should be merged
+ *
+ * The indices should be changed as well
+ */
+ @Test
+ public void readXMLMergeNodesFailCases() {
+ Ssurgeon inst = Ssurgeon.inst();
+
+ // use "dep" as the dependency so as to be language-agnostic in this test
+ String merge = String.join(newline,
+ "",
+ " ",
+ " 38",
+ " Merge two nodes that should not have been split",
+ " " + XMLUtils.escapeXML("{word:prof}=source >punct ({}=punct . {} !> {})") + "",
+ " mergeNodes source punct",
+ " ",
+ "");
+ List patterns = inst.readFromString(merge);
+ assertEquals(patterns.size(), 1);
+ SsurgeonPattern mergeSsurgeon = patterns.get(0);
+
+ // Add an extra edge from the punct we want to squash to somewhere else
+ // The graph should not be changed
+ SemanticGraph sg = SemanticGraph.valueOf("[fare-7 aux> potrebbe-6 nsubj> [prof-3 det> Il-2 nmod> Fotticchia-5 punct> [.-4 nmod> Fotticchia-5]] obj> [gag-9 det> una-8] obl> [situazione-12 case> su-10 det> la-11]]", Language.UniversalEnglish);
+ SemanticGraph newSG = mergeSsurgeon.iterate(sg).first;
+ SemanticGraph expected = SemanticGraph.valueOf("[fare-7 aux> potrebbe-6 nsubj> [prof-3 det> Il-2 nmod> Fotticchia-5 punct> [.-4 nmod> Fotticchia-5]] obj> [gag-9 det> una-8] obl> [situazione-12 case> su-10 det> la-11]]", Language.UniversalEnglish);
+ assertEquals(expected, newSG);
+
+ sg = SemanticGraph.valueOf("[fare-7 aux> potrebbe-6 nsubj> [prof-3 det> Il-2 nmod> [Fotticchia-5 punct> .-4] punct> .-4] obj> [gag-9 det> una-8] obl> [situazione-12 case> su-10 det> la-11]]", Language.UniversalEnglish);
+ newSG = mergeSsurgeon.iterate(sg).first;
+ expected = SemanticGraph.valueOf("[fare-7 aux> potrebbe-6 nsubj> [prof-3 det> Il-2 nmod> [Fotticchia-5 punct> .-4] punct> .-4] obj> [gag-9 det> una-8] obl> [situazione-12 case> su-10 det> la-11]]", Language.UniversalEnglish);
+ assertEquals(expected, newSG);
+ }
+
/**
* The AddDep should update the matches in the SemgrexMatcher.
* If that isn't done correctly, then moving the words first