Add a two-word MergeNodes operation.

MergeNodes operation allows for arbitrary attributes on the new word, or makes a best effort at putting a combined word & lemma on the new combined word if nothing was specified. Would like to extend it to handle multiple words at once
stanfordnlp · Mar 26, 2023 · 0660fa9 · 0660fa9
1 parent db0bd45
commit 0660fa9
Show file tree

Hide file tree

Showing 4 changed files with 323 additions and 5 deletions.
diff --git a/src/edu/stanford/nlp/semgraph/semgrex/ssurgeon/AddDep.java b/src/edu/stanford/nlp/semgraph/semgrex/ssurgeon/AddDep.java
@@ -139,11 +139,16 @@ public static void moveNode(SemanticGraph sg, SemgrexMatcher sm, IndexedWord wor
     }
   }
 
-  public static void moveNodes(SemanticGraph sg, SemgrexMatcher sm, Function<Integer, Boolean> shouldMove, Function<Integer, Integer> destination) {
+  /**
+   * reverse: operate in reverse order, highest index to first.  You want true if moving indices up, false if moving indices down
+   */
+  public static void moveNodes(SemanticGraph sg, SemgrexMatcher sm, Function<Integer, Boolean> shouldMove, Function<Integer, Integer> destination, boolean reverse) {
     // iterate first, then move, so that we don't screw up the graph while iterating
     List<IndexedWord> toMove = sg.vertexSet().stream().filter(x -> shouldMove.apply(x.index())).collect(Collectors.toList());
     Collections.sort(toMove);
-    Collections.reverse(toMove);
+    if (reverse) {
+      Collections.reverse(toMove);
+    }
     for (IndexedWord word : toMove) {
       moveNode(sg, sm, word, destination.apply(word.index()));
     }
@@ -166,8 +171,8 @@ public boolean evaluate(SemanticGraph sg, SemgrexMatcher sm) {
       // +2 to leave room: we will increase all other nodes with the
       // proper index, so we need +1 of room, then another +1 for
       // a temp place to put this node
-      // TODO: when we implement updating the SemgrexMatcher,
-      // this won't be necessary
+      // TODO: we could theoretically put the new node in the right place
+      // immediately and move the other nodes, but this is easier
       tempIndex = SemanticGraphUtils.maxIndex(sg) + 2;
 
       if (position.equals("-")) {
@@ -203,7 +208,7 @@ public boolean evaluate(SemanticGraph sg, SemgrexMatcher sm) {
     if (position != null && !position.equals("+")) {
       // the payoff for tempIndex == maxIndex + 2:
       // everything will be moved one higher, unless it's the new node
-      moveNodes(sg, sm, x -> (x >= newIndex && x != tempIndex), x -> x+1);
+      moveNodes(sg, sm, x -> (x >= newIndex && x != tempIndex), x -> x+1, true);
       moveNode(sg, sm, newNode, newIndex);
     }
 

diff --git a/src/edu/stanford/nlp/semgraph/semgrex/ssurgeon/MergeNodes.java b/src/edu/stanford/nlp/semgraph/semgrex/ssurgeon/MergeNodes.java
@@ -0,0 +1,159 @@
+package edu.stanford.nlp.semgraph.semgrex.ssurgeon;
+
+import java.io.StringWriter;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Map;
+import java.util.TreeMap;
+
+import edu.stanford.nlp.ling.CoreAnnotations;
+import edu.stanford.nlp.ling.CoreLabel;
+import edu.stanford.nlp.ling.IndexedWord;
+import edu.stanford.nlp.semgraph.SemanticGraph;
+import edu.stanford.nlp.semgraph.SemanticGraphEdge;
+import edu.stanford.nlp.semgraph.semgrex.SemgrexMatcher;
+
+/**
+ * Combines two words into one word
+ *<br>
+ * This requires one of the nodes to be the head of a phrase of the words,
+ * and the dependent words can't have any extra edges in or out of that subgraph
+ *<br>
+ * The word and lemma will be the combination of the words, squished together.
+ * Before and after will be updated to use the before and after of the endpoints of the subgraph
+ *
+ * @author John Bauer
+ */
+public class MergeNodes extends SsurgeonEdit {
+  public static final String LABEL = "mergeNodes";
+  final String name1;
+  final String name2;
+  final Map<String, String> attributes;
+
+  public MergeNodes(String name1, String name2, Map<String, String> attributes) {
+    this.name1 = name1;
+    this.name2 = name2;
+    this.attributes = new TreeMap<>(attributes);
+  }
+
+  /**
+   * Emits a parseable instruction string.
+   */
+  @Override
+  public String toEditString() {
+    StringWriter buf = new StringWriter();
+    buf.write(LABEL);  buf.write("\t");
+    buf.write(name1);  buf.write("\t");
+    buf.write(name2);
+
+    // TODO: some attributes might need to be escaped!
+    for (String key : attributes.keySet()) {
+      buf.write("\t-");
+      buf.write(key);
+      buf.write(" ");
+      buf.write(attributes.get(key));
+    }
+
+    return buf.toString();
+  }
+
+  /**
+   * If the two named nodes are next to each other, and the edges of
+   * the graph allow for it, squish the two words into one word
+   */
+  @Override
+  public boolean evaluate(SemanticGraph sg, SemgrexMatcher sm) {
+    IndexedWord node1 = sm.getNode(name1);
+    IndexedWord node2 = sm.getNode(name2);
+
+    if (node1 == null || node2 == null) {
+      return false;
+    }
+
+    List<SemanticGraphEdge> n1_to_n2 = sg.getAllEdges(node1, node2);
+    List<SemanticGraphEdge> n2_to_n1 = sg.getAllEdges(node2, node1);
+    if (n1_to_n2.size() == 0 && n2_to_n1.size() == 0) {
+      return false;
+    }
+
+    // TODO: what about the case where the dep is or has copies?
+    final IndexedWord head;
+    final IndexedWord dep;
+
+    if (n1_to_n2.size() > 0) {
+      head = node1;
+      dep = node2;
+    } else {
+      head = node2;
+      dep = node1;
+    }
+
+    // If the dep has any edges that aren't between dep & head, abort
+    // TODO: we could probably make it adjust edges with "dep" as source, instead
+    for (SemanticGraphEdge e : sg.outgoingEdgeIterable(dep)) {
+      if (e.getTarget() != head) {
+        return false;
+      }
+    }
+    for (SemanticGraphEdge e : sg.incomingEdgeIterable(dep)) {
+      if (e.getSource() != head) {
+        return false;
+      }
+    }
+
+    IndexedWord left;
+    IndexedWord right;
+    if (node1.index() < node2.index()) {
+      left = node1;
+      right = node2;
+    } else {
+      left = node2;
+      right = node1;
+    }
+
+    CoreLabel newLabel = AddDep.fromCheapStrings(attributes);
+    if (newLabel.word() == null) {
+      String newWord = left.word() + right.word();
+      newLabel.setWord(newWord);
+    }
+    if (newLabel.value() == null) {
+      newLabel.setValue(newLabel.word());
+    }
+    if (newLabel.lemma() == null) {
+      String newLemma = left.lemma() != null && right.lemma() != null ? left.lemma() + right.lemma() : null;
+      newLabel.setLemma(newLemma);
+    }
+    // after() and before() return "" if null, so we need to use the CoreAnnotations directly
+    if (newLabel.get(CoreAnnotations.AfterAnnotation.class) == null) {
+      newLabel.setAfter(right.after());
+    }
+    if (newLabel.get(CoreAnnotations.BeforeAnnotation.class) == null) {
+      newLabel.setBefore(right.before());
+    }
+
+    for (IndexedWord vertex : sg.vertexSet()) {
+      if (vertex.index() == head.index()) {
+        for (Class key : newLabel.keySet()) {
+          Object value = newLabel.get(key);
+          vertex.set(key, value);
+        }
+      }
+    }
+
+    // copy the list so that deletion doesn't hurt the iterator
+    // TODO: super fancy would be implementing iterator.remove()
+    // on the Set returned by the SemanticGraph
+    for (IndexedWord vertex : sg.vertexListSorted()) {
+      if (vertex.index() == dep.index()) {
+        sg.removeVertex(vertex);
+      }
+    }
+
+    // reindex everyone
+    AddDep.moveNodes(sg, sm, x -> (x >= dep.index()), x -> x-1, false);
+
+    return true;
+  }
+
+}
+
diff --git a/src/edu/stanford/nlp/semgraph/semgrex/ssurgeon/Ssurgeon.java b/src/edu/stanford/nlp/semgraph/semgrex/ssurgeon/Ssurgeon.java
@@ -82,6 +82,7 @@
  * <li> {@code addDep -gov node1 -reln depType -position where ...attributes...}
  * <li> {@code editNode -node node ...attributes...}
  * <li> {@code setRoots n1 (n2 n3 ...)}
+ * <li> {@code mergeNodes n1 n2}
  * <li> {@code killAllIncomingEdges -node node}
  * <li> {@code deleteGraphFromNode -node node}
  * <li> {@code killNonRootedNodes}
@@ -134,6 +135,11 @@
  * This is best done in conjunction with other operations which actually manipulate the structure
  * of the graph, or the new root will weirdly have dependents and the graph will be incorrect.
  *</p><p>
+ * {@code mergeNodes} will merge n1 and n2, assuming they are  mergeable.
+ * The nodes can be merged if one of the nodes is the head of a phrase
+ * and the other node depends on the head.  TODO: can make it process
+ * more than two nodes at once.
+ *</p><p>
  * {@code killAllIncomingEdges} deletes all edges to a node.
  * {@code -node} is the node to edit.
  * Note that this is the same as {@code removeEdge} with only the dependent set.
@@ -496,6 +502,13 @@ public static SsurgeonEdit parseEditLine(String editLine, Map<String, String> at
         String[] names = tuples1[1].split("\\s+");
         List<String> newRoots = Arrays.asList(names);
         return new SetRoots(newRoots);
+      } else if (command.equalsIgnoreCase(MergeNodes.LABEL)) {
+        String[] names = tuples1[1].split("\\s+", 3);
+        if (names.length == 2 && attributeArgs.size() == 0) {
+          return new MergeNodes(names[0], names[1], Collections.emptyMap());
+        }
+        final SsurgeonArgs argsBox = parseArgsBox(names.length == 2 ? "" : names[2], attributeArgs);
+        return new MergeNodes(names[0], names[1], argsBox.annotations);
       } else if (command.equalsIgnoreCase(KillNonRootedNodes.LABEL)) {
         return new KillNonRootedNodes();
       }

diff --git a/test/src/edu/stanford/nlp/semgraph/semgrex/ssurgeon/SsurgeonTest.java b/test/src/edu/stanford/nlp/semgraph/semgrex/ssurgeon/SsurgeonTest.java
@@ -1063,6 +1063,147 @@ public void readXMLAddDepBrokenAnnotation() {
     }
   }
 
+  /**
+   * Test a basic case of two nodes that should be merged
+   *<br>
+   * The indices should be changed as well
+   */
+  @Test
+  public void readXMLMergeNodes() {
+    Ssurgeon inst = Ssurgeon.inst();
+
+    // Test the head word being the first word
+    String merge = String.join(newline,
+                               "<ssurgeon-pattern-list>",
+                               "  <ssurgeon-pattern>",
+                               "    <uid>38</uid>",
+                               "    <notes>Merge two nodes that should not have been split</notes>",
+                               "    <semgrex>" + XMLUtils.escapeXML("{word:prof}=source >punct ({}=punct . {} !> {})") + "</semgrex>",
+                               "    <edit-list>mergeNodes source punct</edit-list>",
+                               "  </ssurgeon-pattern>",
+                               "</ssurgeon-pattern-list>");
+    List<SsurgeonPattern> patterns = inst.readFromString(merge);
+    assertEquals(patterns.size(), 1);
+    SsurgeonPattern mergeSsurgeon = patterns.get(0);
+
+    SemanticGraph sg = SemanticGraph.valueOf("[fare-7 aux> potrebbe-6 nsubj> [prof-3 det> Il-2 punct> .-4 nmod> Fotticchia-5] obj> [gag-9 det> una-8] obl> [situazione-12 case> su-10 det> la-11]]", Language.UniversalEnglish);
+    SemanticGraph newSG = mergeSsurgeon.iterate(sg).first;
+    SemanticGraph expected = SemanticGraph.valueOf("[fare-6 aux> potrebbe-5 nsubj> [prof.-3 det> Il-2 nmod> Fotticchia-4] obj> [gag-8 det> una-7] obl> [situazione-11 case> su-9 det> la-10]]", Language.UniversalEnglish);
+    assertEquals(expected, newSG);
+    IndexedWord prof = sg.getNodeByIndexSafe(3);
+    assertNotNull(prof);
+    assertEquals("prof.", prof.word());
+    assertEquals("prof.", prof.value());
+    assertNull(prof.lemma());
+
+    // Same test, but this time test merging the lemmas
+    sg = SemanticGraph.valueOf("[fare-7 aux> potrebbe-6 nsubj> [prof-3 det> Il-2 punct> .-4 nmod> Fotticchia-5] obj> [gag-9 det> una-8] obl> [situazione-12 case> su-10 det> la-11]]", Language.UniversalEnglish);
+    sg.getNodeByIndexSafe(3).setLemma("prof");
+    sg.getNodeByIndexSafe(4).setLemma(".");
+    newSG = mergeSsurgeon.iterate(sg).first;
+    assertEquals(expected, newSG);
+    prof = sg.getNodeByIndexSafe(3);
+    assertEquals("prof.", prof.lemma());
+
+    // Test the head word being the second word
+    merge = String.join(newline,
+                        "<ssurgeon-pattern-list>",
+                        "  <ssurgeon-pattern>",
+                        "    <uid>38</uid>",
+                        "    <notes>Merge two nodes that should not have been split</notes>",
+                        "    <semgrex>" + XMLUtils.escapeXML("{word:prof}=source >punct ({}=punct . {} !> {})") + "</semgrex>",
+                        "    <edit-list>mergeNodes source punct</edit-list>",
+                        "  </ssurgeon-pattern>",
+                        "</ssurgeon-pattern-list>");
+    patterns = inst.readFromString(merge);
+    assertEquals(patterns.size(), 1);
+    mergeSsurgeon = patterns.get(0);
+
+    // Check what happens if the root of the phrase is on the right and the dep is on the left
+    // The words & lemmas should still hopefully be merged in order
+    sg = SemanticGraph.valueOf("[fare-7 aux> potrebbe-6 nsubj> [prof-4 det> Il-2 punct> .-3 nmod> Fotticchia-5] obj> [gag-9 det> una-8] obl> [situazione-12 case> su-10 det> la-11]]", Language.UniversalEnglish);
+    sg.getNodeByIndexSafe(3).setLemma(".");
+    assertEquals(".", sg.getNodeByIndexSafe(3).word());
+    sg.getNodeByIndexSafe(4).setLemma("prof");
+    newSG = mergeSsurgeon.iterate(sg).first;
+    expected = SemanticGraph.valueOf("[fare-6 aux> potrebbe-5 nsubj> [.prof-3 det> Il-2 nmod> Fotticchia-4] obj> [gag-8 det> una-7] obl> [situazione-11 case> su-9 det> la-10]]", Language.UniversalEnglish);
+    assertEquals(expected, newSG);
+    prof = newSG.getNodeByIndexSafe(3);
+    assertEquals(".prof", prof.word());
+    assertEquals(".prof", prof.lemma());
+  }
+
+
+  /**
+   * Test a basic case of two nodes that should be merged
+   *<br>
+   * The indices should be changed as well
+   */
+  @Test
+  public void readXMLMergeNodesAttributes() {
+    Ssurgeon inst = Ssurgeon.inst();
+
+    // Test the head word being the first word
+    String merge = String.join(newline,
+                               "<ssurgeon-pattern-list>",
+                               "  <ssurgeon-pattern>",
+                               "    <uid>38</uid>",
+                               "    <notes>Merge two nodes that should not have been split</notes>",
+                               "    <semgrex>" + XMLUtils.escapeXML("{word:prof}=source >punct ({}=punct . {} !> {})") + "</semgrex>",
+                               "    <edit-list>mergeNodes source punct -word foo -lemma bar</edit-list>",
+                               "  </ssurgeon-pattern>",
+                               "</ssurgeon-pattern-list>");
+    List<SsurgeonPattern> patterns = inst.readFromString(merge);
+    assertEquals(patterns.size(), 1);
+    SsurgeonPattern mergeSsurgeon = patterns.get(0);
+
+    SemanticGraph sg = SemanticGraph.valueOf("[fare-7 aux> potrebbe-6 nsubj> [prof-3 det> Il-2 punct> .-4 nmod> Fotticchia-5] obj> [gag-9 det> una-8] obl> [situazione-12 case> su-10 det> la-11]]", Language.UniversalEnglish);
+    SemanticGraph newSG = mergeSsurgeon.iterate(sg).first;
+    SemanticGraph expected = SemanticGraph.valueOf("[fare-6 aux> potrebbe-5 nsubj> [foo-3 det> Il-2 nmod> Fotticchia-4] obj> [gag-8 det> una-7] obl> [situazione-11 case> su-9 det> la-10]]", Language.UniversalEnglish);
+    assertEquals(expected, newSG);
+    IndexedWord prof = sg.getNodeByIndexSafe(3);
+    assertNotNull(prof);
+    assertEquals("foo", prof.word());
+    assertEquals("foo", prof.value());
+    assertEquals("bar", prof.lemma());
+  }
+
+  /**
+   * Test a basic case of two nodes that should be merged
+   *<br>
+   * The indices should be changed as well
+   */
+  @Test
+  public void readXMLMergeNodesFailCases() {
+    Ssurgeon inst = Ssurgeon.inst();
+
+    // use "dep" as the dependency so as to be language-agnostic in this test
+    String merge = String.join(newline,
+                               "<ssurgeon-pattern-list>",
+                               "  <ssurgeon-pattern>",
+                               "    <uid>38</uid>",
+                               "    <notes>Merge two nodes that should not have been split</notes>",
+                               "    <semgrex>" + XMLUtils.escapeXML("{word:prof}=source >punct ({}=punct . {} !> {})") + "</semgrex>",
+                               "    <edit-list>mergeNodes source punct</edit-list>",
+                               "  </ssurgeon-pattern>",
+                               "</ssurgeon-pattern-list>");
+    List<SsurgeonPattern> patterns = inst.readFromString(merge);
+    assertEquals(patterns.size(), 1);
+    SsurgeonPattern mergeSsurgeon = patterns.get(0);
+
+    // Add an extra edge from the punct we want to squash to somewhere else
+    // The graph should not be changed
+    SemanticGraph sg = SemanticGraph.valueOf("[fare-7 aux> potrebbe-6 nsubj> [prof-3 det> Il-2 nmod> Fotticchia-5 punct> [.-4 nmod> Fotticchia-5]] obj> [gag-9 det> una-8] obl> [situazione-12 case> su-10 det> la-11]]", Language.UniversalEnglish);
+    SemanticGraph newSG = mergeSsurgeon.iterate(sg).first;
+    SemanticGraph expected = SemanticGraph.valueOf("[fare-7 aux> potrebbe-6 nsubj> [prof-3 det> Il-2 nmod> Fotticchia-5 punct> [.-4 nmod> Fotticchia-5]] obj> [gag-9 det> una-8] obl> [situazione-12 case> su-10 det> la-11]]", Language.UniversalEnglish);
+    assertEquals(expected, newSG);
+
+    sg = SemanticGraph.valueOf("[fare-7 aux> potrebbe-6 nsubj> [prof-3 det> Il-2 nmod> [Fotticchia-5 punct> .-4] punct> .-4] obj> [gag-9 det> una-8] obl> [situazione-12 case> su-10 det> la-11]]", Language.UniversalEnglish);
+    newSG = mergeSsurgeon.iterate(sg).first;
+    expected = SemanticGraph.valueOf("[fare-7 aux> potrebbe-6 nsubj> [prof-3 det> Il-2 nmod> [Fotticchia-5 punct> .-4] punct> .-4] obj> [gag-9 det> una-8] obl> [situazione-12 case> su-10 det> la-11]]", Language.UniversalEnglish);
+    assertEquals(expected, newSG);
+  }
+
   /**
    * The AddDep should update the matches in the SemgrexMatcher.
    * If that isn't done correctly, then moving the words first