Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add a two-word MergeNodes operation.
The MergeNodes operation allows for arbitrary attributes on the new word, or makes a best effort at putting a combined word & lemma on the new combined word if nothing was specified. It would be nice to extend it to handle multiple words at once.
- Loading branch information
1 parent
db0bd45
commit 0660fa9
Showing
4 changed files
with
323 additions
and
5 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
159 changes: 159 additions & 0 deletions
159
src/edu/stanford/nlp/semgraph/semgrex/ssurgeon/MergeNodes.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,159 @@ | ||
package edu.stanford.nlp.semgraph.semgrex.ssurgeon; | ||
|
||
import java.io.StringWriter; | ||
import java.util.ArrayList; | ||
import java.util.List; | ||
import java.util.Map; | ||
import java.util.TreeMap; | ||
|
||
import edu.stanford.nlp.ling.CoreAnnotations; | ||
import edu.stanford.nlp.ling.CoreLabel; | ||
import edu.stanford.nlp.ling.IndexedWord; | ||
import edu.stanford.nlp.semgraph.SemanticGraph; | ||
import edu.stanford.nlp.semgraph.SemanticGraphEdge; | ||
import edu.stanford.nlp.semgraph.semgrex.SemgrexMatcher; | ||
|
||
/** | ||
* Combines two words into one word | ||
*<br> | ||
* This requires one of the nodes to be the head of a phrase of the words, | ||
* and the dependent words can't have any extra edges in or out of that subgraph | ||
*<br> | ||
* The word and lemma will be the combination of the words, squished together. | ||
* Before and after will be updated to use the before and after of the endpoints of the subgraph | ||
* | ||
* @author John Bauer | ||
*/ | ||
public class MergeNodes extends SsurgeonEdit { | ||
public static final String LABEL = "mergeNodes"; | ||
final String name1; | ||
final String name2; | ||
final Map<String, String> attributes; | ||
|
||
public MergeNodes(String name1, String name2, Map<String, String> attributes) { | ||
this.name1 = name1; | ||
this.name2 = name2; | ||
this.attributes = new TreeMap<>(attributes); | ||
} | ||
|
||
/** | ||
* Emits a parseable instruction string. | ||
*/ | ||
@Override | ||
public String toEditString() { | ||
StringWriter buf = new StringWriter(); | ||
buf.write(LABEL); buf.write("\t"); | ||
buf.write(name1); buf.write("\t"); | ||
buf.write(name2); | ||
|
||
// TODO: some attributes might need to be escaped! | ||
for (String key : attributes.keySet()) { | ||
buf.write("\t-"); | ||
buf.write(key); | ||
buf.write(" "); | ||
buf.write(attributes.get(key)); | ||
} | ||
|
||
return buf.toString(); | ||
} | ||
|
||
/** | ||
* If the two named nodes are next to each other, and the edges of | ||
* the graph allow for it, squish the two words into one word | ||
*/ | ||
@Override | ||
public boolean evaluate(SemanticGraph sg, SemgrexMatcher sm) { | ||
IndexedWord node1 = sm.getNode(name1); | ||
IndexedWord node2 = sm.getNode(name2); | ||
|
||
if (node1 == null || node2 == null) { | ||
return false; | ||
} | ||
|
||
List<SemanticGraphEdge> n1_to_n2 = sg.getAllEdges(node1, node2); | ||
List<SemanticGraphEdge> n2_to_n1 = sg.getAllEdges(node2, node1); | ||
if (n1_to_n2.size() == 0 && n2_to_n1.size() == 0) { | ||
return false; | ||
} | ||
|
||
// TODO: what about the case where the dep is or has copies? | ||
final IndexedWord head; | ||
final IndexedWord dep; | ||
|
||
if (n1_to_n2.size() > 0) { | ||
head = node1; | ||
dep = node2; | ||
} else { | ||
head = node2; | ||
dep = node1; | ||
} | ||
|
||
// If the dep has any edges that aren't between dep & head, abort | ||
// TODO: we could probably make it adjust edges with "dep" as source, instead | ||
for (SemanticGraphEdge e : sg.outgoingEdgeIterable(dep)) { | ||
if (e.getTarget() != head) { | ||
return false; | ||
} | ||
} | ||
for (SemanticGraphEdge e : sg.incomingEdgeIterable(dep)) { | ||
if (e.getSource() != head) { | ||
return false; | ||
} | ||
} | ||
|
||
IndexedWord left; | ||
IndexedWord right; | ||
if (node1.index() < node2.index()) { | ||
left = node1; | ||
right = node2; | ||
} else { | ||
left = node2; | ||
right = node1; | ||
} | ||
|
||
CoreLabel newLabel = AddDep.fromCheapStrings(attributes); | ||
if (newLabel.word() == null) { | ||
String newWord = left.word() + right.word(); | ||
newLabel.setWord(newWord); | ||
} | ||
if (newLabel.value() == null) { | ||
newLabel.setValue(newLabel.word()); | ||
} | ||
if (newLabel.lemma() == null) { | ||
String newLemma = left.lemma() != null && right.lemma() != null ? left.lemma() + right.lemma() : null; | ||
newLabel.setLemma(newLemma); | ||
} | ||
// after() and before() return "" if null, so we need to use the CoreAnnotations directly | ||
if (newLabel.get(CoreAnnotations.AfterAnnotation.class) == null) { | ||
newLabel.setAfter(right.after()); | ||
} | ||
if (newLabel.get(CoreAnnotations.BeforeAnnotation.class) == null) { | ||
newLabel.setBefore(right.before()); | ||
} | ||
|
||
for (IndexedWord vertex : sg.vertexSet()) { | ||
if (vertex.index() == head.index()) { | ||
for (Class key : newLabel.keySet()) { | ||
Object value = newLabel.get(key); | ||
vertex.set(key, value); | ||
} | ||
} | ||
} | ||
|
||
// copy the list so that deletion doesn't hurt the iterator | ||
// TODO: super fancy would be implementing iterator.remove() | ||
// on the Set returned by the SemanticGraph | ||
for (IndexedWord vertex : sg.vertexListSorted()) { | ||
if (vertex.index() == dep.index()) { | ||
sg.removeVertex(vertex); | ||
} | ||
} | ||
|
||
// reindex everyone | ||
AddDep.moveNodes(sg, sm, x -> (x >= dep.index()), x -> x-1, false); | ||
|
||
return true; | ||
} | ||
|
||
} | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters