From 819a6e0864a8ac3f796d4b5aaf5b49734745753d Mon Sep 17 00:00:00 2001 From: John Bauer Date: Wed, 26 Feb 2025 12:53:20 -0800 Subject: [PATCH 1/3] Add a node containment option to semgrex that works on CoreAnnotations which are Maps. Currently the syntax is @, subject to change, e.g. morphofeatures@foo=bar. In this expression, bar can be a regex, but foo and morphofeatures cannot. It might be worth adding regex capabilities for both of those. Also, !@ would be a useful addition. This checks at Semgrex compile time (not Java compile time) that the annotation used for key/value is actually a Map. Has a test that the error checking and a simple search both work. --- .../nlp/semgraph/semgrex/NodeAttributes.java | 16 +++ .../nlp/semgraph/semgrex/NodePattern.java | 101 ++++++++++++++--- .../nlp/semgraph/semgrex/SemgrexParser.java | 104 ++++++++++++------ .../nlp/semgraph/semgrex/SemgrexParser.jj | 25 +++-- .../nlp/semgraph/semgrex/SemgrexTest.java | 27 +++++ 5 files changed, 215 insertions(+), 58 deletions(-) diff --git a/src/edu/stanford/nlp/semgraph/semgrex/NodeAttributes.java b/src/edu/stanford/nlp/semgraph/semgrex/NodeAttributes.java index 6891f7556c..542e2219dc 100644 --- a/src/edu/stanford/nlp/semgraph/semgrex/NodeAttributes.java +++ b/src/edu/stanford/nlp/semgraph/semgrex/NodeAttributes.java @@ -26,12 +26,20 @@ public class NodeAttributes { // String, String, Boolean: key, value, negated private List> attributes; private Set positiveAttributes; + // Some annotations, especially morpho features (CoreAnnotations.CoNLLUFeats) + // are represented by Maps. In some cases it will be easier to search + // for individual elements of that map rather than turn the map into a string + // and search on its contents that way. This is especially true since there + // is no guarantee the map will be in a consistent order. 
+ // String, String, String: node attribute for a map (such as CoNLLUFeats), key in that map, value to match + private List> contains; public NodeAttributes() { root = false; empty = false; attributes = new ArrayList<>(); positiveAttributes = new HashSet<>(); + contains = new ArrayList<>(); } public void setRoot(boolean root) { @@ -60,7 +68,15 @@ public void setAttribute(String key, String value, boolean negated) { attributes.add(new Triple(key, value, negated)); } + public void addContains(String annotation, String key, String value) { + contains.add(new Triple(annotation, key, value)); + } + public List> attributes() { return Collections.unmodifiableList(attributes); } + + public List> contains() { + return Collections.unmodifiableList(contains); + } } diff --git a/src/edu/stanford/nlp/semgraph/semgrex/NodePattern.java b/src/edu/stanford/nlp/semgraph/semgrex/NodePattern.java index 5f0eab72a1..8a841fc9a8 100644 --- a/src/edu/stanford/nlp/semgraph/semgrex/NodePattern.java +++ b/src/edu/stanford/nlp/semgraph/semgrex/NodePattern.java @@ -8,6 +8,7 @@ import java.util.regex.Matcher; import java.util.regex.Pattern; +import edu.stanford.nlp.ling.AnnotationLookup; import edu.stanford.nlp.ling.IndexedWord; import edu.stanford.nlp.semgraph.SemanticGraph; import edu.stanford.nlp.semgraph.SemanticGraphEdge; @@ -32,6 +33,11 @@ public class NodePattern extends SemgrexPattern { * Otherwise, the type will be a Pattern, and you must use Pattern.matches(). 
*/ private final List attributes; + /** + * Attributes which represent Maps (eg CoNLLUFeats) + * and only partial matches are necessary + */ + private final List> partialAttributes; private final boolean isRoot; private final boolean isLink; private final boolean isEmpty; @@ -58,6 +64,9 @@ public NodePattern(GraphRelation r, boolean negDesc, // order the attributes so that the pattern stays the same when // printing a compiled pattern this.attributes = new ArrayList<>(); + // same with partial attributes + this.partialAttributes = new ArrayList<>(); + descString = "{"; for (Triple entry : attrs.attributes()) { if (!descString.equals("{")) @@ -70,23 +79,7 @@ public NodePattern(GraphRelation r, boolean negDesc, if (value.equals("__")) { attributes.add(new Attribute(key, true, true, negated)); } else if (value.matches("/.*/")) { - boolean isRegexp = false; - for (int i = 1; i < value.length() - 1; ++i) { - char chr = value.charAt(i); - if ( !( (chr >= 'A' && chr <= 'Z') || (chr >= 'a' && chr <= 'z') || (chr >= '0' && chr <= '9') ) ) { - isRegexp = true; - break; - } - } - String patternContent = value.substring(1, value.length() - 1); - if (isRegexp) { - attributes.add(new Attribute(key, - Pattern.compile(patternContent), - Pattern.compile(patternContent, Pattern.CASE_INSENSITIVE|Pattern.UNICODE_CASE), - negated)); - } else { - attributes.add(new Attribute(key, patternContent, patternContent, negated)); - } + attributes.add(buildRegexAttribute(key, value, negated)); } else { // raw description attributes.add(new Attribute(key, value, value, negated)); } @@ -98,6 +91,33 @@ public NodePattern(GraphRelation r, boolean negDesc, } } + for (Triple entry : attrs.contains()) { + String annotation = entry.first(); + String key = entry.second(); + String value = entry.third(); + + Class clazz = AnnotationLookup.getValueType(AnnotationLookup.toCoreKey(annotation)); + boolean isMap = clazz != null && Map.class.isAssignableFrom(clazz); + if (!isMap) { + throw new 
SemgrexParseException("Cannot process a single key/value from annotation " + annotation + " as it is not a Map"); + } + + final Attribute attr; + // Add the attributes for this key + if (value.equals("__")) { + attr = new Attribute(key, true, true, false); + } else if (value.matches("/.*/")) { + attr = buildRegexAttribute(key, value, false); + } else { // raw description + attr = new Attribute(key, value, value, false); + } + partialAttributes.add(new Pair<>(annotation, attr)); + + if (!descString.equals("{")) + descString += ";"; + descString += (annotation + "@" + key + "=" + value); + } + if (attrs.root()) { if (!descString.equals("{")) descString += ";"; @@ -118,6 +138,30 @@ public NodePattern(GraphRelation r, boolean negDesc, this.variableGroups = Collections.unmodifiableList(variableGroups); } + /** + * Tests the value to see if it's really a regex, or just a string wrapped in regex. + * Return an Attribute which matches this expression + */ + private Attribute buildRegexAttribute(String key, String value, boolean negated) { + boolean isRegexp = false; + for (int i = 1; i < value.length() - 1; ++i) { + char chr = value.charAt(i); + if ( !( (chr >= 'A' && chr <= 'Z') || (chr >= 'a' && chr <= 'z') || (chr >= '0' && chr <= '9') ) ) { + isRegexp = true; + break; + } + } + String patternContent = value.substring(1, value.length() - 1); + if (isRegexp) { + return new Attribute(key, + Pattern.compile(patternContent), + Pattern.compile(patternContent, Pattern.CASE_INSENSITIVE|Pattern.UNICODE_CASE), + negated); + } else { + return new Attribute(key, patternContent, patternContent, negated); + } + } + private boolean checkMatch(Attribute attr, boolean ignoreCase, String nodeValue) { if (nodeValue == null) { // treat non-existent attributes has having matched a negated expression @@ -189,6 +233,29 @@ public boolean nodeAttrMatch(IndexedWord node, final SemanticGraph sg, boolean i return negDesc; } } + for (Pair partialAttribute : partialAttributes) { + String annotation 
= partialAttribute.first(); + Attribute attr = partialAttribute.second(); + + Class clazz = Env.lookupAnnotationKey(env, annotation); + Object rawmap = node.get(clazz); + // if the map is null, it can't possibly match... + if (rawmap == null) { + return negDesc; + } + if (!(rawmap instanceof Map)) + throw new RuntimeException("Can only use partial attributes with Maps... this should have been checked at creation time!"); + Map map = (Map) rawmap; + + // TODO: allow for regex match on the keys? + Object value = map.get(attr.key); + final String nodeValue = (value == null) ? null : value.toString(); + boolean matches = checkMatch(attr, ignoreCase, nodeValue); + if (!matches) { + return negDesc; + } + } + // System.out.println("matches"); // System.out.println(""); return !negDesc; diff --git a/src/edu/stanford/nlp/semgraph/semgrex/SemgrexParser.java b/src/edu/stanford/nlp/semgraph/semgrex/SemgrexParser.java index 110925af97..cd5e4b98ac 100644 --- a/src/edu/stanford/nlp/semgraph/semgrex/SemgrexParser.java +++ b/src/edu/stanford/nlp/semgraph/semgrex/SemgrexParser.java @@ -526,43 +526,79 @@ final public SemgrexPattern Root() throws ParseException {// Root pattern for th } final public void AddAttribute(NodeAttributes attributes) throws ParseException {Token attr = null; + Token key = null; Token value = null; Token attrType = null; switch ((jj_ntk==-1)?jj_ntk_f():jj_ntk) { case IDENTIFIER:{ attr = jj_consume_token(IDENTIFIER); switch ((jj_ntk==-1)?jj_ntk_f():jj_ntk) { - case 10:{ - attrType = jj_consume_token(10); - break; - } + case 10: case 22:{ - attrType = jj_consume_token(22); - break; + switch ((jj_ntk==-1)?jj_ntk_f():jj_ntk) { + case 10:{ + attrType = jj_consume_token(10); + break; + } + case 22:{ + attrType = jj_consume_token(22); + break; + } + default: + jj_la1[23] = jj_gen; + jj_consume_token(-1); + throw new ParseException(); } - default: - jj_la1[23] = jj_gen; - jj_consume_token(-1); - throw new ParseException(); - } - switch 
((jj_ntk==-1)?jj_ntk_f():jj_ntk) { - case IDENTIFIER:{ - value = jj_consume_token(IDENTIFIER); + switch ((jj_ntk==-1)?jj_ntk_f():jj_ntk) { + case IDENTIFIER:{ + value = jj_consume_token(IDENTIFIER); + break; + } + case REGEX:{ + value = jj_consume_token(REGEX); + break; + } + default: + jj_la1[24] = jj_gen; + jj_consume_token(-1); + throw new ParseException(); + } +if (attr != null && value != null) { + boolean negated = attrType.image.equals("!:"); + attributes.setAttribute(attr.image, value.image, negated); + } break; } - case REGEX:{ - value = jj_consume_token(REGEX); + case ALIGNRELN:{ + attrType = jj_consume_token(ALIGNRELN); + key = jj_consume_token(IDENTIFIER); + jj_consume_token(21); + switch ((jj_ntk==-1)?jj_ntk_f():jj_ntk) { + case IDENTIFIER:{ + value = jj_consume_token(IDENTIFIER); + break; + } + case REGEX:{ + value = jj_consume_token(REGEX); + break; + } + default: + jj_la1[25] = jj_gen; + jj_consume_token(-1); + throw new ParseException(); + } +if (attr == null || key == null || value == null) { + {if (true) throw new SemgrexParseException("null while parsing semgrex expression: attr=" + attr + + " key=" + key + " value=" + value);} + } + attributes.addContains(attr.image, key.image, value.image); break; } default: - jj_la1[24] = jj_gen; + jj_la1[26] = jj_gen; jj_consume_token(-1); throw new ParseException(); } -if (attr != null && value != null) { - boolean negated = attrType.image.equals("!:"); - attributes.setAttribute(attr.image, value.image, negated); - } break; } case ROOT:{ @@ -576,7 +612,7 @@ final public SemgrexPattern Root() throws ParseException {// Root pattern for th break; } default: - jj_la1[25] = jj_gen; + jj_la1[27] = jj_gen; jj_consume_token(-1); throw new ParseException(); } @@ -600,7 +636,7 @@ final public SemgrexPattern Root() throws ParseException {// Root pattern for th break; } default: - jj_la1[26] = jj_gen; + jj_la1[28] = jj_gen; break label_6; } jj_consume_token(24); @@ -609,7 +645,7 @@ final public SemgrexPattern Root() 
throws ParseException {// Root pattern for th break; } default: - jj_la1[27] = jj_gen; + jj_la1[29] = jj_gen; ; } jj_consume_token(25); @@ -629,7 +665,7 @@ final public SemgrexPattern Root() throws ParseException {// Root pattern for th break; } default: - jj_la1[28] = jj_gen; + jj_la1[30] = jj_gen; ; } pat = new NodePattern(r, underNodeNegation, attributes, link, name != null ? name.image : null); @@ -646,13 +682,13 @@ final public SemgrexPattern Root() throws ParseException {// Root pattern for th public Token jj_nt; private int jj_ntk; private int jj_gen; - final private int[] jj_la1 = new int[29]; + final private int[] jj_la1 = new int[31]; static private int[] jj_la1_0; static { jj_la1_init_0(); } private static void jj_la1_init_0() { - jj_la1_0 = new int[] {0x400,0x828808,0x3801c,0x3801c,0x828800,0x2000,0x3c01c,0x4000,0x3801c,0x2001c,0x80000,0x10,0x110,0x110,0x100000,0x200000,0x1c,0x828800,0x2000,0x82c000,0x4000,0x828000,0x820000,0x400400,0x110,0xd0,0x1000000,0xd0,0x200000,}; + jj_la1_0 = new int[] {0x400,0x828808,0x3801c,0x3801c,0x828800,0x2000,0x3c01c,0x4000,0x3801c,0x2001c,0x80000,0x10,0x110,0x110,0x100000,0x200000,0x1c,0x828800,0x2000,0x82c000,0x4000,0x828000,0x820000,0x400400,0x110,0x110,0x400408,0xd0,0x1000000,0xd0,0x200000,}; } /** Constructor with InputStream. */ @@ -666,7 +702,7 @@ public SemgrexParser(java.io.InputStream stream, String encoding) { token = new Token(); jj_ntk = -1; jj_gen = 0; - for (int i = 0; i < 29; i++) jj_la1[i] = -1; + for (int i = 0; i < 31; i++) jj_la1[i] = -1; } /** Reinitialise. */ @@ -680,7 +716,7 @@ public void ReInit(java.io.InputStream stream, String encoding) { token = new Token(); jj_ntk = -1; jj_gen = 0; - for (int i = 0; i < 29; i++) jj_la1[i] = -1; + for (int i = 0; i < 31; i++) jj_la1[i] = -1; } /** Constructor. 
*/ @@ -690,7 +726,7 @@ public SemgrexParser(java.io.Reader stream) { token = new Token(); jj_ntk = -1; jj_gen = 0; - for (int i = 0; i < 29; i++) jj_la1[i] = -1; + for (int i = 0; i < 31; i++) jj_la1[i] = -1; } /** Reinitialise. */ @@ -708,7 +744,7 @@ public void ReInit(java.io.Reader stream) { token = new Token(); jj_ntk = -1; jj_gen = 0; - for (int i = 0; i < 29; i++) jj_la1[i] = -1; + for (int i = 0; i < 31; i++) jj_la1[i] = -1; } /** Constructor with generated Token Manager. */ @@ -717,7 +753,7 @@ public SemgrexParser(SemgrexParserTokenManager tm) { token = new Token(); jj_ntk = -1; jj_gen = 0; - for (int i = 0; i < 29; i++) jj_la1[i] = -1; + for (int i = 0; i < 31; i++) jj_la1[i] = -1; } /** Reinitialise. */ @@ -726,7 +762,7 @@ public void ReInit(SemgrexParserTokenManager tm) { token = new Token(); jj_ntk = -1; jj_gen = 0; - for (int i = 0; i < 29; i++) jj_la1[i] = -1; + for (int i = 0; i < 31; i++) jj_la1[i] = -1; } private Token jj_consume_token(int kind) throws ParseException { @@ -782,7 +818,7 @@ public ParseException generateParseException() { la1tokens[jj_kind] = true; jj_kind = -1; } - for (int i = 0; i < 29; i++) { + for (int i = 0; i < 31; i++) { if (jj_la1[i] == jj_gen) { for (int j = 0; j < 32; j++) { if ((jj_la1_0[i] & (1< (attrType = ":" | attrType = "!:") (value = | value = ) ) - { - if (attr != null && value != null) { - boolean negated = attrType.image.equals("!:"); - attributes.setAttribute(attr.image, value.image, negated); - } - }) + (attr = + (( (attrType = ":" | attrType = "!:") (value = | value = ) { + if (attr != null && value != null) { + boolean negated = attrType.image.equals("!:"); + attributes.setAttribute(attr.image, value.image, negated); + } + }) + | + (attrType = "@") (key = ) "=" (value = | value = ) + { + if (attr == null || key == null || value == null) { + throw new SemgrexParseException("null while parsing semgrex expression: attr=" + attr + + " key=" + key + " value=" + value); + } + attributes.addContains(attr.image, 
key.image, value.image); + }) + ) | ( attr = { attributes.setRoot(true); } ) | diff --git a/test/src/edu/stanford/nlp/semgraph/semgrex/SemgrexTest.java b/test/src/edu/stanford/nlp/semgraph/semgrex/SemgrexTest.java index 3c2b785f34..6c4b7e7502 100644 --- a/test/src/edu/stanford/nlp/semgraph/semgrex/SemgrexTest.java +++ b/test/src/edu/stanford/nlp/semgraph/semgrex/SemgrexTest.java @@ -7,10 +7,12 @@ import java.util.Map; import java.util.Set; +import edu.stanford.nlp.ling.CoreAnnotations; import edu.stanford.nlp.ling.IndexedWord; import edu.stanford.nlp.stats.IntCounter; import edu.stanford.nlp.trees.UniversalEnglishGrammaticalRelations; import edu.stanford.nlp.trees.Tree; +import edu.stanford.nlp.trees.ud.CoNLLUFeatures; import edu.stanford.nlp.semgraph.SemanticGraph; import edu.stanford.nlp.semgraph.SemanticGraphEdge; import edu.stanford.nlp.semgraph.SemanticGraphFactory; @@ -222,6 +224,31 @@ public void testNegatedRegex() { "ate", "blueberry"); } + public void testBrokenContainsExpression() { + try { + // word is a String, not a Map, so this should throw a parse exception + SemgrexPattern pattern = SemgrexPattern.compile("{word@foo=bar}"); + throw new AssertionError("Expected a SemgrexParseException"); + } catch (SemgrexParseException e) { + // good + } + } + + public void testContainsExpression() { + // morphofeatures is a Map, so this should work + SemgrexPattern pattern = SemgrexPattern.compile("{morphofeatures@foo=bar}"); + SemanticGraph graph = makeComplicatedGraph(); + Set vertices = graph.vertexSet(); + for (IndexedWord iw : vertices) { + if (iw.value().equals("D") || iw.value().equals("F")) { + CoNLLUFeatures feats = new CoNLLUFeatures(); + feats.put("foo", "bar"); + iw.set(CoreAnnotations.CoNLLUFeats.class, feats); + } + } + runTest(pattern, graph, "D", "F"); + } + public void testReferencedRegex() { runTest("{word:/Bill/}", "[ate subj>Bill obj>[bill det>the]]", "Bill"); From 29fa00abe0b79f4732ef23708f0d96343d84633a Mon Sep 17 00:00:00 2001 From: John 
Bauer Date: Thu, 27 Feb 2025 08:08:18 -0800 Subject: [PATCH 2/3] Add a negative containment to semgrex to match the containment option Need to negate the = in the NodePattern output --- .../nlp/semgraph/semgrex/NodeAttributes.java | 11 +-- .../nlp/semgraph/semgrex/NodePattern.java | 31 ++++---- .../nlp/semgraph/semgrex/SemgrexParser.java | 75 +++++++++++-------- .../nlp/semgraph/semgrex/SemgrexParser.jj | 5 +- .../semgrex/SemgrexParserConstants.java | 1 + .../semgrex/SemgrexParserTokenManager.java | 18 +++-- .../nlp/semgraph/semgrex/SemgrexTest.java | 23 ++++++ 7 files changed, 107 insertions(+), 57 deletions(-) diff --git a/src/edu/stanford/nlp/semgraph/semgrex/NodeAttributes.java b/src/edu/stanford/nlp/semgraph/semgrex/NodeAttributes.java index 542e2219dc..ae9efbd802 100644 --- a/src/edu/stanford/nlp/semgraph/semgrex/NodeAttributes.java +++ b/src/edu/stanford/nlp/semgraph/semgrex/NodeAttributes.java @@ -6,6 +6,7 @@ import java.util.List; import java.util.Set; +import edu.stanford.nlp.util.Quadruple; import edu.stanford.nlp.util.Triple; /** @@ -31,8 +32,8 @@ public class NodeAttributes { // for individual elements of that map rather than turn the map into a string // and search on its contents that way. This is especially true since there // is no guarantee the map will be in a consistent order. - // String, String, String: node attribute for a map (such as CoNLLUFeats), key in that map, value to match - private List> contains; + // String, String, String, Boolean: node attribute for a map (such as CoNLLUFeats), key in that map, value to match, negated? 
+ private List> contains; public NodeAttributes() { root = false; @@ -68,15 +69,15 @@ public void setAttribute(String key, String value, boolean negated) { attributes.add(new Triple(key, value, negated)); } - public void addContains(String annotation, String key, String value) { - contains.add(new Triple(annotation, key, value)); + public void addContains(String annotation, String key, String value, Boolean negated) { + contains.add(new Quadruple(annotation, key, value, negated)); } public List> attributes() { return Collections.unmodifiableList(attributes); } - public List> contains() { + public List> contains() { return Collections.unmodifiableList(contains); } } diff --git a/src/edu/stanford/nlp/semgraph/semgrex/NodePattern.java b/src/edu/stanford/nlp/semgraph/semgrex/NodePattern.java index 8a841fc9a8..a0277ce15b 100644 --- a/src/edu/stanford/nlp/semgraph/semgrex/NodePattern.java +++ b/src/edu/stanford/nlp/semgraph/semgrex/NodePattern.java @@ -13,6 +13,7 @@ import edu.stanford.nlp.semgraph.SemanticGraph; import edu.stanford.nlp.semgraph.SemanticGraphEdge; import edu.stanford.nlp.util.Pair; +import edu.stanford.nlp.util.Quadruple; import edu.stanford.nlp.util.Triple; import edu.stanford.nlp.util.logging.Redwood; @@ -91,10 +92,11 @@ public NodePattern(GraphRelation r, boolean negDesc, } } - for (Triple entry : attrs.contains()) { + for (Quadruple entry : attrs.contains()) { String annotation = entry.first(); String key = entry.second(); String value = entry.third(); + boolean negated = entry.fourth(); Class clazz = AnnotationLookup.getValueType(AnnotationLookup.toCoreKey(annotation)); boolean isMap = clazz != null && Map.class.isAssignableFrom(clazz); @@ -105,17 +107,18 @@ public NodePattern(GraphRelation r, boolean negDesc, final Attribute attr; // Add the attributes for this key if (value.equals("__")) { - attr = new Attribute(key, true, true, false); + attr = new Attribute(key, true, true, negated); } else if (value.matches("/.*/")) { - attr = 
buildRegexAttribute(key, value, false); + attr = buildRegexAttribute(key, value, negated); } else { // raw description - attr = new Attribute(key, value, value, false); + attr = new Attribute(key, value, value, negated); } partialAttributes.add(new Pair<>(annotation, attr)); if (!descString.equals("{")) descString += ";"; - descString += (annotation + "@" + key + "=" + value); + String separator = negated ? "!=" : "="; + descString += (annotation + "@" + key + separator + value); } if (attrs.root()) { @@ -239,17 +242,19 @@ public boolean nodeAttrMatch(IndexedWord node, final SemanticGraph sg, boolean i Class clazz = Env.lookupAnnotationKey(env, annotation); Object rawmap = node.get(clazz); - // if the map is null, it can't possibly match... + final String nodeValue; if (rawmap == null) { - return negDesc; + nodeValue = null; + } else { + if (!(rawmap instanceof Map)) + throw new RuntimeException("Can only use partial attributes with Maps... this should have been checked at creation time!"); + Map map = (Map) rawmap; + + // TODO: allow for regex match on the keys? + Object value = map.get(attr.key); + nodeValue = (value == null) ? null : value.toString(); } - if (!(rawmap instanceof Map)) - throw new RuntimeException("Can only use partial attributes with Maps... this should have been checked at creation time!"); - Map map = (Map) rawmap; - // TODO: allow for regex match on the keys? - Object value = map.get(attr.key); - final String nodeValue = (value == null) ? 
null : value.toString(); boolean matches = checkMatch(attr, ignoreCase, nodeValue); if (!matches) { return negDesc; diff --git a/src/edu/stanford/nlp/semgraph/semgrex/SemgrexParser.java b/src/edu/stanford/nlp/semgraph/semgrex/SemgrexParser.java index cd5e4b98ac..0b7e5e8741 100644 --- a/src/edu/stanford/nlp/semgraph/semgrex/SemgrexParser.java +++ b/src/edu/stanford/nlp/semgraph/semgrex/SemgrexParser.java @@ -65,7 +65,7 @@ final public SemgrexPattern Root() throws ParseException {// Root pattern for th case 11: case 15: case 17: - case 23:{ + case 24:{ node = SubNode(GraphRelation.ROOT); children.add(node); label_1: @@ -135,7 +135,7 @@ final public SemgrexPattern Root() throws ParseException {// Root pattern for th } case 15: case 17: - case 23:{ + case 24:{ result = ModNode(r); switch ((jj_ntk==-1)?jj_ntk_f():jj_ntk) { case RELATION: @@ -397,7 +397,7 @@ final public SemgrexPattern Root() throws ParseException {// Root pattern for th switch ((jj_ntk==-1)?jj_ntk_f():jj_ntk) { case 15: case 17: - case 23:{ + case 24:{ node = ModNode(reln); break; } @@ -454,7 +454,7 @@ final public SemgrexPattern Root() throws ParseException {// Root pattern for th case 14: case 15: case 17: - case 23:{ + case 24:{ ; break; } @@ -485,7 +485,7 @@ final public SemgrexPattern Root() throws ParseException {// Root pattern for th boolean startUnderNeg; switch ((jj_ntk==-1)?jj_ntk_f():jj_ntk) { case 17: - case 23:{ + case 24:{ child = Child(r); break; } @@ -512,7 +512,7 @@ final public SemgrexPattern Root() throws ParseException {// Root pattern for th child = NodeDisj(r); break; } - case 23:{ + case 24:{ child = Description(r); break; } @@ -569,8 +569,22 @@ final public SemgrexPattern Root() throws ParseException {// Root pattern for th } break; } - case ALIGNRELN:{ - attrType = jj_consume_token(ALIGNRELN); + case ALIGNRELN: + case 23:{ + switch ((jj_ntk==-1)?jj_ntk_f():jj_ntk) { + case ALIGNRELN:{ + attrType = jj_consume_token(ALIGNRELN); + break; + } + case 23:{ + attrType = 
jj_consume_token(23); + break; + } + default: + jj_la1[25] = jj_gen; + jj_consume_token(-1); + throw new ParseException(); + } key = jj_consume_token(IDENTIFIER); jj_consume_token(21); switch ((jj_ntk==-1)?jj_ntk_f():jj_ntk) { @@ -583,7 +597,7 @@ final public SemgrexPattern Root() throws ParseException {// Root pattern for th break; } default: - jj_la1[25] = jj_gen; + jj_la1[26] = jj_gen; jj_consume_token(-1); throw new ParseException(); } @@ -591,11 +605,12 @@ final public SemgrexPattern Root() throws ParseException {// Root pattern for th {if (true) throw new SemgrexParseException("null while parsing semgrex expression: attr=" + attr + " key=" + key + " value=" + value);} } - attributes.addContains(attr.image, key.image, value.image); + boolean negated = attrType.image.equals("!@"); + attributes.addContains(attr.image, key.image, value.image, negated); break; } default: - jj_la1[26] = jj_gen; + jj_la1[27] = jj_gen; jj_consume_token(-1); throw new ParseException(); } @@ -612,7 +627,7 @@ final public SemgrexPattern Root() throws ParseException {// Root pattern for th break; } default: - jj_la1[27] = jj_gen; + jj_la1[28] = jj_gen; jj_consume_token(-1); throw new ParseException(); } @@ -622,7 +637,7 @@ final public SemgrexPattern Root() throws ParseException {// Root pattern for th boolean link = false; NodeAttributes attributes = new NodeAttributes(); NodePattern pat; - jj_consume_token(23); + jj_consume_token(24); switch ((jj_ntk==-1)?jj_ntk_f():jj_ntk) { case IDENTIFIER: case EMPTY: @@ -631,24 +646,24 @@ final public SemgrexPattern Root() throws ParseException {// Root pattern for th label_6: while (true) { switch ((jj_ntk==-1)?jj_ntk_f():jj_ntk) { - case 24:{ + case 25:{ ; break; } default: - jj_la1[28] = jj_gen; + jj_la1[29] = jj_gen; break label_6; } - jj_consume_token(24); + jj_consume_token(25); AddAttribute(attributes); } break; } default: - jj_la1[29] = jj_gen; + jj_la1[30] = jj_gen; ; } - jj_consume_token(25); + jj_consume_token(26); switch 
((jj_ntk==-1)?jj_ntk_f():jj_ntk) { case 21:{ jj_consume_token(21); @@ -665,7 +680,7 @@ final public SemgrexPattern Root() throws ParseException {// Root pattern for th break; } default: - jj_la1[30] = jj_gen; + jj_la1[31] = jj_gen; ; } pat = new NodePattern(r, underNodeNegation, attributes, link, name != null ? name.image : null); @@ -682,13 +697,13 @@ final public SemgrexPattern Root() throws ParseException {// Root pattern for th public Token jj_nt; private int jj_ntk; private int jj_gen; - final private int[] jj_la1 = new int[31]; + final private int[] jj_la1 = new int[32]; static private int[] jj_la1_0; static { jj_la1_init_0(); } private static void jj_la1_init_0() { - jj_la1_0 = new int[] {0x400,0x828808,0x3801c,0x3801c,0x828800,0x2000,0x3c01c,0x4000,0x3801c,0x2001c,0x80000,0x10,0x110,0x110,0x100000,0x200000,0x1c,0x828800,0x2000,0x82c000,0x4000,0x828000,0x820000,0x400400,0x110,0x110,0x400408,0xd0,0x1000000,0xd0,0x200000,}; + jj_la1_0 = new int[] {0x400,0x1028808,0x3801c,0x3801c,0x1028800,0x2000,0x3c01c,0x4000,0x3801c,0x2001c,0x80000,0x10,0x110,0x110,0x100000,0x200000,0x1c,0x1028800,0x2000,0x102c000,0x4000,0x1028000,0x1020000,0x400400,0x110,0x800008,0x110,0xc00408,0xd0,0x2000000,0xd0,0x200000,}; } /** Constructor with InputStream. */ @@ -702,7 +717,7 @@ public SemgrexParser(java.io.InputStream stream, String encoding) { token = new Token(); jj_ntk = -1; jj_gen = 0; - for (int i = 0; i < 31; i++) jj_la1[i] = -1; + for (int i = 0; i < 32; i++) jj_la1[i] = -1; } /** Reinitialise. */ @@ -716,7 +731,7 @@ public void ReInit(java.io.InputStream stream, String encoding) { token = new Token(); jj_ntk = -1; jj_gen = 0; - for (int i = 0; i < 31; i++) jj_la1[i] = -1; + for (int i = 0; i < 32; i++) jj_la1[i] = -1; } /** Constructor. */ @@ -726,7 +741,7 @@ public SemgrexParser(java.io.Reader stream) { token = new Token(); jj_ntk = -1; jj_gen = 0; - for (int i = 0; i < 31; i++) jj_la1[i] = -1; + for (int i = 0; i < 32; i++) jj_la1[i] = -1; } /** Reinitialise. 
*/ @@ -744,7 +759,7 @@ public void ReInit(java.io.Reader stream) { token = new Token(); jj_ntk = -1; jj_gen = 0; - for (int i = 0; i < 31; i++) jj_la1[i] = -1; + for (int i = 0; i < 32; i++) jj_la1[i] = -1; } /** Constructor with generated Token Manager. */ @@ -753,7 +768,7 @@ public SemgrexParser(SemgrexParserTokenManager tm) { token = new Token(); jj_ntk = -1; jj_gen = 0; - for (int i = 0; i < 31; i++) jj_la1[i] = -1; + for (int i = 0; i < 32; i++) jj_la1[i] = -1; } /** Reinitialise. */ @@ -762,7 +777,7 @@ public void ReInit(SemgrexParserTokenManager tm) { token = new Token(); jj_ntk = -1; jj_gen = 0; - for (int i = 0; i < 31; i++) jj_la1[i] = -1; + for (int i = 0; i < 32; i++) jj_la1[i] = -1; } private Token jj_consume_token(int kind) throws ParseException { @@ -813,12 +828,12 @@ private int jj_ntk_f() { /** Generate ParseException. */ public ParseException generateParseException() { jj_expentries.clear(); - boolean[] la1tokens = new boolean[26]; + boolean[] la1tokens = new boolean[27]; if (jj_kind >= 0) { la1tokens[jj_kind] = true; jj_kind = -1; } - for (int i = 0; i < 31; i++) { + for (int i = 0; i < 32; i++) { if (jj_la1[i] == jj_gen) { for (int j = 0; j < 32; j++) { if ((jj_la1_0[i] & (1<) "=" (value = | value = ) + (attrType = "@" | attrType = "!@") (key = ) "=" (value = | value = ) { if (attr == null || key == null || value == null) { throw new SemgrexParseException("null while parsing semgrex expression: attr=" + attr + " key=" + key + " value=" + value); } - attributes.addContains(attr.image, key.image, value.image); + boolean negated = attrType.image.equals("!@"); + attributes.addContains(attr.image, key.image, value.image, negated); }) ) | diff --git a/src/edu/stanford/nlp/semgraph/semgrex/SemgrexParserConstants.java b/src/edu/stanford/nlp/semgraph/semgrex/SemgrexParserConstants.java index 7a55891f0c..cad0f272ea 100644 --- a/src/edu/stanford/nlp/semgraph/semgrex/SemgrexParserConstants.java +++ 
b/src/edu/stanford/nlp/semgraph/semgrex/SemgrexParserConstants.java @@ -55,6 +55,7 @@ interface SemgrexParserConstants { "\"~\"", "\"=\"", "\"!:\"", + "\"!@\"", "\"{\"", "\";\"", "\"}\"", diff --git a/src/edu/stanford/nlp/semgraph/semgrex/SemgrexParserTokenManager.java b/src/edu/stanford/nlp/semgraph/semgrex/SemgrexParserTokenManager.java index e3fe4d9933..4fe38b9910 100644 --- a/src/edu/stanford/nlp/semgraph/semgrex/SemgrexParserTokenManager.java +++ b/src/edu/stanford/nlp/semgraph/semgrex/SemgrexParserTokenManager.java @@ -47,7 +47,7 @@ private int jjMoveStringLiteralDfa0_0(){ return jjStopAtPos(0, 9); case 33: jjmatchedKind = 15; - return jjMoveStringLiteralDfa1_0(0x400000L); + return jjMoveStringLiteralDfa1_0(0xc00000L); case 35: return jjStopAtPos(0, 6); case 36: @@ -63,7 +63,7 @@ private int jjMoveStringLiteralDfa0_0(){ case 58: return jjStopAtPos(0, 10); case 59: - return jjStopAtPos(0, 24); + return jjStopAtPos(0, 25); case 61: return jjStartNfaWithStates_0(0, 21, 2); case 63: @@ -75,11 +75,11 @@ private int jjMoveStringLiteralDfa0_0(){ case 93: return jjStopAtPos(0, 18); case 123: - return jjStopAtPos(0, 23); + return jjStopAtPos(0, 24); case 124: return jjStopAtPos(0, 13); case 125: - return jjStopAtPos(0, 25); + return jjStopAtPos(0, 26); case 126: return jjStopAtPos(0, 20); default : @@ -98,6 +98,10 @@ private int jjMoveStringLiteralDfa1_0(long active0){ if ((active0 & 0x400000L) != 0L) return jjStopAtPos(1, 22); break; + case 64: + if ((active0 & 0x800000L) != 0L) + return jjStopAtPos(1, 23); + break; default : break; } @@ -358,7 +362,7 @@ else if (curChar < 128) public static final String[] jjstrLiteralImages = { "", null, null, "\100", null, null, "\43", "\44", null, "\12", "\72", "\50", "\51", "\174", "\46", "\41", "\77", "\133", "\135", "\54", "\176", "\75", "\41\72", -"\173", "\73", "\175", }; +"\41\100", "\173", "\73", "\175", }; protected Token jjFillToken() { final Token t; @@ -595,10 +599,10 @@ public void SwitchTo(int lexState) /** Lex State 
array. */ public static final int[] jjnewLexState = { -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - -1, + -1, -1, }; static final long[] jjtoToken = { - 0x3fffffdL, + 0x7fffffdL, }; static final long[] jjtoSkip = { 0x2L, diff --git a/test/src/edu/stanford/nlp/semgraph/semgrex/SemgrexTest.java b/test/src/edu/stanford/nlp/semgraph/semgrex/SemgrexTest.java index 6c4b7e7502..b74d2e3b58 100644 --- a/test/src/edu/stanford/nlp/semgraph/semgrex/SemgrexTest.java +++ b/test/src/edu/stanford/nlp/semgraph/semgrex/SemgrexTest.java @@ -249,6 +249,29 @@ public void testContainsExpression() { runTest(pattern, graph, "D", "F"); } + public void testContainsRegexExpression() { + // morphofeatures is a Map, so this should work + SemanticGraph graph = makeComplicatedGraph(); + Set vertices = graph.vertexSet(); + for (IndexedWord iw : vertices) { + if (iw.value().equals("B") || iw.value().equals("D") || iw.value().equals("F")) { + CoNLLUFeatures feats = new CoNLLUFeatures(); + feats.put("foo", "bar" + iw.value()); + iw.set(CoreAnnotations.CoNLLUFeats.class, feats); + } + } + + // test a positive regex + SemgrexPattern pattern = SemgrexPattern.compile("{morphofeatures@foo=/bar[BD]/}"); + runTest(pattern, graph, "B", "D"); + + // test a negative regex + // should match both the ones that don't have features + // and the ones that have a non-matching feature + pattern = SemgrexPattern.compile("{morphofeatures!@foo=/bar[BD]/}"); + runTest(pattern, graph, "A", "C", "E", "F", "G", "H", "I", "J"); + } + public void testReferencedRegex() { runTest("{word:/Bill/}", "[ate subj>Bill obj>[bill det>the]]", "Bill"); From a350aaa4ed6bdcdfad958f713f3e4659e3f22465 Mon Sep 17 00:00:00 2001 From: John Bauer Date: Wed, 5 Mar 2025 23:53:21 -0800 Subject: [PATCH 3/3] Rewrite the contains syntax to look a bit more like a map. 
This map allows repeated elements inside the same brackets --- .../nlp/semgraph/semgrex/NodePattern.java | 6 +- .../nlp/semgraph/semgrex/SemgrexParser.java | 119 ++++++++++++------ .../nlp/semgraph/semgrex/SemgrexParser.jj | 18 ++- .../semgrex/SemgrexParserConstants.java | 4 +- .../semgrex/SemgrexParserTokenManager.java | 15 +-- .../nlp/semgraph/semgrex/SemgrexTest.java | 30 ++++- 6 files changed, 139 insertions(+), 53 deletions(-) diff --git a/src/edu/stanford/nlp/semgraph/semgrex/NodePattern.java b/src/edu/stanford/nlp/semgraph/semgrex/NodePattern.java index a0277ce15b..6f7cb4c9f0 100644 --- a/src/edu/stanford/nlp/semgraph/semgrex/NodePattern.java +++ b/src/edu/stanford/nlp/semgraph/semgrex/NodePattern.java @@ -117,8 +117,10 @@ public NodePattern(GraphRelation r, boolean negDesc, if (!descString.equals("{")) descString += ";"; - String separator = negated ? "!=" : "="; - descString += (annotation + "@" + key + separator + value); + String separator = negated ? "!:" : ":"; + // TODO: the descString might look nicer if multiple contains + // for the same attribute were collapsed into the same map + descString += (annotation + ":{" + key + ":" + value + "}"); } if (attrs.root()) { diff --git a/src/edu/stanford/nlp/semgraph/semgrex/SemgrexParser.java b/src/edu/stanford/nlp/semgraph/semgrex/SemgrexParser.java index 0b7e5e8741..0ebfb5b2d5 100644 --- a/src/edu/stanford/nlp/semgraph/semgrex/SemgrexParser.java +++ b/src/edu/stanford/nlp/semgraph/semgrex/SemgrexParser.java @@ -65,7 +65,7 @@ final public SemgrexPattern Root() throws ParseException {// Root pattern for th case 11: case 15: case 17: - case 24:{ + case 26:{ node = SubNode(GraphRelation.ROOT); children.add(node); label_1: @@ -135,7 +135,7 @@ final public SemgrexPattern Root() throws ParseException {// Root pattern for th } case 15: case 17: - case 24:{ + case 26:{ result = ModNode(r); switch ((jj_ntk==-1)?jj_ntk_f():jj_ntk) { case RELATION: @@ -397,7 +397,7 @@ final public SemgrexPattern Root() throws 
ParseException {// Root pattern for th switch ((jj_ntk==-1)?jj_ntk_f():jj_ntk) { case 15: case 17: - case 24:{ + case 26:{ node = ModNode(reln); break; } @@ -454,7 +454,7 @@ final public SemgrexPattern Root() throws ParseException {// Root pattern for th case 14: case 15: case 17: - case 24:{ + case 26:{ ; break; } @@ -485,7 +485,7 @@ final public SemgrexPattern Root() throws ParseException {// Root pattern for th boolean startUnderNeg; switch ((jj_ntk==-1)?jj_ntk_f():jj_ntk) { case 17: - case 24:{ + case 26:{ child = Child(r); break; } @@ -512,7 +512,7 @@ final public SemgrexPattern Root() throws ParseException {// Root pattern for th child = NodeDisj(r); break; } - case 24:{ + case 26:{ child = Description(r); break; } @@ -529,6 +529,7 @@ final public SemgrexPattern Root() throws ParseException {// Root pattern for th Token key = null; Token value = null; Token attrType = null; + boolean negated = false; switch ((jj_ntk==-1)?jj_ntk_f():jj_ntk) { case IDENTIFIER:{ attr = jj_consume_token(IDENTIFIER); @@ -564,20 +565,21 @@ final public SemgrexPattern Root() throws ParseException {// Root pattern for th throw new ParseException(); } if (attr != null && value != null) { - boolean negated = attrType.image.equals("!:"); + negated = attrType.image.equals("!:"); attributes.setAttribute(attr.image, value.image, negated); } break; } - case ALIGNRELN: case 23:{ + jj_consume_token(23); + key = jj_consume_token(IDENTIFIER); switch ((jj_ntk==-1)?jj_ntk_f():jj_ntk) { - case ALIGNRELN:{ - attrType = jj_consume_token(ALIGNRELN); + case 10:{ + attrType = jj_consume_token(10); break; } - case 23:{ - attrType = jj_consume_token(23); + case 22:{ + attrType = jj_consume_token(22); break; } default: @@ -585,8 +587,6 @@ final public SemgrexPattern Root() throws ParseException {// Root pattern for th jj_consume_token(-1); throw new ParseException(); } - key = jj_consume_token(IDENTIFIER); - jj_consume_token(21); switch ((jj_ntk==-1)?jj_ntk_f():jj_ntk) { case IDENTIFIER:{ value = 
jj_consume_token(IDENTIFIER); @@ -605,12 +605,61 @@ final public SemgrexPattern Root() throws ParseException {// Root pattern for th {if (true) throw new SemgrexParseException("null while parsing semgrex expression: attr=" + attr + " key=" + key + " value=" + value);} } - boolean negated = attrType.image.equals("!@"); + negated = attrType.image.equals("!:"); attributes.addContains(attr.image, key.image, value.image, negated); + label_6: + while (true) { + switch ((jj_ntk==-1)?jj_ntk_f():jj_ntk) { + case 24:{ + ; + break; + } + default: + jj_la1[27] = jj_gen; + break label_6; + } + jj_consume_token(24); + key = jj_consume_token(IDENTIFIER); + switch ((jj_ntk==-1)?jj_ntk_f():jj_ntk) { + case 10:{ + attrType = jj_consume_token(10); + break; + } + case 22:{ + attrType = jj_consume_token(22); + break; + } + default: + jj_la1[28] = jj_gen; + jj_consume_token(-1); + throw new ParseException(); + } + switch ((jj_ntk==-1)?jj_ntk_f():jj_ntk) { + case IDENTIFIER:{ + value = jj_consume_token(IDENTIFIER); + break; + } + case REGEX:{ + value = jj_consume_token(REGEX); + break; + } + default: + jj_la1[29] = jj_gen; + jj_consume_token(-1); + throw new ParseException(); + } +if (attr == null || key == null || value == null) { + {if (true) throw new SemgrexParseException("null while parsing semgrex expression: attr=" + attr + + " key=" + key + " value=" + value);} + } + negated = attrType.image.equals("!:"); + attributes.addContains(attr.image, key.image, value.image, negated); + } + jj_consume_token(25); break; } default: - jj_la1[27] = jj_gen; + jj_la1[30] = jj_gen; jj_consume_token(-1); throw new ParseException(); } @@ -627,7 +676,7 @@ final public SemgrexPattern Root() throws ParseException {// Root pattern for th break; } default: - jj_la1[28] = jj_gen; + jj_la1[31] = jj_gen; jj_consume_token(-1); throw new ParseException(); } @@ -637,33 +686,33 @@ final public SemgrexPattern Root() throws ParseException {// Root pattern for th boolean link = false; NodeAttributes attributes = 
new NodeAttributes(); NodePattern pat; - jj_consume_token(24); + jj_consume_token(26); switch ((jj_ntk==-1)?jj_ntk_f():jj_ntk) { case IDENTIFIER: case EMPTY: case ROOT:{ AddAttribute(attributes); - label_6: + label_7: while (true) { switch ((jj_ntk==-1)?jj_ntk_f():jj_ntk) { - case 25:{ + case 24:{ ; break; } default: - jj_la1[29] = jj_gen; - break label_6; + jj_la1[32] = jj_gen; + break label_7; } - jj_consume_token(25); + jj_consume_token(24); AddAttribute(attributes); } break; } default: - jj_la1[30] = jj_gen; + jj_la1[33] = jj_gen; ; } - jj_consume_token(26); + jj_consume_token(25); switch ((jj_ntk==-1)?jj_ntk_f():jj_ntk) { case 21:{ jj_consume_token(21); @@ -680,7 +729,7 @@ final public SemgrexPattern Root() throws ParseException {// Root pattern for th break; } default: - jj_la1[31] = jj_gen; + jj_la1[34] = jj_gen; ; } pat = new NodePattern(r, underNodeNegation, attributes, link, name != null ? name.image : null); @@ -697,13 +746,13 @@ final public SemgrexPattern Root() throws ParseException {// Root pattern for th public Token jj_nt; private int jj_ntk; private int jj_gen; - final private int[] jj_la1 = new int[32]; + final private int[] jj_la1 = new int[35]; static private int[] jj_la1_0; static { jj_la1_init_0(); } private static void jj_la1_init_0() { - jj_la1_0 = new int[] {0x400,0x1028808,0x3801c,0x3801c,0x1028800,0x2000,0x3c01c,0x4000,0x3801c,0x2001c,0x80000,0x10,0x110,0x110,0x100000,0x200000,0x1c,0x1028800,0x2000,0x102c000,0x4000,0x1028000,0x1020000,0x400400,0x110,0x800008,0x110,0xc00408,0xd0,0x2000000,0xd0,0x200000,}; + jj_la1_0 = new int[] {0x400,0x4028808,0x3801c,0x3801c,0x4028800,0x2000,0x3c01c,0x4000,0x3801c,0x2001c,0x80000,0x10,0x110,0x110,0x100000,0x200000,0x1c,0x4028800,0x2000,0x402c000,0x4000,0x4028000,0x4020000,0x400400,0x110,0x400400,0x110,0x1000000,0x400400,0x110,0xc00400,0xd0,0x1000000,0xd0,0x200000,}; } /** Constructor with InputStream. 
*/ @@ -717,7 +766,7 @@ public SemgrexParser(java.io.InputStream stream, String encoding) { token = new Token(); jj_ntk = -1; jj_gen = 0; - for (int i = 0; i < 32; i++) jj_la1[i] = -1; + for (int i = 0; i < 35; i++) jj_la1[i] = -1; } /** Reinitialise. */ @@ -731,7 +780,7 @@ public void ReInit(java.io.InputStream stream, String encoding) { token = new Token(); jj_ntk = -1; jj_gen = 0; - for (int i = 0; i < 32; i++) jj_la1[i] = -1; + for (int i = 0; i < 35; i++) jj_la1[i] = -1; } /** Constructor. */ @@ -741,7 +790,7 @@ public SemgrexParser(java.io.Reader stream) { token = new Token(); jj_ntk = -1; jj_gen = 0; - for (int i = 0; i < 32; i++) jj_la1[i] = -1; + for (int i = 0; i < 35; i++) jj_la1[i] = -1; } /** Reinitialise. */ @@ -759,7 +808,7 @@ public void ReInit(java.io.Reader stream) { token = new Token(); jj_ntk = -1; jj_gen = 0; - for (int i = 0; i < 32; i++) jj_la1[i] = -1; + for (int i = 0; i < 35; i++) jj_la1[i] = -1; } /** Constructor with generated Token Manager. */ @@ -768,7 +817,7 @@ public SemgrexParser(SemgrexParserTokenManager tm) { token = new Token(); jj_ntk = -1; jj_gen = 0; - for (int i = 0; i < 32; i++) jj_la1[i] = -1; + for (int i = 0; i < 35; i++) jj_la1[i] = -1; } /** Reinitialise. 
*/ @@ -777,7 +826,7 @@ public void ReInit(SemgrexParserTokenManager tm) { token = new Token(); jj_ntk = -1; jj_gen = 0; - for (int i = 0; i < 32; i++) jj_la1[i] = -1; + for (int i = 0; i < 35; i++) jj_la1[i] = -1; } private Token jj_consume_token(int kind) throws ParseException { @@ -833,7 +882,7 @@ public ParseException generateParseException() { la1tokens[jj_kind] = true; jj_kind = -1; } - for (int i = 0; i < 32; i++) { + for (int i = 0; i < 35; i++) { if (jj_la1[i] == jj_gen) { for (int j = 0; j < 32; j++) { if ((jj_la1_0[i] & (1< (( (attrType = ":" | attrType = "!:") (value = | value = ) { if (attr != null && value != null) { - boolean negated = attrType.image.equals("!:"); + negated = attrType.image.equals("!:"); attributes.setAttribute(attr.image, value.image, negated); } }) | - (attrType = "@" | attrType = "!@") (key = ) "=" (value = | value = ) + ( ":{" + ((key = ) (attrType = ":" | attrType = "!:") (value = | value = ) { if (attr == null || key == null || value == null) { throw new SemgrexParseException("null while parsing semgrex expression: attr=" + attr + " key=" + key + " value=" + value); } - boolean negated = attrType.image.equals("!@"); + negated = attrType.image.equals("!:"); attributes.addContains(attr.image, key.image, value.image, negated); }) + ( ";" (key = ) (attrType = ":" | attrType = "!:") (value = | value = ) + { + if (attr == null || key == null || value == null) { + throw new SemgrexParseException("null while parsing semgrex expression: attr=" + attr + + " key=" + key + " value=" + value); + } + negated = attrType.image.equals("!:"); + attributes.addContains(attr.image, key.image, value.image, negated); + })* + "}" )) ) | ( attr = { attributes.setRoot(true); } ) diff --git a/src/edu/stanford/nlp/semgraph/semgrex/SemgrexParserConstants.java b/src/edu/stanford/nlp/semgraph/semgrex/SemgrexParserConstants.java index cad0f272ea..891073b9ff 100644 --- a/src/edu/stanford/nlp/semgraph/semgrex/SemgrexParserConstants.java +++ 
b/src/edu/stanford/nlp/semgraph/semgrex/SemgrexParserConstants.java @@ -55,10 +55,10 @@ interface SemgrexParserConstants { "\"~\"", "\"=\"", "\"!:\"", - "\"!@\"", - "\"{\"", + "\":{\"", "\";\"", "\"}\"", + "\"{\"", }; } diff --git a/src/edu/stanford/nlp/semgraph/semgrex/SemgrexParserTokenManager.java b/src/edu/stanford/nlp/semgraph/semgrex/SemgrexParserTokenManager.java index 4fe38b9910..4433fbc369 100644 --- a/src/edu/stanford/nlp/semgraph/semgrex/SemgrexParserTokenManager.java +++ b/src/edu/stanford/nlp/semgraph/semgrex/SemgrexParserTokenManager.java @@ -47,7 +47,7 @@ private int jjMoveStringLiteralDfa0_0(){ return jjStopAtPos(0, 9); case 33: jjmatchedKind = 15; - return jjMoveStringLiteralDfa1_0(0xc00000L); + return jjMoveStringLiteralDfa1_0(0x400000L); case 35: return jjStopAtPos(0, 6); case 36: @@ -61,9 +61,10 @@ private int jjMoveStringLiteralDfa0_0(){ case 44: return jjStopAtPos(0, 19); case 58: - return jjStopAtPos(0, 10); + jjmatchedKind = 10; + return jjMoveStringLiteralDfa1_0(0x800000L); case 59: - return jjStopAtPos(0, 25); + return jjStopAtPos(0, 24); case 61: return jjStartNfaWithStates_0(0, 21, 2); case 63: @@ -75,11 +76,11 @@ private int jjMoveStringLiteralDfa0_0(){ case 93: return jjStopAtPos(0, 18); case 123: - return jjStopAtPos(0, 24); + return jjStopAtPos(0, 26); case 124: return jjStopAtPos(0, 13); case 125: - return jjStopAtPos(0, 26); + return jjStopAtPos(0, 25); case 126: return jjStopAtPos(0, 20); default : @@ -98,7 +99,7 @@ private int jjMoveStringLiteralDfa1_0(long active0){ if ((active0 & 0x400000L) != 0L) return jjStopAtPos(1, 22); break; - case 64: + case 123: if ((active0 & 0x800000L) != 0L) return jjStopAtPos(1, 23); break; @@ -362,7 +363,7 @@ else if (curChar < 128) public static final String[] jjstrLiteralImages = { "", null, null, "\100", null, null, "\43", "\44", null, "\12", "\72", "\50", "\51", "\174", "\46", "\41", "\77", "\133", "\135", "\54", "\176", "\75", "\41\72", -"\41\100", "\173", "\73", "\175", }; +"\72\173", "\73", 
"\175", "\173", }; protected Token jjFillToken() { final Token t; diff --git a/test/src/edu/stanford/nlp/semgraph/semgrex/SemgrexTest.java b/test/src/edu/stanford/nlp/semgraph/semgrex/SemgrexTest.java index b74d2e3b58..a43ab168f1 100644 --- a/test/src/edu/stanford/nlp/semgraph/semgrex/SemgrexTest.java +++ b/test/src/edu/stanford/nlp/semgraph/semgrex/SemgrexTest.java @@ -227,7 +227,7 @@ public void testNegatedRegex() { public void testBrokenContainsExpression() { try { // word is a String, not a Map, so this should throw a parse exception - SemgrexPattern pattern = SemgrexPattern.compile("{word@foo=bar}"); + SemgrexPattern pattern = SemgrexPattern.compile("{word{foo=bar}}"); throw new AssertionError("Expected a SemgrexParseException"); } catch (SemgrexParseException e) { // good @@ -236,7 +236,7 @@ public void testBrokenContainsExpression() { public void testContainsExpression() { // morphofeatures is a Map, so this should work - SemgrexPattern pattern = SemgrexPattern.compile("{morphofeatures@foo=bar}"); + SemgrexPattern pattern = SemgrexPattern.compile("{morphofeatures:{foo:bar}}"); SemanticGraph graph = makeComplicatedGraph(); Set vertices = graph.vertexSet(); for (IndexedWord iw : vertices) { @@ -262,16 +262,38 @@ public void testContainsRegexExpression() { } // test a positive regex - SemgrexPattern pattern = SemgrexPattern.compile("{morphofeatures@foo=/bar[BD]/}"); + SemgrexPattern pattern = SemgrexPattern.compile("{morphofeatures:{foo:/bar[BD]/}}"); runTest(pattern, graph, "B", "D"); // test a negative regex // should match both the ones that don't have features // and the ones that have a non-matching feature - pattern = SemgrexPattern.compile("{morphofeatures!@foo=/bar[BD]/}"); + pattern = SemgrexPattern.compile("{morphofeatures:{foo!:/bar[BD]/}}"); runTest(pattern, graph, "A", "C", "E", "F", "G", "H", "I", "J"); } + public void testDoubleContainsExpression() { + // morphofeatures is a Map, so this should work + SemanticGraph graph = makeComplicatedGraph(); 
+ Set vertices = graph.vertexSet(); + for (IndexedWord iw : vertices) { + if (iw.value().equals("B") || iw.value().equals("D") || iw.value().equals("F")) { + CoNLLUFeatures feats = new CoNLLUFeatures(); + feats.put("foo", "bar"); + feats.put("name", iw.value()); + iw.set(CoreAnnotations.CoNLLUFeats.class, feats); + } + } + + // test a positive regex + SemgrexPattern pattern = SemgrexPattern.compile("{morphofeatures:{foo:/bar/;name:/[BD]/}}"); + runTest(pattern, graph, "B", "D"); + + // test one positive, one negative regex + pattern = SemgrexPattern.compile("{morphofeatures:{foo:/bar/;name!:/[BD]/}}"); + runTest(pattern, graph, "F"); + } + public void testReferencedRegex() { runTest("{word:/Bill/}", "[ate subj>Bill obj>[bill det>the]]", "Bill");