From c0c635ec721dec3c3d95a23e3da69f47084d29ef Mon Sep 17 00:00:00 2001
From: Christopher Manning
Date: Sat, 10 Oct 2015 15:33:20 -0700
Subject: [PATCH] Revert "Minor cleanup to old CMMClassifier - final, etc."
---
 doc/loglinear/QUICKSTART.txt | 2 +-
 doc/loglinear/README.txt | 2 +-
 .../qe/QuantifiableEntityExtractorITest.java | 135 --
 .../TokenSequenceMatcherITest.java | 132 +-
 .../parser/nndep/DependencyParserITest.java | 4 +-
 .../pipeline/AnnotationOutputterITest.java | 46 +-
 .../EntityMentionsAnnotatorITest.java | 2 +-
 ...ProtobufAnnotationSerializerSlowITest.java | 8 +-
 .../TokensRegexNERAnnotatorITest.java | 103 +-
 liblocal/README | 153 --
 ...lr-runtime.jar => antlr-runtime-3.1.2.jar} | Bin
 ...amcrest-core.jar => hamcrest-core-1.3.jar} | Bin
 .../{javaruntype.jar => javaruntype-1.2.jar} | Bin
 ...r => junit-quickcheck-core-0.4-beta-3.jar} | Bin
 ...unit-quickcheck-generators-0.4-beta-3.jar} | Bin
 ...t-theories.jar => junit-theories-4.12.jar} | Bin
 liblocal/{ognl.jar => ognl-3.0.5.jar} | Bin
 .../LogConditionalObjectiveFunction.java | 7 +-
 src/edu/stanford/nlp/ie/NumberNormalizer.java | 4 +-
 .../CRFLogConditionalObjectiveFunction.java | 12 +
 .../stanford/nlp/ie/ner/CMMClassifier.java | 76 +-
 src/edu/stanford/nlp/ling/CoreLabel.java | 9 +-
 .../tokensregex/BasicSequenceMatchResult.java | 13 +-
 .../ling/tokensregex/ComplexNodePattern.java | 466 ------
 .../ling/tokensregex/CoreMapNodePattern.java | 421 ++++-
 .../CoreMapNodePatternTrigger.java | 2 +-
 .../tokensregex/CoreMapSequenceMatcher.java | 4 +-
 .../stanford/nlp/ling/tokensregex/Env.java | 23 -
 .../nlp/ling/tokensregex/EnvLookup.java | 12 -
 .../nlp/ling/tokensregex/MapNodePattern.java | 37 -
 .../ling/tokensregex/MatchedExpression.java | 30 +-
 .../tokensregex/MultiCoreMapNodePattern.java | 2 +-
 .../ling/tokensregex/MultiNodePattern.java | 10 +-
 .../nlp/ling/tokensregex/PhraseTable.java | 2 +-
 .../ling/tokensregex/SequenceMatchResult.java | 12 +-
 .../ling/tokensregex/SequenceMatchRules.java | 153 +-
 .../nlp/ling/tokensregex/SequencePattern.java | 54 +-
 .../parser/TokenSequenceParser.java | 1117 +++++++------
 .../tokensregex/parser/TokenSequenceParser.jj | 20 +-
 .../parser/TokenSequenceParserConstants.java | 2 -
 .../TokenSequenceParserTokenManager.java | 127 +-
 .../nlp/loglinear/inference/CliqueTree.java | 1400 ++++++++---------
 .../nlp/loglinear/inference/TableFactor.java | 1182 +++++++-------
 .../loglinear/model/ConcatVectorTable.java | 268 ++--
 .../nlp/loglinear/model/NDArrayDoubles.java | 290 ++--
 .../ClauseSplitterSearchProblem.java | 7 +-
 .../naturalli/RelationTripleSegmenter.java | 19 +-
 .../nlp/pipeline/ChunkAnnotationUtils.java | 52 +-
 .../nlp/pipeline/CoreMapAggregator.java | 20 +-
 .../pipeline/CoreMapAttributeAggregator.java | 19 +-
 .../stanford/nlp/pipeline/JSONOutputter.java | 10 +-
 .../ProtobufAnnotationSerializer.java | 2 +-
 .../nlp/pipeline/SentenceAnnotator.java | 3 -
 .../nlp/pipeline/StanfordCoreNLPServer.java | 403 +----
 .../nlp/pipeline/TokensRegexNERAnnotator.java | 265 +---
 .../pipeline/demo/StanfordCoreNlpDemo.java | 34 +-
 .../nlp/pipeline/demo/corenlp-brat.css | 8 -
 .../nlp/pipeline/demo/corenlp-brat.html | 62 +-
 .../nlp/pipeline/demo/corenlp-brat.js | 256 +--
 .../stanford/nlp/semgraph/SemanticGraph.java | 72 +-
 .../nlp/semgraph/semgrex/GraphRelation.java | 363 ++---
 .../nlp/semgraph/semgrex/ParseException.java | 4 +-
 .../semgraph/semgrex/SemgrexBatchParser.java | 108 +-
 .../nlp/semgraph/semgrex/SemgrexParser.java | 325 ++--
 .../nlp/semgraph/semgrex/SemgrexParser.jj | 4 +-
 .../semgrex/SemgrexParserTokenManager.java | 214 ++-
 .../nlp/semgraph/semgrex/SemgrexPattern.java | 15 +-
 .../semgraph/semgrex/SimpleCharStream.java | 11 +-
 .../stanford/nlp/semgraph/semgrex/Token.java | 4 +-
 .../nlp/semgraph/semgrex/TokenMgrError.java | 4 +-
 .../semgrex/ssurgeon/CollapseSubtree.java | 96 --
 .../semgraph/semgrex/ssurgeon/Ssurgeon.java | 126 +-
 .../semgrex/ssurgeon/SsurgeonUtils.java | 9 -
 .../sequences/DocumentReaderAndWriter.java | 6 +-
 src/edu/stanford/nlp/time/TimeExpression.java | 3 +-
 src/edu/stanford/nlp/time/TimeFormatter.java | 16 +-
 .../UniversalEnglishGrammaticalRelations.java | 2 +-
 .../SunJurafskyChineseHeadFinder.java | 33 +-
 src/edu/stanford/nlp/util/HasInterval.java | 15 +-
 .../SequencePatternTriggerTest.java | 180 ---
 .../loglinear/learning}/CoNLLBenchmark.java | 5 +-
 .../nlp/pipeline/JSONOutputterTest.java | 32 +-
 .../semgraph/semgrex/SemgrexPatternTest.java | 107 +-
 83 files changed, 3498 insertions(+), 5758 deletions(-)
 delete mode 100644 itest/src/edu/stanford/nlp/ie/qe/QuantifiableEntityExtractorITest.java
 delete mode 100644 liblocal/README
 rename liblocal/{antlr-runtime.jar => antlr-runtime-3.1.2.jar} (100%)
 rename liblocal/{hamcrest-core.jar => hamcrest-core-1.3.jar} (100%)
 rename liblocal/{javaruntype.jar => javaruntype-1.2.jar} (100%)
 rename liblocal/{junit-quickcheck-core.jar => junit-quickcheck-core-0.4-beta-3.jar} (100%)
 rename liblocal/{junit-quickcheck-generators.jar => junit-quickcheck-generators-0.4-beta-3.jar} (100%)
 rename liblocal/{junit-theories.jar => junit-theories-4.12.jar} (100%)
 rename liblocal/{ognl.jar => ognl-3.0.5.jar} (100%)
 delete mode 100644 src/edu/stanford/nlp/ling/tokensregex/ComplexNodePattern.java
 delete mode 100644 src/edu/stanford/nlp/ling/tokensregex/MapNodePattern.java
 delete mode 100644 src/edu/stanford/nlp/semgraph/semgrex/ssurgeon/CollapseSubtree.java
 delete mode 100644 src/edu/stanford/nlp/semgraph/semgrex/ssurgeon/SsurgeonUtils.java
 delete mode 100644 test/src/edu/stanford/nlp/ling/tokensregex/SequencePatternTriggerTest.java
 rename {itest/src/edu/stanford/nlp/loglinear => test/src/edu/stanford/nlp/loglinear/learning}/CoNLLBenchmark.java (98%)
diff --git a/doc/loglinear/QUICKSTART.txt b/doc/loglinear/QUICKSTART.txt
index b6fc418fea..13ec8097ed 100644
--- a/doc/loglinear/QUICKSTART.txt
+++ b/doc/loglinear/QUICKSTART.txt
@@ -2,7 +2,7 @@ loglinear package quickstart:
 First, read the ConcatVector section in ARCH.txt.
-To jump straight into working code, go read generateSentenceModel() in edu.stanford.nlp.loglinear.CoNLLBenchmark.
+To jump straight into working code, go read generateSentenceModel() in edu.stanford.nlp.loglinear.learning.CoNLLBenchmark.
 #####################################################
diff --git a/doc/loglinear/README.txt b/doc/loglinear/README.txt
index 010a707b09..e30285a257 100644
--- a/doc/loglinear/README.txt
+++ b/doc/loglinear/README.txt
@@ -1,6 +1,6 @@
 For an explanation of how everything fits together, see ARCH.txt
-For a quick runnable object, go run edu.stanford.nlp.loglinear.CoNLLBenchmark in core's test package.
+For a quick runnable object, go run edu.stanford.nlp.loglinear.learning.CoNLLBenchmark in core's test package.
For a tutorial, see QUICKSTART.txt diff --git a/itest/src/edu/stanford/nlp/ie/qe/QuantifiableEntityExtractorITest.java b/itest/src/edu/stanford/nlp/ie/qe/QuantifiableEntityExtractorITest.java deleted file mode 100644 index e7e2148bb5..0000000000 --- a/itest/src/edu/stanford/nlp/ie/qe/QuantifiableEntityExtractorITest.java +++ /dev/null @@ -1,135 +0,0 @@ -package edu.stanford.nlp.ie.qe; - -import edu.stanford.nlp.ling.tokensregex.MatchedExpression; -import edu.stanford.nlp.pipeline.*; -import junit.framework.TestCase; - -import java.util.List; - -/** - * Test for quantifiable entity extractor - * @author Angel Chang - */ -public class QuantifiableEntityExtractorITest extends TestCase { - static AnnotationPipeline pipeline = null; - static QuantifiableEntityExtractor extractor = null; - - public void test() throws Exception { - // TODO: Enable tests after rules files are added to models - } - - @Override - public void setUp() throws Exception { - synchronized(QuantifiableEntityExtractorITest.class) { - if (pipeline == null) { - pipeline = new AnnotationPipeline(); - pipeline.addAnnotator(new TokenizerAnnotator(false, "en")); - pipeline.addAnnotator(new WordsToSentencesAnnotator(false)); - pipeline.addAnnotator(new POSTaggerAnnotator(DefaultPaths.DEFAULT_POS_MODEL, false)); - //pipeline.addAnnotator(new QuantifiableEntityNormalizingAnnotator(false, false)); - } - extractor = new QuantifiableEntityExtractor(); - //extractor.init(new Options()); - } - } - - protected static Annotation createDocument(String text) { - Annotation annotation = new Annotation(text); - pipeline.annotate(annotation); - return annotation; - } - - public static class ExpectedQuantity { - String text; - String normalizedValue; - String type; - - public ExpectedQuantity(String text, String normalizedValue, String type) { - this.text = text; - this.normalizedValue = normalizedValue; - this.type = type; - } - } - - public void runAndCheck(String prefix, String[] sentences, ExpectedQuantity[][] expected) throws Exception { - for (int si = 0; si < sentences.length; si++) { - String sentence = sentences[si]; - Annotation annotation = createDocument(sentence); - List matchedExpressions = extractor.extract(annotation); - - // Print out matched text and value - if (expected == null) { - for (int i = 0; i < matchedExpressions.size(); i++) { - String text = matchedExpressions.get(i).getText(); - Object value = matchedExpressions.get(i).getValue(); - System.out.println(prefix + ": Got expression " + text + " with value " + value); - } - assertTrue(prefix + ": No expected provided", false); - } else { - int minMatchable = Math.min(expected[si].length, matchedExpressions.size()); - for (int i = 0; i < minMatchable; i++) { - ExpectedQuantity expectedQuantity = expected[si][i]; - MatchedExpression matched = matchedExpressions.get(i); - SimpleQuantifiableEntity actualQuantity = (SimpleQuantifiableEntity) matched.getValue().get(); - assertEquals(prefix + ".matched." + si + "." + i + ".text", expectedQuantity.text, matched.getText()); - assertEquals(prefix + ".matched." + si + "." + i + ".normalizedValue", expectedQuantity.normalizedValue, actualQuantity.toString()); - assertEquals(prefix + ".matched." + si + "." + i + ".type", expectedQuantity.type, actualQuantity.getUnit().type); - } - assertEquals(prefix + ".length." + si, expected[si].length, matchedExpressions.size()); - } - } - } - - public void _testMoney() throws Exception { - String[] sentences = { - "I have 1 dollar and 2 cents.", - "It cost 10 thousand million dollars." 
- }; - // TODO: merge the 1 dollar and 2 cents - ExpectedQuantity[][] expected = { - {new ExpectedQuantity("1 dollar", "$1.00", "MONEY"), new ExpectedQuantity("2 cents", "$0.02", "MONEY")}, - {new ExpectedQuantity("10 thousand million dollars", "$10000000000.00", "MONEY")} - }; - - runAndCheck("testMoney", sentences, expected); - } - - public void _testLength() throws Exception { - String[] sentences = { - "We are 2 kilometer away.", - "We are 2 kilometers away.", - "We turn after 5 miles.", - "The box is 100 centimeters tall.", - "The box is 10cm wide.", - "The box is over 1000 mm long.", - "The box is 2ft long." - }; - ExpectedQuantity[][] expected = { - {new ExpectedQuantity("2 kilometer", "2000.0m", "LENGTH")}, - {new ExpectedQuantity("2 kilometers", "2000.0m", "LENGTH")}, - {new ExpectedQuantity("5 miles", "5.0mi", "LENGTH")}, - {new ExpectedQuantity("100 centimeters", "1.0m", "LENGTH")}, - {new ExpectedQuantity("10cm", "0.1m", "LENGTH")}, - {new ExpectedQuantity("1000 mm", "1.0m", "LENGTH")}, - {new ExpectedQuantity("2ft", "2.0'", "LENGTH")} - }; - runAndCheck("testLength", sentences, expected); - } - - // We do weight instead of mass since in typical natural language - // kilograms are used to refer to weight vs mass (in scientific usage) - public void _testWeight() throws Exception { - String[] sentences = { - "The ball is 2 kilograms in weight.", - "There are five grams.", - "How much is seven pounds?" - }; - ExpectedQuantity[][] expected = { - {new ExpectedQuantity("2 kilograms", "2.0kg", "WEIGHT")}, - {new ExpectedQuantity("five grams", "0.005kg", "WEIGHT")}, - {new ExpectedQuantity("seven pounds", "7.0lb", "WEIGHT")} - }; - runAndCheck("testWeight", sentences, expected); - } - -} diff --git a/itest/src/edu/stanford/nlp/ling/tokensregex/TokenSequenceMatcherITest.java b/itest/src/edu/stanford/nlp/ling/tokensregex/TokenSequenceMatcherITest.java index 34d3e26235..ef81bedc47 100644 --- a/itest/src/edu/stanford/nlp/ling/tokensregex/TokenSequenceMatcherITest.java +++ b/itest/src/edu/stanford/nlp/ling/tokensregex/TokenSequenceMatcherITest.java @@ -1,21 +1,21 @@ package edu.stanford.nlp.ling.tokensregex; +import edu.stanford.nlp.io.IOUtils; import edu.stanford.nlp.ling.CoreAnnotations; -import edu.stanford.nlp.ling.CoreLabel; import edu.stanford.nlp.pipeline.*; -import edu.stanford.nlp.process.CoreLabelTokenFactory; -import edu.stanford.nlp.process.PTBTokenizer; -import edu.stanford.nlp.process.TokenizerFactory; import edu.stanford.nlp.util.CoreMap; import edu.stanford.nlp.util.Pair; +import edu.stanford.nlp.util.StringUtils; import edu.stanford.nlp.util.Timing; import junit.framework.TestCase; +import java.io.File; import java.io.IOException; -import java.io.StringReader; import java.util.ArrayList; +import java.util.Arrays; import java.util.Iterator; import java.util.List; +import java.util.Map; import java.util.regex.Pattern; public class TokenSequenceMatcherITest extends TestCase { @@ -94,50 +94,6 @@ public void testTokenSequenceMatcherValue() throws IOException { assertFalse(match); } - public void testTokenSequenceMatcherBeginEnd() throws IOException { - CoreMap doc = createDocument(testText); - - // Test simple sequence with begin sequence matching - TokenSequencePattern p = TokenSequencePattern.compile("^ [] []"); - TokenSequenceMatcher m = p.getMatcher(doc.get(CoreAnnotations.TokensAnnotation.class)); - - boolean match = m.find(); - assertTrue(match); - assertEquals("the number", m.group()); - - match = m.find(); - assertFalse(match); - - // Test simple sequence with end 
sequence matching - p = TokenSequencePattern.compile("[] [] $"); - m = p.getMatcher(doc.get(CoreAnnotations.TokensAnnotation.class)); - - match = m.find(); - assertTrue(match); - assertEquals("fifty.", m.group()); - - match = m.find(); - assertFalse(match); - - // Test simple sequence with begin and end sequence matching - p = TokenSequencePattern.compile("^ [] [] $"); - m = p.getMatcher(doc.get(CoreAnnotations.TokensAnnotation.class)); - - match = m.find(); - assertFalse(match); - - // Test simple sequence with ^$ in a string regular expression - p = TokenSequencePattern.compile("/^number$/"); - m = p.getMatcher(doc.get(CoreAnnotations.TokensAnnotation.class)); - - match = m.find(); - assertTrue(match); - assertEquals("number", m.group()); - - match = m.find(); - assertFalse(match); - } - private static final String testText1 = "Mellitus was the first Bishop of London, the third Archbishop of Canterbury, and a member of the Gregorian mission sent to England to convert the Anglo-Saxons. He arrived in 601 AD, and was consecrated as Bishop of London in 604."; public void testTokenSequenceMatcher1() throws IOException { CoreMap doc = createDocument(testText1); @@ -223,7 +179,7 @@ public void testTokenSequenceMatcher1() throws IOException { match = m.find(); assertTrue(match); assertEquals(0, m.groupCount()); - assertEquals("London in 604.", m.group()); + assertEquals("London in 604 .", m.group()); match = m.find(); assertFalse(match); } @@ -479,31 +435,6 @@ public void testTokenSequenceMatcherConj() throws IOException { assertFalse(match); } - public void testTokenSequenceMatcherConj2() throws IOException { - String content = "The cat is sleeping on the floor."; - String greedyPattern = "(?: ([]* cat []*) & ([]* sleeping []*))"; - - TokenizerFactory tf = PTBTokenizer.factory(new CoreLabelTokenFactory(), ""); - List tokens = tf.getTokenizer(new StringReader(content)).tokenize(); - TokenSequencePattern seqPattern = TokenSequencePattern.compile(greedyPattern); - TokenSequenceMatcher matcher = seqPattern.getMatcher(tokens); - - boolean entireMatch = matcher.matches(); - assertTrue(entireMatch); - - boolean match = matcher.find(); - assertTrue(match); - assertEquals("The cat is sleeping on the floor.", matcher.group()); - - String reluctantPattern = "(?: ([]*? cat []*?) & ([]*? 
sleeping []*?))"; - TokenSequencePattern seqPattern2 = TokenSequencePattern.compile(reluctantPattern); - TokenSequenceMatcher matcher2 = seqPattern2.getMatcher(tokens); - - match = matcher2.find(); - assertTrue(match); - assertEquals("The cat is sleeping", matcher2.group()); - } - public void testTokenSequenceMatcherConjAll() throws IOException { CoreMap doc = createDocument(testText1); TokenSequencePattern p = TokenSequencePattern.compile( @@ -1048,7 +979,7 @@ public void testTokenSequenceOptimizeOrString() throws IOException { TokenSequenceMatcher m = p.getMatcher(doc.get(CoreAnnotations.TokensAnnotation.class)); boolean match = m.find(); assertTrue(match); - assertEquals("atropine we need to have many many words here but we don't sweating", m.group(0)); + assertEquals("atropine we need to have many many words here but we do n't sweating", m.group(0)); match = m.find(); assertFalse(match); @@ -1074,7 +1005,7 @@ public void testMultiplePatterns() throws IOException { CoreMap doc = createDocument("atropine we need to have many many words here but we don't sweating"); MultiPatternMatcher multiPatternMatcher = TokenSequencePattern.getMultiPatternMatcher(p1, p2); List expected = new ArrayList(); - expected.add("atropine we need to have many many words here but we don't sweating"); + expected.add("atropine we need to have many many words here but we do n't sweating"); Iterator expectedIter = expected.iterator(); Iterable> matches = @@ -1256,7 +1187,7 @@ public void testTokenSequenceMatcherNumber() throws IOException { match = m.find(); assertTrue(match); assertEquals(0, m.groupCount()); - assertEquals("January 3, 2002", m.group()); + assertEquals("January 3 , 2002", m.group()); match = m.find(); assertFalse(match); @@ -1265,7 +1196,7 @@ public void testTokenSequenceMatcherNumber() throws IOException { match = m.find(); assertTrue(match); assertEquals(0, m.groupCount()); - assertEquals("January 3, 2002", m.group()); + assertEquals("January 3 , 2002", m.group()); match = m.find(); assertFalse(match); @@ -1473,32 +1404,6 @@ public void testTokenSequenceMatcherMultiNodePattern() throws IOException { assertFalse(match); } - public void testTokenSequenceMatcherMultiNodePattern2() throws IOException { - CoreMap doc = createDocument("Replace the lamp with model wss.32dc55c3e945384dbc5e533ab711fd24"); - - // Greedy - TokenSequencePattern p = TokenSequencePattern.compile("/model/ ((?m){1,4}/\\w+\\.\\w+/)"); - TokenSequenceMatcher m = p.getMatcher(doc.get(CoreAnnotations.TokensAnnotation.class)); - boolean match = m.find(); - assertTrue(match); - assertEquals(1, m.groupCount()); - assertEquals("model wss.32dc55c3e945384dbc5e533ab711fd24", m.group()); - assertEquals("wss.32dc55c3e945384dbc5e533ab711fd24", m.group(1)); - match = m.find(); - assertFalse(match); - - // Reluctant - p = TokenSequencePattern.compile("/model/ ((?m){1,4}?/\\w+\\.\\w+/)"); - m = p.getMatcher(doc.get(CoreAnnotations.TokensAnnotation.class)); - match = m.find(); - assertTrue(match); - assertEquals(1, m.groupCount()); - assertEquals("model wss.32", m.group()); - assertEquals("wss.32", m.group(1)); - match = m.find(); - assertFalse(match); - } - public void testTokenSequenceMatcherBackRef() throws IOException { CoreMap doc = createDocument("A A A A A A A B A A B A C A E A A A A A A A A A A A B A A A"); @@ -1583,18 +1488,17 @@ public void testCompile() { //assertEquals(m.group(), "matching this"); } - public void testBindingCompile(){ - Env env = TokenSequencePattern.getNewEnv(); - 
env.bind("wordname",CoreAnnotations.TextAnnotation.class); - String s = "[wordname:\"name\"]{1,2}"; - TokenSequencePattern p = TokenSequencePattern.compile(env, s); - } - -// // This does not work!!! -// public void testNoBindingCompile(){ + //This DOES NOT work right now!! +// public void testCompile2(){ // Env env = TokenSequencePattern.getNewEnv(); +// env.bind("wordname",CoreAnnotations.TextAnnotation.class); // String s = "[" + CoreAnnotations.TextAnnotation.class.getName()+":\"name\"]{1,2}"; // TokenSequencePattern p = TokenSequencePattern.compile(env, s); +// for(Map.Entry vars: env.getVariables().entrySet()){ +// if(vars.getValue().equals(CoreAnnotations.TextAnnotation.class)){ +// System.out.println("Found " + vars.getKey() + " binding for " + vars.getValue()); +// } +// } // } public void testCaseInsensitive1(){ diff --git a/itest/src/edu/stanford/nlp/parser/nndep/DependencyParserITest.java b/itest/src/edu/stanford/nlp/parser/nndep/DependencyParserITest.java index 6e2eeda43e..f7bb2f3106 100644 --- a/itest/src/edu/stanford/nlp/parser/nndep/DependencyParserITest.java +++ b/itest/src/edu/stanford/nlp/parser/nndep/DependencyParserITest.java @@ -46,7 +46,7 @@ public void testDependencyParserEnglishSD() { } // Lower because we're evaluating on PTB + extraDevTest, not just PTB - private static final double EnglishUdLas = 88.72648417258083; + private static final double EnglishUdLas = 84.9873; /** * Test that the NN dependency parser performance doesn't change. @@ -54,7 +54,7 @@ public void testDependencyParserEnglishSD() { public void testDependencyParserEnglishUD() { DependencyParser parser = new DependencyParser(); parser.loadModelFile("/u/nlp/data/depparser/nn/distrib-2015-04-16/english_UD.gz"); - double las = parser.testCoNLL("/u/nlp/data/depparser/nn/data/dependency_treebanks/UD-converted/dev.conll", null); + double las = parser.testCoNLL("/u/nlp/data/depparser/nn/data/dependency_treebanks/USD/dev.conll", null); assertEquals(String.format("English UD LAS should be %.2f but was %.2f", EnglishUdLas, las), EnglishUdLas, las, 1e-4); } diff --git a/itest/src/edu/stanford/nlp/pipeline/AnnotationOutputterITest.java b/itest/src/edu/stanford/nlp/pipeline/AnnotationOutputterITest.java index 74b256872e..6b4416437a 100644 --- a/itest/src/edu/stanford/nlp/pipeline/AnnotationOutputterITest.java +++ b/itest/src/edu/stanford/nlp/pipeline/AnnotationOutputterITest.java @@ -43,78 +43,74 @@ public void testSimpleSentenceJSON() throws IOException { "{\n" + " \"sentences\": [\n" + " {\n" + - " \"index\": 0,\n" + + " \"index\": \"0\",\n" + " \"parse\": \"(ROOT (NP (JJ Bad) (NN wolf)))\",\n" + " \"basic-dependencies\": [\n" + " {\n" + " \"dep\": \"ROOT\",\n" + - " \"governor\": 0,\n" + + " \"governor\": \"0\",\n" + " \"governorGloss\": \"ROOT\",\n" + - " \"dependent\": 2,\n" + + " \"dependent\": \"2\",\n" + " \"dependentGloss\": \"wolf\"\n" + " },\n" + " {\n" + " \"dep\": \"amod\",\n" + - " \"governor\": 2,\n" + + " \"governor\": \"2\",\n" + " \"governorGloss\": \"wolf\",\n" + - " \"dependent\": 1,\n" + + " \"dependent\": \"1\",\n" + " \"dependentGloss\": \"Bad\"\n" + " }\n" + " ],\n" + " \"collapsed-dependencies\": [\n" + " {\n" + " \"dep\": \"ROOT\",\n" + - " \"governor\": 0,\n" + + " \"governor\": \"0\",\n" + " \"governorGloss\": \"ROOT\",\n" + - " \"dependent\": 2,\n" + + " \"dependent\": \"2\",\n" + " \"dependentGloss\": \"wolf\"\n" + " },\n" + " {\n" + " \"dep\": \"amod\",\n" + - " \"governor\": 2,\n" + + " \"governor\": \"2\",\n" + " \"governorGloss\": \"wolf\",\n" + - " \"dependent\": 1,\n" + + " 
\"dependent\": \"1\",\n" + " \"dependentGloss\": \"Bad\"\n" + " }\n" + " ],\n" + " \"collapsed-ccprocessed-dependencies\": [\n" + " {\n" + " \"dep\": \"ROOT\",\n" + - " \"governor\": 0,\n" + + " \"governor\": \"0\",\n" + " \"governorGloss\": \"ROOT\",\n" + - " \"dependent\": 2,\n" + + " \"dependent\": \"2\",\n" + " \"dependentGloss\": \"wolf\"\n" + " },\n" + " {\n" + " \"dep\": \"amod\",\n" + - " \"governor\": 2,\n" + + " \"governor\": \"2\",\n" + " \"governorGloss\": \"wolf\",\n" + - " \"dependent\": 1,\n" + + " \"dependent\": \"1\",\n" + " \"dependentGloss\": \"Bad\"\n" + " }\n" + " ],\n" + " \"tokens\": [\n" + " {\n" + - " \"index\": 1,\n" + + " \"index\": \"1\",\n" + " \"word\": \"Bad\",\n" + " \"lemma\": \"bad\",\n" + - " \"characterOffsetBegin\": 0,\n" + - " \"characterOffsetEnd\": 3,\n" + + " \"characterOffsetBegin\": \"0\",\n" + + " \"characterOffsetEnd\": \"3\",\n" + " \"pos\": \"JJ\",\n" + - " \"ner\": \"O\",\n" + - " \"before\": \"\",\n" + - " \"after\": \" \"\n" + + " \"ner\": \"O\"\n" + " },\n" + " {\n" + - " \"index\": 2,\n" + + " \"index\": \"2\",\n" + " \"word\": \"wolf\",\n" + " \"lemma\": \"wolf\",\n" + - " \"characterOffsetBegin\": 4,\n" + - " \"characterOffsetEnd\": 8,\n" + + " \"characterOffsetBegin\": \"4\",\n" + + " \"characterOffsetEnd\": \"8\",\n" + " \"pos\": \"NN\",\n" + - " \"ner\": \"O\",\n" + - " \"before\": \" \",\n" + - " \"after\": \"\"\n" + + " \"ner\": \"O\"\n" + " }\n" + " ]\n" + " }\n" + diff --git a/itest/src/edu/stanford/nlp/pipeline/EntityMentionsAnnotatorITest.java b/itest/src/edu/stanford/nlp/pipeline/EntityMentionsAnnotatorITest.java index c07193ff03..fc0b0ae02c 100644 --- a/itest/src/edu/stanford/nlp/pipeline/EntityMentionsAnnotatorITest.java +++ b/itest/src/edu/stanford/nlp/pipeline/EntityMentionsAnnotatorITest.java @@ -151,7 +151,7 @@ public void testNewsText() { "[Text=Monday CharacterOffsetBegin=91 CharacterOffsetEnd=97 Tokens=[Monday-18] TokenBegin=17 TokenEnd=18 NamedEntityTag=DATE NormalizedNamedEntityTag=XXXX-WXX-1 EntityType=DATE SentenceIndex=0 Timex=Monday]", "[Text=5,500 CharacterOffsetBegin=123 CharacterOffsetEnd=128 Tokens=[5,500-5] TokenBegin=23 TokenEnd=24 NamedEntityTag=NUMBER NormalizedNamedEntityTag=~5500.0 EntityType=NUMBER SentenceIndex=1]", "[Text=Dickson Poon University of Oxford China Center CharacterOffsetBegin=152 CharacterOffsetEnd=198 Tokens=[Dickson-11, Poon-12, University-13, of-14, Oxford-15, China-16, Center-17] TokenBegin=29 TokenEnd=36 NamedEntityTag=ORGANIZATION EntityType=ORGANIZATION SentenceIndex=1]", - "[Text=St Hugh's College CharacterOffsetBegin=202 CharacterOffsetEnd=219 Tokens=[St-19, Hugh-20, 's-21, College-22] TokenBegin=37 TokenEnd=41 NamedEntityTag=ORGANIZATION EntityType=ORGANIZATION SentenceIndex=1]", + "[Text=St Hugh 's College CharacterOffsetBegin=202 CharacterOffsetEnd=219 Tokens=[St-19, Hugh-20, 's-21, College-22] TokenBegin=37 TokenEnd=41 NamedEntityTag=ORGANIZATION EntityType=ORGANIZATION SentenceIndex=1]", "[Text=21 million pounds CharacterOffsetBegin=231 CharacterOffsetEnd=248 Tokens=[21-25, million-26, pounds-27] TokenBegin=43 TokenEnd=46 NamedEntityTag=MONEY NormalizedNamedEntityTag=~£2.1E7 EntityType=MONEY SentenceIndex=1]", "[Text=Dickson Poon CharacterOffsetBegin=250 CharacterOffsetEnd=262 Tokens=[Dickson-1, Poon-2] TokenBegin=47 TokenEnd=49 NamedEntityTag=PERSON EntityType=PERSON SentenceIndex=2]", "[Text=Hong Kong CharacterOffsetBegin=286 CharacterOffsetEnd=295 Tokens=[Hong-7, Kong-8] TokenBegin=53 TokenEnd=55 NamedEntityTag=LOCATION EntityType=LOCATION SentenceIndex=2]", diff --git 
a/itest/src/edu/stanford/nlp/pipeline/ProtobufAnnotationSerializerSlowITest.java b/itest/src/edu/stanford/nlp/pipeline/ProtobufAnnotationSerializerSlowITest.java index 6b357eea41..6600a590ad 100644 --- a/itest/src/edu/stanford/nlp/pipeline/ProtobufAnnotationSerializerSlowITest.java +++ b/itest/src/edu/stanford/nlp/pipeline/ProtobufAnnotationSerializerSlowITest.java @@ -257,7 +257,7 @@ public void testSentiment() { @Test public void testOpenie() { - testAnnotators("tokenize,ssplit,pos,lemma,depparse,natlog,openie"); + testAnnotators("tokenize,ssplit,pos,depparse,natlog,openie"); } @Test @@ -415,12 +415,6 @@ public void testSerializeNatLog() { testAnnotators("tokenize,ssplit,pos,lemma,depparse,natlog"); } - - @Test - public void testGender() { - testAnnotators("tokenize,ssplit,pos,gender"); - } - /** * Is the protobuf annotator "CoreNLP complete?" * That is, does it effectively save every combination of annotators possible? diff --git a/itest/src/edu/stanford/nlp/pipeline/TokensRegexNERAnnotatorITest.java b/itest/src/edu/stanford/nlp/pipeline/TokensRegexNERAnnotatorITest.java index 2b34cb7c59..69191d9e6f 100644 --- a/itest/src/edu/stanford/nlp/pipeline/TokensRegexNERAnnotatorITest.java +++ b/itest/src/edu/stanford/nlp/pipeline/TokensRegexNERAnnotatorITest.java @@ -1,7 +1,6 @@ package edu.stanford.nlp.pipeline; import edu.stanford.nlp.io.IOUtils; -import edu.stanford.nlp.ling.CoreAnnotation; import edu.stanford.nlp.ling.CoreAnnotations; import edu.stanford.nlp.ling.CoreLabel; import edu.stanford.nlp.util.StringUtils; @@ -48,11 +47,7 @@ protected static TokensRegexNERAnnotator getTokensRegexNerAnnotator(Properties p protected static TokensRegexNERAnnotator getTokensRegexNerAnnotator(String[][] patterns, boolean ignoreCase) throws Exception { - return getTokensRegexNerAnnotator(new Properties(), patterns, ignoreCase); - } - - protected static TokensRegexNERAnnotator getTokensRegexNerAnnotator(Properties props, String[][] patterns, boolean ignoreCase) throws Exception - { + Properties props = new Properties(); File tempFile = File.createTempFile("tokensregexnertest.patterns", "txt"); tempFile.deleteOnExit(); PrintWriter pw = IOUtils.getPrintWriter(tempFile.getAbsolutePath()); @@ -74,22 +69,14 @@ protected static Annotation createDocument(String text) { /** * Helper method, checks that each token is tagged with the expected NER type. */ - private static void checkNerTags(List tokens, String... tags) { + private static void checkTags(List tokens, String ... tags) { assertEquals(tags.length, tokens.size()); for (int i = 0; i < tags.length; ++i) { - assertEquals("Mismatch for token tag NER " + i + " " + tokens.get(i), + assertEquals("Mismatch for token " + i + " " + tokens.get(i), tags[i], tokens.get(i).get(CoreAnnotations.NamedEntityTagAnnotation.class)); } } - private static void checkTags(List tokens, Class key, String... 
tags) { - assertEquals(tags.length, tokens.size()); - for (int i = 0; i < tags.length; ++i) { - assertEquals("Mismatch for token tag " + key + " " + i + " " + tokens.get(i), - tags[i], tokens.get(i).get(key)); - } - } - /** * Helper method, re-annotate each token with specified tag */ @@ -115,15 +102,15 @@ public void testTokensRegexSyntax() throws Exception { annotatorCased.annotate(document); List tokens = document.get(CoreAnnotations.TokensAnnotation.class); - checkNerTags(tokens, - "ORGANIZATION", "ORGANIZATION", "ORGANIZATION", "O", "O", "O", "LOCATION", "O"); + checkTags(tokens, + "ORGANIZATION", "ORGANIZATION", "ORGANIZATION", "O", "O", "O", "LOCATION", "O"); reannotate(tokens, CoreAnnotations.NamedEntityTagAnnotation.class, "O", "O", "LOCATION", "O", "O", "O", "LOCATION", "O"); annotatorCased.annotate(document); - checkNerTags(tokens, - "SCHOOL", "SCHOOL", "SCHOOL", "O", "O", "O", "LOCATION", "O"); + checkTags(tokens, + "SCHOOL", "SCHOOL", "SCHOOL", "O", "O", "O", "LOCATION", "O"); // Try lowercase Annotator annotatorCaseless = getTokensRegexNerAnnotator(regexes, true); @@ -131,14 +118,14 @@ public void testTokensRegexSyntax() throws Exception { str = "university of alaska is located in alaska."; document = createDocument(str); tokens = document.get(CoreAnnotations.TokensAnnotation.class); - checkNerTags(tokens, - "O", "O", "LOCATION", "O", "O", "O", "LOCATION", "O"); + checkTags(tokens, + "O", "O", "LOCATION", "O", "O", "O", "LOCATION", "O"); annotatorCased.annotate(document); - checkNerTags(tokens, - "O", "O", "LOCATION", "O", "O", "O", "LOCATION", "O"); + checkTags(tokens, + "O", "O", "LOCATION", "O", "O", "O", "LOCATION", "O"); annotatorCaseless.annotate(document); - checkNerTags(tokens, - "SCHOOL", "SCHOOL", "SCHOOL", "O", "O", "O", "LOCATION", "O"); + checkTags(tokens, + "SCHOOL", "SCHOOL", "SCHOOL", "O", "O", "O", "LOCATION", "O"); } // Tests for TokensRegex syntax with match group @@ -154,61 +141,11 @@ public void testTokensRegexMatchGroup() throws Exception { annotatorCased.annotate(document); List tokens = document.get(CoreAnnotations.TokensAnnotation.class); - checkNerTags(tokens, + checkTags(tokens, "O", "O", "MOVIE", "O", "O", "O"); } - // Tests for TokensRegexNer annotator annotating other fields - public void testTokensRegexNormalizedAnnotate() throws Exception { - Properties props = new Properties(); - props.setProperty(REGEX_ANNOTATOR_NAME + ".mapping.header", "pattern,ner,normalized,overwrite,priority,group"); - - String[][] regexes = - new String[][]{ - new String[]{"blue", "COLOR", "B", "", "0"}, - new String[]{"red", "COLOR", "R", "", "0"}, - new String[]{"green", "COLOR", "G", "", "0"} - }; - Annotator annotatorCased = getTokensRegexNerAnnotator(props, regexes, false); - - String str = "These are all colors: blue, red, and green."; - Annotation document = createDocument(str); - annotatorCased.annotate(document); - List tokens = document.get(CoreAnnotations.TokensAnnotation.class); - - checkTags(tokens, CoreAnnotations.TextAnnotation.class, "These", "are", "all", "colors", ":", "blue", ",", "red", ",", "and", "green", "."); - checkTags(tokens, CoreAnnotations.NamedEntityTagAnnotation.class, "O", "O", "O", "O", "O", "COLOR", "O", "COLOR", "O", "O", "COLOR", "O"); - checkTags(tokens, CoreAnnotations.NormalizedNamedEntityTagAnnotation.class, null, null, null, null, null, "B", null, "R", null, null, "G", null); - } - - public static class TestAnnotation implements CoreAnnotation { - public Class getType() { - return String.class; - } - } - - // Tests for 
TokensRegexNer annotator annotating other fields with custom key mapping - public void testTokensRegexCustomAnnotate() throws Exception { - - Properties props = new Properties(); - props.setProperty(REGEX_ANNOTATOR_NAME + ".mapping.header", "pattern,test,overwrite,priority,group"); - props.setProperty(REGEX_ANNOTATOR_NAME + ".mapping.field.test", "edu.stanford.nlp.pipeline.TokensRegexNERAnnotatorITest$TestAnnotation"); - String[][] regexes = - new String[][]{ - new String[]{"test", "TEST", "", "0"} - }; - Annotator annotatorCased = getTokensRegexNerAnnotator(props, regexes, true); - - String str = "Marking all test as test"; - Annotation document = createDocument(str); - annotatorCased.annotate(document); - List tokens = document.get(CoreAnnotations.TokensAnnotation.class); - - checkTags(tokens, CoreAnnotations.TextAnnotation.class, "Marking", "all", "test", "as", "test"); - checkTags(tokens, TestAnnotation.class, null, null, "TEST", null, "TEST"); - } - // Basic tests from RegexNERAnnotatorITest public void testBasicMatching() throws Exception { String str = "President Barack Obama lives in Chicago , Illinois , " + @@ -217,9 +154,9 @@ public void testBasicMatching() throws Exception { annotator.annotate(document); List tokens = document.get(CoreAnnotations.TokensAnnotation.class); - checkNerTags(tokens, - "TITLE", "PERSON", "PERSON", "O", "O", "LOCATION", "O", "STATE_OR_PROVINCE", - "O", "O", "O", "O", "O", "IDEOLOGY", "O"); + checkTags(tokens, + "TITLE", "PERSON", "PERSON", "O", "O", "LOCATION", "O", "STATE_OR_PROVINCE", + "O", "O", "O", "O", "O", "IDEOLOGY", "O"); } @@ -234,8 +171,8 @@ public void testOverwrite() throws Exception { annotator.annotate(document); List tokens = document.get(CoreAnnotations.TokensAnnotation.class); - checkNerTags(tokens, "O", "O", "LOCATION", "LOCATION", "O", "O", "O", "O", "O", "RELIGION", - "RELIGION", "RELIGION", "O", "O", "O"); + checkTags(tokens, "O", "O", "LOCATION", "LOCATION", "O", "O", "O", "O", "O", "RELIGION", + "RELIGION", "RELIGION", "O", "O", "O"); } @@ -248,7 +185,7 @@ public void testPriority() throws Exception { Annotation document = createDocument(str); annotator.annotate(document); List tokens = document.get(CoreAnnotations.TokensAnnotation.class); - checkNerTags(tokens, "RELIGION", "O", "O", "O", "O", "O", "O", "O", "RELIGION", "O"); + checkTags(tokens, "RELIGION", "O", "O", "O", "O", "O", "O", "O", "RELIGION", "O"); } diff --git a/liblocal/README b/liblocal/README deleted file mode 100644 index eeb7a076ff..0000000000 --- a/liblocal/README +++ /dev/null @@ -1,153 +0,0 @@ ------------------------------------------------------------------------ -antlr-runtime.jar - -ORIGINAL JAR NAME: antlr-runtime-3.1.2.jar - -VERSION: 3.1.2 - -RELEASE DATE: Feb 21, 2009 - -SOURCE AVAILABLE: Maven Central - -DESCRIPTION: ANTLR runtime, for compiled software - -URL: http://www.antlr.com - -USED BY: -The Quickcheck library - -LAST UPDATE: 2015/10/5 - -LAST UPDATE BY: Keenon Werling - ------------------------------------------------------------------------ -hamcrest-core.jar - -ORIGINAL JAR NAME: hamcrest-core-1.3.jar - -VERSION: 1.3 - -RELEASE DATE: Jul, 2010 - -SOURCE AVAILABLE: Maven Central - -DESCRIPTION: Hamcrest shennanigans, for JUnit - -URL: http://www.hamcrest.org - -USED BY: -The JUnit library - -LAST UPDATE: 2015/10/5 - -LAST UPDATE BY: Keenon Werling - ------------------------------------------------------------------------ -javaruntype.jar - -ORIGINAL JAR NAME: javaruntype-1.2.jar - -VERSION: 1.2 - -RELEASE DATE: Aug, 2010 - -SOURCE AVAILABLE: 
Maven Central - -DESCRIPTION: Something for Quickcheck - -URL: http://www.javaruntype.org - -USED BY: -The Quickcheck library - -LAST UPDATE: 2015/10/5 - -LAST UPDATE BY: Keenon Werling - ------------------------------------------------------------------------ -junit-quickcheck-core.jar - -ORIGINAL JAR NAME: junit-quickcheck-core-0.4-beta-3.jar - -VERSION: 0.4-beta-3 - -RELEASE DATE: Nov, 2013 - -SOURCE AVAILABLE: Maven Central - -DESCRIPTION: Quickcheck, runs random inputs and validates outputs - -URL: https://github.com/pholser/junit-quickcheck - -USED BY: -The Quickcheck library - -LAST UPDATE: 2015/10/5 - -LAST UPDATE BY: Keenon Werling - ------------------------------------------------------------------------ -junit-quickcheck-generators.jar - -ORIGINAL JAR NAME: junit-quickcheck-generators-0.4-beta-3.jar - -VERSION: 0.4-beta-3 - -RELEASE DATE: Nov, 2013 - -SOURCE AVAILABLE: Maven Central - -DESCRIPTION: Quickcheck, runs random inputs and validates outputs - -URL: https://github.com/pholser/junit-quickcheck - -USED BY: -The Quickcheck library - -LAST UPDATE: 2015/10/5 - -LAST UPDATE BY: Keenon Werling - ------------------------------------------------------------------------ -junit-theories.jar - -ORIGINAL JAR NAME: junit-theories-4.12.jar - -VERSION: 4.12 - -RELEASE DATE: Dec, 2014 - -SOURCE AVAILABLE: Maven Central - -DESCRIPTION: JUnit theories run JUnit against a number of inputs - -URL: junit.org - -USED BY: -The Quickcheck library - -LAST UPDATE: 2015/10/5 - -LAST UPDATE BY: Keenon Werling - ------------------------------------------------------------------------ -ognl.jar - -ORIGINAL JAR NAME: ognl-3.05.jar - -VERSION: 3.05 - -RELEASE DATE: Apr, 2012 - -SOURCE AVAILABLE: Maven Central - -DESCRIPTION: Object graph navigation library, used by Quickcheck - -URL: https://commons.apache.org/proper/commons-ognl/ - -USED BY: -The Quickcheck library - -LAST UPDATE: 2015/10/5 - -LAST UPDATE BY: Keenon Werling diff --git a/liblocal/antlr-runtime.jar b/liblocal/antlr-runtime-3.1.2.jar similarity index 100% rename from liblocal/antlr-runtime.jar rename to liblocal/antlr-runtime-3.1.2.jar diff --git a/liblocal/hamcrest-core.jar b/liblocal/hamcrest-core-1.3.jar similarity index 100% rename from liblocal/hamcrest-core.jar rename to liblocal/hamcrest-core-1.3.jar diff --git a/liblocal/javaruntype.jar b/liblocal/javaruntype-1.2.jar similarity index 100% rename from liblocal/javaruntype.jar rename to liblocal/javaruntype-1.2.jar diff --git a/liblocal/junit-quickcheck-core.jar b/liblocal/junit-quickcheck-core-0.4-beta-3.jar similarity index 100% rename from liblocal/junit-quickcheck-core.jar rename to liblocal/junit-quickcheck-core-0.4-beta-3.jar diff --git a/liblocal/junit-quickcheck-generators.jar b/liblocal/junit-quickcheck-generators-0.4-beta-3.jar similarity index 100% rename from liblocal/junit-quickcheck-generators.jar rename to liblocal/junit-quickcheck-generators-0.4-beta-3.jar diff --git a/liblocal/junit-theories.jar b/liblocal/junit-theories-4.12.jar similarity index 100% rename from liblocal/junit-theories.jar rename to liblocal/junit-theories-4.12.jar diff --git a/liblocal/ognl.jar b/liblocal/ognl-3.0.5.jar similarity index 100% rename from liblocal/ognl.jar rename to liblocal/ognl-3.0.5.jar diff --git a/src/edu/stanford/nlp/classify/LogConditionalObjectiveFunction.java b/src/edu/stanford/nlp/classify/LogConditionalObjectiveFunction.java index 84f3529eed..0debfe3572 100644 --- a/src/edu/stanford/nlp/classify/LogConditionalObjectiveFunction.java +++ 
b/src/edu/stanford/nlp/classify/LogConditionalObjectiveFunction.java @@ -72,6 +72,7 @@ public class LogConditionalObjectiveFunction extends AbstractStochasticCac /** Multithreading gradient calculations is a bit cheaper if you reuse the threads. */ protected int threads = Execution.threads; + protected ExecutorService executorService = Executors.newFixedThreadPool(threads); @Override public int domainDimension() { @@ -325,7 +326,7 @@ private void calculateCLbatch(double[] x) { CountDownLatch latch = new CountDownLatch(threads); for (int i = 0; i < threads; i++) { runnables[i] = new CLBatchDerivativeCalculation(threads, i, null, x, derivative.length, latch); - new Thread(runnables[i]).start(); + executorService.execute(runnables[i]); } try { latch.await(); @@ -683,7 +684,7 @@ public double calculateStochasticUpdate(double[] x, double xscale, int[] batch, CountDownLatch latch = new CountDownLatch(threads); for (int i = 0; i < threads; i++) { runnables[i] = new CLBatchDerivativeCalculation(threads, i, batch, x, x.length, latch); - new Thread(runnables[i]).start(); + executorService.execute(runnables[i]); } try { latch.await(); @@ -1004,7 +1005,7 @@ protected void rvfcalculate(double[] x) { CountDownLatch latch = new CountDownLatch(threads); for (int i = 0; i < threads; i++) { runnables[i] = new RVFDerivativeCalculation(threads, i, x, derivative.length, latch); - new Thread(runnables[i]).start(); + executorService.execute(runnables[i]); } try { latch.await(); diff --git a/src/edu/stanford/nlp/ie/NumberNormalizer.java b/src/edu/stanford/nlp/ie/NumberNormalizer.java index b278ff1d25..435b83dee1 100644 --- a/src/edu/stanford/nlp/ie/NumberNormalizer.java +++ b/src/edu/stanford/nlp/ie/NumberNormalizer.java @@ -13,6 +13,7 @@ import java.math.BigDecimal; import java.util.*; +import java.util.function.Function; import java.util.logging.Level; import java.util.logging.Logger; import java.util.regex.Matcher; @@ -691,7 +692,8 @@ public static List findNumberRanges(CoreMap annotation) Number v1 = matched.get(0).get(CoreAnnotations.NumericCompositeValueAnnotation.class); Number v2 = matched.get(matched.size()-1).get(CoreAnnotations.NumericCompositeValueAnnotation.class); if (v2.doubleValue() > v1.doubleValue()) { - CoreMap newChunk = CoreMapAggregator.getDefaultAggregator().merge(numerizedTokens, matcher.start(), matcher.end()); + CoreMap newChunk = ChunkAnnotationUtils.getMergedChunk(numerizedTokens, matcher.start(), matcher.end(), + CoreMapAttributeAggregator.getDefaultAggregators()); newChunk.set(CoreAnnotations.NumericCompositeTypeAnnotation.class, "NUMBER_RANGE"); Pair range = new Pair(v1,v2); newChunk.set(CoreAnnotations.NumericCompositeObjectAnnotation.class, range); diff --git a/src/edu/stanford/nlp/ie/crf/CRFLogConditionalObjectiveFunction.java b/src/edu/stanford/nlp/ie/crf/CRFLogConditionalObjectiveFunction.java index 640c76d9f2..cc2aac06e7 100644 --- a/src/edu/stanford/nlp/ie/crf/CRFLogConditionalObjectiveFunction.java +++ b/src/edu/stanford/nlp/ie/crf/CRFLogConditionalObjectiveFunction.java @@ -348,6 +348,18 @@ protected double regularGradientAndValue() { return multiThreadGradient(docIDs, false); } + private class GradientCalculator implements Runnable { + @Override + public void run() { + double[][] arr = empty2D(); + } + } + + protected double newMultithreadGradient(List docIDs, boolean calculateEmpirical) { + double objective = 0.0; + return objective; + } + protected double multiThreadGradient(List docIDs, boolean calculateEmpirical) { double objective = 0.0; // TODO: This is a bunch of 
unnecessary heap traffic, should all be on the stack diff --git a/src/edu/stanford/nlp/ie/ner/CMMClassifier.java b/src/edu/stanford/nlp/ie/ner/CMMClassifier.java index 13dad7a495..644e139b85 100644 --- a/src/edu/stanford/nlp/ie/ner/CMMClassifier.java +++ b/src/edu/stanford/nlp/ie/ner/CMMClassifier.java @@ -53,7 +53,6 @@ import edu.stanford.nlp.ie.AbstractSequenceClassifier; import edu.stanford.nlp.ie.NERFeatureFactory; import edu.stanford.nlp.io.IOUtils; -import edu.stanford.nlp.io.RuntimeIOException; import edu.stanford.nlp.ling.BasicDatum; import edu.stanford.nlp.ling.CoreLabel; import edu.stanford.nlp.ling.Datum; @@ -649,12 +648,11 @@ public Dataset getDataset(Collection> data, Index train; if (featureIndex != null && classIndex != null) { - System.err.println(" Using feature/class Index from existing Dataset..."); - System.err.println(" (This is used when getting Dataset from adaptation set. We want to make the index consistent.)"); //pichuan + System.err.println("Using feature/class Index from existing Dataset..."); + System.err.println("(This is used when getting Dataset from adaptation set. We want to make the index consistent.)"); //pichuan train = new Dataset(size, featureIndex, classIndex); } else { train = new Dataset(size); @@ -702,8 +700,7 @@ public Dataset getBiasedDataset(ObjectBank> data, Index size += doc.size(); } - System.err.print("Making Dataset ... "); - System.err.flush(); + System.err.println("Making Dataset..."); Dataset train = new Dataset(size, featureIndex, classIndex); for (List doc : data) { @@ -917,7 +914,7 @@ private void trainSemiSup(Dataset data, Dataset } LinearClassifierFactory lcf; - lcf = new LinearClassifierFactory<>(flags.tolerance, flags.useSum, prior, flags.sigma, flags.epsilon, flags.QNsize); + lcf = new LinearClassifierFactory(flags.tolerance, flags.useSum, prior, flags.sigma, flags.epsilon, flags.QNsize); if (flags.useQN) { lcf.useQuasiNewton(); } else{ @@ -1189,7 +1186,8 @@ private void makeAnswerArraysAndTagIndex(Collection> docs) { answerArrays.add(Arrays.asList(seq)); } } - for (IN wordInfo : doc) { + for (int i = 0; i < leng; i++) { + CoreLabel wordInfo = doc.get(i); classIndex.add(wordInfo.get(CoreAnnotations.AnswerAnnotation.class)); } @@ -1279,9 +1277,8 @@ private static Collection addOtherClasses(Collection feats, List private static List> getThresholds(String filename) { - BufferedReader in = null; try { - in = IOUtils.readerFromString(filename); + BufferedReader in = IOUtils.readerFromString(filename); List> thresholds = new ArrayList<>(); for (String line; (line = in.readLine()) != null; ) { int i = line.lastIndexOf(' '); @@ -1293,10 +1290,8 @@ private static List> getThresholds(String filename) { } in.close(); return thresholds; - } catch (IOException e) { - throw new RuntimeIOException("Error reading threshold file", e); - } finally { - IOUtils.closeIgnoringExceptions(in); + } catch (Exception e) { + throw new RuntimeException("Error reading threshold file", e); } } @@ -1311,8 +1306,8 @@ public void trainSemiSup() { ObjectBank> biasedData = makeObjectBankFromFile(biasedFilename, readerAndWriter); - Index featureIndex = new HashIndex<>(); - Index classIndex = new HashIndex<>(); + Index featureIndex = new HashIndex(); + Index classIndex = new HashIndex(); Dataset dataset = getDataset(data, featureIndex, classIndex); Dataset biasedDataset = getBiasedDataset(biasedData, featureIndex, classIndex); @@ -1355,27 +1350,25 @@ public void trainSemiSup() { trainSemiSup(dataset, biasedDataset, confusionMatrix); } - static class Scorer 
implements SequenceModel { + private CMMClassifier classifier = null; - private final CMMClassifier classifier; - - private final int[] tagArray; - private final int[] backgroundTags; - private final Index tagIndex; - private final List lineInfos; - private final int pre; - private final int post; - private final Set> legalTags; + private int[] tagArray = null; + private int[] backgroundTags = null; + private Index tagIndex = null; + private List lineInfos = null; + private int pre = 0; + private int post = 0; + private Set> legalTags = null; private static final boolean VERBOSE = false; - private static int[] buildTagArray(int sz) { - int[] temp = new int[sz]; + void buildTagArray() { + int sz = tagIndex.size(); + tagArray = new int[sz]; for (int i = 0; i < sz; i++) { - temp[i] = i; + tagArray[i] = i; } - return temp; } @Override @@ -1395,14 +1388,14 @@ public int rightWindow() { @Override public int[] getPossibleValues(int position) { - // if (position == 0 || position == lineInfos.size() - 1) { - // int[] a = new int[1]; - // a[0] = tagIndex.indexOf(BACKGROUND); - // return a; - // } - // if (tagArray == null) { - // buildTagArray(); - // } + // if (position == 0 || position == lineInfos.size() - 1) { + // int[] a = new int[1]; + // a[0] = tagIndex.indexOf(BACKGROUND); + // return a; + // } + if (tagArray == null) { + buildTagArray(); + } if (position < pre) { return backgroundTags; } @@ -1556,16 +1549,15 @@ static double[] recenter(double[] x) { this.classifier = classifier; this.legalTags = legalTags; backgroundTags = new int[]{tagIndex.indexOf(classifier.flags.backgroundSymbol)}; - tagArray = buildTagArray(tagIndex.size()); } - } // end static class Scorer + } // end class Scorer private boolean normalize() { return flags.normalize; } - static int lastPos = -1; // TODO: Looks like CMMClassifier still isn't threadsafe! 
+ static int lastPos = -1; public Counter scoresOf(List lineInfos, int pos) { // if (pos != lastPos) { diff --git a/src/edu/stanford/nlp/ling/CoreLabel.java b/src/edu/stanford/nlp/ling/CoreLabel.java index e4c9cbd37e..eb60c4d3db 100644 --- a/src/edu/stanford/nlp/ling/CoreLabel.java +++ b/src/edu/stanford/nlp/ling/CoreLabel.java @@ -554,7 +554,7 @@ public void setEndPosition(int endPos) { public static final String TAG_SEPARATOR = "/"; public enum OutputFormat { - VALUE_INDEX, VALUE, VALUE_TAG, VALUE_TAG_INDEX, MAP, VALUE_MAP, VALUE_INDEX_MAP, WORD, WORD_INDEX, VALUE_TAG_NER, LEMMA_INDEX, ALL + VALUE_INDEX, VALUE, VALUE_TAG, VALUE_TAG_INDEX, MAP, VALUE_MAP, VALUE_INDEX_MAP, WORD, WORD_INDEX, VALUE_TAG_NER, ALL } public static final OutputFormat DEFAULT_FORMAT = OutputFormat.VALUE_INDEX; @@ -684,13 +684,6 @@ public String toString(OutputFormat format) { } break; } - case LEMMA_INDEX: - buf.append(lemma()); - Integer index = this.get(CoreAnnotations.IndexAnnotation.class); - if (index != null) { - buf.append('-').append((index).intValue()); - } - break; case ALL:{ for(Class en: this.keySet()){ buf.append(";").append(en).append(":").append(this.get(en)); diff --git a/src/edu/stanford/nlp/ling/tokensregex/BasicSequenceMatchResult.java b/src/edu/stanford/nlp/ling/tokensregex/BasicSequenceMatchResult.java index efb303edf3..5c1057fd64 100644 --- a/src/edu/stanford/nlp/ling/tokensregex/BasicSequenceMatchResult.java +++ b/src/edu/stanford/nlp/ling/tokensregex/BasicSequenceMatchResult.java @@ -43,7 +43,7 @@ public BasicSequenceMatchResult toBasicSequenceMatchResult() { } public BasicSequenceMatchResult copy() { - BasicSequenceMatchResult res = new BasicSequenceMatchResult<>(); + BasicSequenceMatchResult res = new BasicSequenceMatchResult(); res.pattern = pattern; res.elements = elements; res.matchedGroups = new MatchedGroup[matchedGroups.length]; @@ -58,7 +58,7 @@ public BasicSequenceMatchResult copy() { } } if (matchedResults != null) { - res.matchedResults = new Object[matchedResults.length]; + res.matchedResults = new Object[res.matchedResults.length]; System.arraycopy(res.matchedResults, 0, matchedResults, 0, matchedResults.length); } return res; @@ -180,11 +180,11 @@ public List groupNodes() { public List groupNodes(int group) { if (group == GROUP_BEFORE_MATCH || group == GROUP_AFTER_MATCH) { // return a new list so the resulting object is serializable - return new ArrayList<>(elements.subList(start(group), end(group))); + return new ArrayList(elements.subList(start(group), end(group))); } if (matchedGroups[group] != null) { // return a new list so the resulting object is serializable - return new ArrayList<>(elements.subList(matchedGroups[group].matchBegin, matchedGroups[group].matchEnd)); + return new ArrayList(elements.subList(matchedGroups[group].matchBegin, matchedGroups[group].matchEnd)); } else { return null; } @@ -209,7 +209,7 @@ public Object groupValue() { public Object groupValue(int group) { if (group == GROUP_BEFORE_MATCH || group == GROUP_AFTER_MATCH) { // return a new list so the resulting object is serializable - return new ArrayList<>(elements.subList(start(group), end(group))); + return new ArrayList(elements.subList(start(group), end(group))); } if (matchedGroups[group] != null) { return matchedGroups[group].value; @@ -240,8 +240,7 @@ public MatchedGroupInfo groupInfo(int group) { Object value = groupValue(group); String text = group(group); List matchedResults = groupMatchResults(group); - String varName = group >= this.varGroupBindings.varnames.length ? 
null : this.varGroupBindings.varnames[group]; - return new MatchedGroupInfo<>(text, nodes, matchedResults, value, varName); + return new MatchedGroupInfo(text, nodes, matchedResults, value); } else { return null; } diff --git a/src/edu/stanford/nlp/ling/tokensregex/ComplexNodePattern.java b/src/edu/stanford/nlp/ling/tokensregex/ComplexNodePattern.java deleted file mode 100644 index 4293b56730..0000000000 --- a/src/edu/stanford/nlp/ling/tokensregex/ComplexNodePattern.java +++ /dev/null @@ -1,466 +0,0 @@ -package edu.stanford.nlp.ling.tokensregex; - -import edu.stanford.nlp.util.CollectionUtils; -import edu.stanford.nlp.util.Pair; -import edu.stanford.nlp.util.StringUtils; - -import java.util.*; -import java.util.function.BiFunction; -import java.util.function.Function; -import java.util.regex.Matcher; -import java.util.regex.Pattern; - -/** - * Pattern for matching a complex data structure - * - * @author Angel Chang - */ -public class ComplexNodePattern extends NodePattern { - // TODO: Change/Augment from list of class to pattern to list of conditions for matching - // (so we can have more flexible matches) - private final List> annotationPatterns; - private final BiFunction getter; - - - public ComplexNodePattern(BiFunction getter, List> annotationPatterns) { - this.annotationPatterns = annotationPatterns; - this.getter = getter; - } - - public ComplexNodePattern(BiFunction getter, Pair... annotationPatterns) { - this.annotationPatterns = Arrays.asList(annotationPatterns); - this.getter = getter; - } - - public ComplexNodePattern(BiFunction getter, K key, NodePattern pattern) { - this(getter, Pair.makePair(key,pattern)); - } - - public List> getAnnotationPatterns() { - return Collections.unmodifiableList(annotationPatterns); - } - - // TODO: make this a pattern of non special characters: [,],?,.,\,^,$,(,),*,+ ... what else? - private static final Pattern LITERAL_PATTERN = Pattern.compile("[A-Za-z0-9_\\-']*"); - public static NodePattern newStringRegexPattern(String regex, int flags) { - boolean isLiteral = ((flags & Pattern.LITERAL) != 0) || LITERAL_PATTERN.matcher(regex).matches(); - if (isLiteral) { - boolean caseInsensitive = (flags & Pattern.CASE_INSENSITIVE) != 0; - int stringMatchFlags = (caseInsensitive)? CASE_INSENSITIVE:0; - return new StringAnnotationPattern(regex, stringMatchFlags); - } else { - return new StringAnnotationRegexPattern(regex, flags); - } - } - - public static ComplexNodePattern valueOf( - Env env, Map attributes, BiFunction getter, Function,K> getKey) - { - ComplexNodePattern p = new ComplexNodePattern(getter, new ArrayList>(attributes.size())); - p.populate(env, attributes, getKey); - return p; - } - - protected void populate(Env env, Map attributes, Function,K> getKey) { - ComplexNodePattern p = this; - for (String attr:attributes.keySet()) { - String value = attributes.get(attr); - K c = getKey.apply(Pair.makePair(env, attr)); - if (c != null) { - if (value.startsWith("\"") && value.endsWith("\"")) { - value = value.substring(1, value.length() - 1); - value = value.replaceAll("\\\\\"", "\""); // Unescape quotes... - p.add(c, new StringAnnotationPattern(value, env.defaultStringMatchFlags)); - } else if (value.startsWith("/") && value.endsWith("/")) { - value = value.substring(1, value.length() - 1); - value = value.replaceAll("\\\\/", "/"); // Unescape forward slash - String regex = (env != null) ? env.expandStringRegex(value) : value; - int flags = (env != null) ? 
env.defaultStringPatternFlags : 0; - p.add(c, newStringRegexPattern(regex, flags)); - } else if (value.startsWith("::")) { - switch (value) { - case "::IS_NIL": - case "::NOT_EXISTS": - p.add(c, new NilAnnotationPattern()); - break; - case "::EXISTS": - case "::NOT_NIL": - p.add(c, new NotNilAnnotationPattern()); - break; - case "::IS_NUM": - p.add(c, new NumericAnnotationPattern(0, NumericAnnotationPattern.CmpType.IS_NUM)); - break; - default: - boolean ok = false; - if (env != null) { - Object custom = env.get(value); - if (custom != null) { - p.add(c, (NodePattern) custom); - ok = true; - } - } - if (!ok) { - throw new IllegalArgumentException("Invalid value " + value + " for key: " + attr); - } - break; - } - } else if (value.startsWith("<=")) { - Double v = Double.parseDouble(value.substring(2)); - p.add(c, new NumericAnnotationPattern(v, NumericAnnotationPattern.CmpType.LE)); - } else if (value.startsWith(">=")) { - Double v = Double.parseDouble(value.substring(2)); - p.add(c, new NumericAnnotationPattern(v, NumericAnnotationPattern.CmpType.GE)); - } else if (value.startsWith("==")) { - Double v = Double.parseDouble(value.substring(2)); - p.add(c, new NumericAnnotationPattern(v, NumericAnnotationPattern.CmpType.EQ)); - } else if (value.startsWith("!=")) { - Double v = Double.parseDouble(value.substring(2)); - p.add(c, new NumericAnnotationPattern(v, NumericAnnotationPattern.CmpType.NE)); - } else if (value.startsWith(">")) { - Double v = Double.parseDouble(value.substring(1)); - p.add(c, new NumericAnnotationPattern(v, NumericAnnotationPattern.CmpType.GT)); - } else if (value.startsWith("<")) { - Double v = Double.parseDouble(value.substring(1)); - p.add(c, new NumericAnnotationPattern(v, NumericAnnotationPattern.CmpType.LT)); - } else if (value.matches("[A-Za-z0-9_+-.]+")) { - p.add(c, new StringAnnotationPattern(value, env.defaultStringMatchFlags)); - } else { - throw new IllegalArgumentException("Invalid value " + value + " for key: " + attr); - } - } else { - throw new IllegalArgumentException("Unknown annotation key: " + attr); - } - } - } - - public void add(K c, NodePattern pattern) { - annotationPatterns.add(Pair.makePair(c, pattern)); - } - - @Override - public boolean match(M token) - { - boolean matched = true; - for (Pair entry:annotationPatterns) { - NodePattern annoPattern = entry.second; - Object anno = getter.apply(token, entry.first); - if (!annoPattern.match(anno)) { - matched = false; - break; - } - } - return matched; - } - - @Override - public Object matchWithResult(M token) { - Map matchResults = new HashMap();//Generics.newHashMap(); - if (match(token, matchResults)) { - return matchResults; - } else { - return null; - } - } - - // Does matching, returning match results - protected boolean match(M token, Map matchResults) - { - boolean matched = true; - for (Pair entry:annotationPatterns) { - NodePattern annoPattern = entry.second; - Object anno = getter.apply(token, entry.first); - Object matchResult = annoPattern.matchWithResult(anno); - if (matchResult != null) { - matchResults.put(entry.first, matchResult); - } else { - matched = false; - break; - } - } - return matched; - } - - public String toString() { - StringBuilder sb = new StringBuilder(); - for (Pair entry:annotationPatterns) { - if (sb.length() > 0) { - sb.append(", "); - } - sb.append(entry.first).append(entry.second); - } - return sb.toString(); - } - - public static class NilAnnotationPattern extends NodePattern { - public boolean match(Object obj) { - return obj == null; - } - public String 
toString() { - return "::IS_NIL"; - } - } - - public static class NotNilAnnotationPattern extends NodePattern { - public boolean match(Object obj) { - return obj != null; - } - public String toString() { - return "::NOT_NIL"; - } - } - - public static class SequenceRegexPattern extends NodePattern> { - SequencePattern pattern; - - public SequenceRegexPattern(SequencePattern pattern) { - this.pattern = pattern; - } - - public SequencePattern getPattern() { - return pattern; - } - - public SequenceMatcher matcher(List list) { - return pattern.getMatcher(list); - } - - public boolean match(List list) { - return pattern.getMatcher(list).matches(); - } - - public Object matchWithResult(List list) { - SequenceMatcher m = pattern.getMatcher(list); - if (m.matches()) { - return m.toBasicSequenceMatchResult(); - } else { - return null; - } - } - - public String toString() { - return ":" + pattern.toString(); - } - } - - public static class StringAnnotationRegexPattern extends NodePattern { - Pattern pattern; - - public StringAnnotationRegexPattern(Pattern pattern) { - this.pattern = pattern; - } - - public StringAnnotationRegexPattern(String regex, int flags) { - this.pattern = Pattern.compile(regex, flags); - } - - public Pattern getPattern() { - return pattern; - } - - public Matcher matcher(String str) { - return pattern.matcher(str); - } - - public boolean match(String str) { - if (str == null) { - return false; - } else { - return pattern.matcher(str).matches(); - } - } - - public Object matchWithResult(String str) { - if (str == null) return null; - Matcher m = pattern.matcher(str); - if (m.matches()) { - return m.toMatchResult(); - } else { - return null; - } - } - - public String toString() { - return ":/" + pattern.pattern() + "/"; - } - } - - public static abstract class AbstractStringAnnotationPattern extends NodePattern { - int flags; - - public boolean ignoreCase() { - return (flags & CASE_INSENSITIVE) != 0; - } - - public boolean normalize() { - return (flags & NORMALIZE) != 0; - } - - public String getNormalized(String str) { - if (normalize()) { - str = StringUtils.normalize(str); - } - if (ignoreCase()) { - str = str.toLowerCase(); - } - return str; - } - } - - public static class StringAnnotationPattern extends AbstractStringAnnotationPattern { - String target; - - public StringAnnotationPattern(String str, int flags) { - this.target = str; - this.flags = flags; - } - - public StringAnnotationPattern(String str) { - this.target = str; - } - - public String getString() { - return target; - } - - public boolean match(String str) { - if (normalize()) { - str = getNormalized(str); - } - if (ignoreCase()) { - return target.equalsIgnoreCase(str); - } else { - return target.equals(str); - } - } - - public String toString() { - return ":" + target; - } - } - - public static class StringInSetAnnotationPattern extends AbstractStringAnnotationPattern { - Set targets; - - public StringInSetAnnotationPattern(Set targets, int flags) { - this.flags = flags; - // if ignoreCase/normalize is true - convert targets to lowercase/normalized - this.targets = new HashSet(targets.size()); - for (String target:targets) { - this.targets.add(getNormalized(target)); - } - } - - public StringInSetAnnotationPattern(Set targets) { - this(targets, 0); - } - - public Set getTargets() { - return targets; - } - - public boolean match(String str) { - return targets.contains(getNormalized(str)); - } - - public String toString() { - return ":" + targets; - } - } - - public static class NumericAnnotationPattern extends 
NodePattern { - static enum CmpType { - IS_NUM { boolean accept(double v1, double v2) { return true; } }, - EQ { boolean accept(double v1, double v2) { return v1 == v2; } }, // TODO: equal with doubles is not so good - NE { boolean accept(double v1, double v2) { return v1 != v2; } }, // TODO: equal with doubles is not so good - GT { boolean accept(double v1, double v2) { return v1 > v2; } }, - GE { boolean accept(double v1, double v2) { return v1 >= v2; } }, - LT { boolean accept(double v1, double v2) { return v1 < v2; } }, - LE { boolean accept(double v1, double v2) { return v1 <= v2; } }; - boolean accept(double v1, double v2) { return false; } - } - CmpType cmpType; - double value; - - public NumericAnnotationPattern(double value, CmpType cmpType) { - this.value = value; - this.cmpType = cmpType; - } - - @Override - public boolean match(Object node) { - if (node instanceof String) { - return match((String) node); - } else if (node instanceof Number) { - return match((Number) node); - } else { - return false; - } - } - - public boolean match(Number number) { - if (number != null) { - return cmpType.accept(number.doubleValue(), value); - } else { - return false; - } - } - - public boolean match(String str) { - if (str != null) { - try { - double v = Double.parseDouble(str); - return cmpType.accept(v, value); - } catch (NumberFormatException ex) { - } - } - return false; - } - - public String toString() { - return " " + cmpType + " " + value; - } - } - - public static class AttributesEqualMatchChecker implements SequencePattern.NodesMatchChecker> { - Collection keys; - - public AttributesEqualMatchChecker(K... keys) { - this.keys = CollectionUtils.asSet(keys); - } - - public boolean matches(Map o1, Map o2) { - for (K key : keys) { - Object v1 = o1.get(key); - Object v2 = o2.get(key); - if (v1 != null) { - if (!v1.equals(v2)) { - return false; - } - } else { - if (v2 != null) return false; - } - } - return true; - } - } - - //For exact matching integers. Presumably faster than NumericAnnotationPattern - //TODO : add this in the valueOf function of MapNodePattern - public static class IntegerAnnotationPattern extends NodePattern{ - - int value; - public IntegerAnnotationPattern(int v){ - this.value = v; - } - - @Override - public boolean match(Integer node) { - return value == node; - } - - public int getValue() { - return value; - } - } - -} diff --git a/src/edu/stanford/nlp/ling/tokensregex/CoreMapNodePattern.java b/src/edu/stanford/nlp/ling/tokensregex/CoreMapNodePattern.java index 268cda92b0..89cdd40c40 100644 --- a/src/edu/stanford/nlp/ling/tokensregex/CoreMapNodePattern.java +++ b/src/edu/stanford/nlp/ling/tokensregex/CoreMapNodePattern.java @@ -4,7 +4,7 @@ import edu.stanford.nlp.util.*; import java.util.*; -import java.util.function.BiFunction; +import java.util.regex.Matcher; import java.util.regex.Pattern; /** @@ -12,29 +12,41 @@ * * @author Angel Chang */ -public class CoreMapNodePattern extends ComplexNodePattern { +public class CoreMapNodePattern extends NodePattern { + // TODO: Change/Augment from list of class to pattern to list of conditions for matching + // (so we can have more flexible matches) + private final List> annotationPatterns; - private static BiFunction createGetter() { - return new BiFunction() { - @Override - public Object apply(CoreMap m, Class k) { - return m.get(k); - } - }; - } public CoreMapNodePattern(List> annotationPatterns) { - super(createGetter(), annotationPatterns); + this.annotationPatterns = annotationPatterns; } public CoreMapNodePattern(Pair... 
annotationPatterns) { - super(createGetter(), annotationPatterns); + this.annotationPatterns = Arrays.asList(annotationPatterns); } public CoreMapNodePattern(Class key, NodePattern pattern) { this(Pair.makePair(key,pattern)); } + public List> getAnnotationPatterns() { + return Collections.unmodifiableList(annotationPatterns); + } + + // TODO: make this a pattern of non special characters: [,],?,.,\,^,$,(,),*,+ ... what else? + private static final Pattern LITERAL_PATTERN = Pattern.compile("[A-Za-z0-9_\\-']*"); + public static NodePattern newStringRegexPattern(String regex, int flags) { + boolean isLiteral = ((flags & Pattern.LITERAL) != 0) || LITERAL_PATTERN.matcher(regex).matches(); + if (isLiteral) { + boolean caseInsensitive = (flags & Pattern.CASE_INSENSITIVE) != 0; + int stringMatchFlags = (caseInsensitive)? CASE_INSENSITIVE:0; + return new StringAnnotationPattern(regex, stringMatchFlags); + } else { + return new StringAnnotationRegexPattern(regex, flags); + } + } + public static CoreMapNodePattern valueOf(String textAnnotationPattern) { return valueOf(null, textAnnotationPattern); } @@ -66,15 +78,370 @@ public static CoreMapNodePattern valueOf(Map attributes) { public static CoreMapNodePattern valueOf(Env env, Map attributes) { CoreMapNodePattern p = new CoreMapNodePattern(new ArrayList>(attributes.size())); - p.populate(env, attributes, envAttrPair -> EnvLookup.lookupAnnotationKeyWithClassname(envAttrPair.first, envAttrPair.second)); + for (String attr:attributes.keySet()) { + String value = attributes.get(attr); + Class c = EnvLookup.lookupAnnotationKey(env, attr); + if (c != null) { + if (value.startsWith("\"") && value.endsWith("\"")) { + value = value.substring(1, value.length()-1); + value = value.replaceAll("\\\\\"", "\""); // Unescape quotes... + p.add(c, new StringAnnotationPattern(value, env.defaultStringMatchFlags)); + } else if (value.startsWith("/") && value.endsWith("/")) { + value = value.substring(1, value.length()-1); + value = value.replaceAll("\\\\/", "/"); // Unescape forward slash + String regex = (env != null)? env.expandStringRegex(value): value; + int flags = (env != null)? 
env.defaultStringPatternFlags: 0; + p.add(c, newStringRegexPattern(regex, flags)); + } else if (value.startsWith("::")) { + switch (value) { + case "::IS_NIL": + case "::NOT_EXISTS": + p.add(c, new NilAnnotationPattern()); + break; + case "::EXISTS": + case "::NOT_NIL": + p.add(c, new NotNilAnnotationPattern()); + break; + case "::IS_NUM": + p.add(c, new NumericAnnotationPattern(0, NumericAnnotationPattern.CmpType.IS_NUM)); + break; + default: + boolean ok = false; + if (env != null) { + Object custom = env.get(value); + if (custom != null) { + p.add(c, (NodePattern) custom); + ok = true; + } + } + if (!ok) { + throw new IllegalArgumentException("Invalid value " + value + " for key: " + attr); + } + break; + } + } else if (value.startsWith("<=")) { + Double v = Double.parseDouble(value.substring(2)); + p.add(c, new NumericAnnotationPattern(v, NumericAnnotationPattern.CmpType.LE)); + } else if (value.startsWith(">=")) { + Double v = Double.parseDouble(value.substring(2)); + p.add(c, new NumericAnnotationPattern(v, NumericAnnotationPattern.CmpType.GE)); + } else if (value.startsWith("==")) { + Double v = Double.parseDouble(value.substring(2)); + p.add(c, new NumericAnnotationPattern(v, NumericAnnotationPattern.CmpType.EQ)); + } else if (value.startsWith("!=")) { + Double v = Double.parseDouble(value.substring(2)); + p.add(c, new NumericAnnotationPattern(v, NumericAnnotationPattern.CmpType.NE)); + } else if (value.startsWith(">")) { + Double v = Double.parseDouble(value.substring(1)); + p.add(c, new NumericAnnotationPattern(v, NumericAnnotationPattern.CmpType.GT)); + } else if (value.startsWith("<")) { + Double v = Double.parseDouble(value.substring(1)); + p.add(c, new NumericAnnotationPattern(v, NumericAnnotationPattern.CmpType.LT)); + } else if (value.matches("[A-Za-z0-9_+-.]+")) { + p.add(c, new StringAnnotationPattern(value, env.defaultStringMatchFlags)); + } else { + throw new IllegalArgumentException("Invalid value " + value + " for key: " + attr); + } + } else { + throw new IllegalArgumentException("Unknown annotation key: " + attr); + } + } return p; } - public static class AttributesEqualMatchChecker implements SequencePattern.NodesMatchChecker { + public void add(Class c, NodePattern pattern) { + annotationPatterns.add(Pair.makePair(c, pattern)); + } + + @Override + public boolean match(CoreMap token) + { + boolean matched = true; + for (Pair entry:annotationPatterns) { + NodePattern annoPattern = entry.second; + Object anno = token.get(entry.first); + if (!annoPattern.match(anno)) { + matched = false; + break; + } + } + return matched; + } + + @Override + public Object matchWithResult(CoreMap token) { + Map matchResults = new HashMap();//Generics.newHashMap(); + if (match(token, matchResults)) { + return matchResults; + } else { + return null; + } + } + + // Does matching, returning match results + protected boolean match(CoreMap token, Map matchResults) + { + + boolean matched = true; + for (Pair entry:annotationPatterns) { + NodePattern annoPattern = entry.second; + Object anno = token.get(entry.first); + Object matchResult = annoPattern.matchWithResult(anno); + if (matchResult != null) { + matchResults.put(entry.first, matchResult); + } else { + matched = false; + break; + } + } + return matched; + } + + public String toString() { + StringBuilder sb = new StringBuilder(); + for (Pair entry:annotationPatterns) { + if (sb.length() > 0) { + sb.append(", "); + } + sb.append(entry.first).append(entry.second); + } + return sb.toString(); + } + + public static class 
NilAnnotationPattern extends NodePattern { + public boolean match(Object obj) { + return obj == null; + } + public String toString() { + return "::IS_NIL"; + } + } + + public static class NotNilAnnotationPattern extends NodePattern { + public boolean match(Object obj) { + return obj != null; + } + public String toString() { + return "::NOT_NIL"; + } + } + + public static class SequenceRegexPattern extends NodePattern> { + SequencePattern pattern; + + public SequenceRegexPattern(SequencePattern pattern) { + this.pattern = pattern; + } + + public SequencePattern getPattern() { + return pattern; + } + + public SequenceMatcher matcher(List list) { + return pattern.getMatcher(list); + } + + public boolean match(List list) { + return pattern.getMatcher(list).matches(); + } + + public Object matchWithResult(List list) { + SequenceMatcher m = pattern.getMatcher(list); + if (m.matches()) { + return m.toBasicSequenceMatchResult(); + } else { + return null; + } + } + + public String toString() { + return ":" + pattern.toString(); + } + } + + public static class StringAnnotationRegexPattern extends NodePattern { + Pattern pattern; + + public StringAnnotationRegexPattern(Pattern pattern) { + this.pattern = pattern; + } + + public StringAnnotationRegexPattern(String regex, int flags) { + this.pattern = Pattern.compile(regex, flags); + } + + public Pattern getPattern() { + return pattern; + } + + public Matcher matcher(String str) { + return pattern.matcher(str); + } + + public boolean match(String str) { + if (str == null) { + return false; + } else { + return pattern.matcher(str).matches(); + } + } + + public Object matchWithResult(String str) { + Matcher m = pattern.matcher(str); + if (m.matches()) { + return m.toMatchResult(); + } else { + return null; + } + } + + public String toString() { + return ":/" + pattern.pattern() + "/"; + } + } + + public static abstract class AbstractStringAnnotationPattern extends NodePattern { + int flags; + + public boolean ignoreCase() { + return (flags & CASE_INSENSITIVE) != 0; + } + + public boolean normalize() { + return (flags & NORMALIZE) != 0; + } + + public String getNormalized(String str) { + if (normalize()) { + str = StringUtils.normalize(str); + } + if (ignoreCase()) { + str = str.toLowerCase(); + } + return str; + } + } + + public static class StringAnnotationPattern extends AbstractStringAnnotationPattern { + String target; + + public StringAnnotationPattern(String str, int flags) { + this.target = str; + this.flags = flags; + } + + public StringAnnotationPattern(String str) { + this.target = str; + } + + public String getString() { + return target; + } + + public boolean match(String str) { + if (normalize()) { + str = getNormalized(str); + } + if (ignoreCase()) { + return target.equalsIgnoreCase(str); + } else { + return target.equals(str); + } + } + + public String toString() { + return ":" + target; + } + } + + public static class StringInSetAnnotationPattern extends AbstractStringAnnotationPattern { + Set targets; + + public StringInSetAnnotationPattern(Set targets, int flags) { + this.flags = flags; + // if ignoreCase/normalize is true - convert targets to lowercase/normalized + this.targets = new HashSet(targets.size()); + for (String target:targets) { + this.targets.add(getNormalized(target)); + } + } + + public StringInSetAnnotationPattern(Set targets) { + this(targets, 0); + } + + public Set getTargets() { + return targets; + } + + public boolean match(String str) { + return targets.contains(getNormalized(str)); + } + + public String toString() 
{ + return ":" + targets; + } + } + + public static class NumericAnnotationPattern extends NodePattern { + static enum CmpType { + IS_NUM { boolean accept(double v1, double v2) { return true; } }, + EQ { boolean accept(double v1, double v2) { return v1 == v2; } }, // TODO: equal with doubles is not so good + NE { boolean accept(double v1, double v2) { return v1 != v2; } }, // TODO: equal with doubles is not so good + GT { boolean accept(double v1, double v2) { return v1 > v2; } }, + GE { boolean accept(double v1, double v2) { return v1 >= v2; } }, + LT { boolean accept(double v1, double v2) { return v1 < v2; } }, + LE { boolean accept(double v1, double v2) { return v1 <= v2; } }; + boolean accept(double v1, double v2) { return false; } + } + CmpType cmpType; + double value; + + public NumericAnnotationPattern(double value, CmpType cmpType) { + this.value = value; + this.cmpType = cmpType; + } + + @Override + public boolean match(Object node) { + if (node instanceof String) { + return match((String) node); + } else if (node instanceof Number) { + return match((Number) node); + } else { + return false; + } + } + + public boolean match(Number number) { + if (number != null) { + return cmpType.accept(number.doubleValue(), value); + } else { + return false; + } + } + + public boolean match(String str) { + if (str != null) { + try { + double v = Double.parseDouble(str); + return cmpType.accept(v, value); + } catch (NumberFormatException ex) { + } + } + return false; + } + + public String toString() { + return " " + cmpType + " " + value; + } + } + + public static class AttributesEqualMatchChecker implements SequencePattern.NodesMatchChecker { Collection keys; - public AttributesEqualMatchChecker(Class... keys) { - this.keys = CollectionUtils.asSet(keys); + public AttributesEqualMatchChecker(Class... classes) { + keys = CollectionUtils.asSet(classes); } public boolean matches(CoreMap o1, CoreMap o2) { @@ -94,5 +461,25 @@ public boolean matches(CoreMap o1, CoreMap o2) { } public static final AttributesEqualMatchChecker TEXT_ATTR_EQUAL_CHECKER = - new CoreMapNodePattern.AttributesEqualMatchChecker(CoreAnnotations.TextAnnotation.class); + new AttributesEqualMatchChecker(CoreAnnotations.TextAnnotation.class); + + //For exact matching integers. Presumably faster than NumericAnnotationPattern + //TODO : add this in the valueOf function of CoreMapNodePattern + public static class IntegerAnnotationPattern extends NodePattern{ + + int value; + public IntegerAnnotationPattern(int v){ + this.value = v; + } + + @Override + public boolean match(Integer node) { + return value == node; + } + + public int getValue() { + return value; + } + } + } diff --git a/src/edu/stanford/nlp/ling/tokensregex/CoreMapNodePatternTrigger.java b/src/edu/stanford/nlp/ling/tokensregex/CoreMapNodePatternTrigger.java index 53c8d2bb9f..ff6a0ff462 100644 --- a/src/edu/stanford/nlp/ling/tokensregex/CoreMapNodePatternTrigger.java +++ b/src/edu/stanford/nlp/ling/tokensregex/CoreMapNodePatternTrigger.java @@ -42,7 +42,7 @@ public CoreMapNodePatternTrigger(Collection> for (SequencePattern pattern:patterns) { // Look for first string... 
- Collection triggerCandidates = pattern.findNodePatterns(stringTriggerFilter, false, true); + Collection triggerCandidates = pattern.findNodePatterns(stringTriggerFilter); // TODO: Select most unlikely to trigger trigger from the triggerCandidates // (if we had some statistics on most frequent annotation values...., then pick least frequent) // For now, just pick the longest: going from (text or lemma) to rest diff --git a/src/edu/stanford/nlp/ling/tokensregex/CoreMapSequenceMatcher.java b/src/edu/stanford/nlp/ling/tokensregex/CoreMapSequenceMatcher.java index e6a91e643f..b3f5d65f13 100644 --- a/src/edu/stanford/nlp/ling/tokensregex/CoreMapSequenceMatcher.java +++ b/src/edu/stanford/nlp/ling/tokensregex/CoreMapSequenceMatcher.java @@ -2,7 +2,7 @@ import edu.stanford.nlp.ling.CoreAnnotations; import edu.stanford.nlp.pipeline.ChunkAnnotationUtils; -import edu.stanford.nlp.pipeline.CoreMapAggregator; +import edu.stanford.nlp.pipeline.CoreMapAttributeAggregator; import edu.stanford.nlp.util.CollectionUtils; import edu.stanford.nlp.util.CoreMap; import java.util.function.Function; @@ -90,7 +90,7 @@ private CoreMap createMergedChunk(int groupStart, int groupEnd) } */ if (merged == null) { // Okay, have to go through these one by one and merge them - merged = CoreMapAggregator.getDefaultAggregator().merge(elements, groupStart, groupEnd); + merged = ChunkAnnotationUtils.getMergedChunk(elements, groupStart, groupEnd, CoreMapAttributeAggregator.getDefaultAggregators()); } return merged; } diff --git a/src/edu/stanford/nlp/ling/tokensregex/Env.java b/src/edu/stanford/nlp/ling/tokensregex/Env.java index fd5baa460e..6f7ee6b1de 100644 --- a/src/edu/stanford/nlp/ling/tokensregex/Env.java +++ b/src/edu/stanford/nlp/ling/tokensregex/Env.java @@ -2,11 +2,8 @@ import edu.stanford.nlp.ling.tokensregex.types.Expressions; import edu.stanford.nlp.ling.tokensregex.types.Tags; -import edu.stanford.nlp.pipeline.CoreMapAggregator; import edu.stanford.nlp.pipeline.CoreMapAttributeAggregator; import java.util.function.Function; - -import edu.stanford.nlp.process.CoreLabelTokenFactory; import edu.stanford.nlp.util.Pair; import java.util.*; @@ -109,15 +106,7 @@ public class Env { */ public Map defaultTokensAggregators; - private CoreMapAggregator defaultTokensAggregator; - /** - * Whether we should merge and output corelabels or not - */ - public boolean aggregateToTokens; - - - /** * How annotations are extracted from the MatchedExpression. * If the result type is a List and more than one annotation key is specified, * then the result is paired with the annotation key. @@ -161,18 +150,6 @@ public void setDefaultTokensAggregators(Map d this.defaultTokensAggregators = defaultTokensAggregators; } - public CoreMapAggregator getDefaultTokensAggregator() { - if (defaultTokensAggregator == null && (defaultTokensAggregators != null || aggregateToTokens)) { - CoreLabelTokenFactory tokenFactory = (aggregateToTokens)? 
new CoreLabelTokenFactory():null; - Map aggregators = defaultTokensAggregators; - if (aggregators == null) { - aggregators = CoreMapAttributeAggregator.DEFAULT_NUMERIC_TOKENS_AGGREGATORS; - } - defaultTokensAggregator = CoreMapAggregator.getAggregator(aggregators, null, tokenFactory); - } - return defaultTokensAggregator; - } - public Class getDefaultTextAnnotationKey() { return defaultTextAnnotationKey; } diff --git a/src/edu/stanford/nlp/ling/tokensregex/EnvLookup.java b/src/edu/stanford/nlp/ling/tokensregex/EnvLookup.java index 8d14cc07e3..e13f865cb7 100644 --- a/src/edu/stanford/nlp/ling/tokensregex/EnvLookup.java +++ b/src/edu/stanford/nlp/ling/tokensregex/EnvLookup.java @@ -3,7 +3,6 @@ import edu.stanford.nlp.ling.AnnotationLookup; import edu.stanford.nlp.ling.CoreAnnotations; import edu.stanford.nlp.ling.tokensregex.types.Value; -import edu.stanford.nlp.pipeline.CoreMapAggregator; import edu.stanford.nlp.pipeline.CoreMapAttributeAggregator; import java.util.function.Function; @@ -68,17 +67,6 @@ public static Map getDefaultTokensAggregators return CoreMapAttributeAggregator.DEFAULT_NUMERIC_TOKENS_AGGREGATORS; } - public static CoreMapAggregator getDefaultTokensAggregator(Env env) - { - if (env != null) { - CoreMapAggregator obj = env.getDefaultTokensAggregator(); - if (obj != null) { - return obj; - } - } - return CoreMapAggregator.DEFAULT_NUMERIC_TOKENS_AGGREGATOR; - } - public static List getDefaultTokensResultAnnotationKey(Env env) { if (env != null) { diff --git a/src/edu/stanford/nlp/ling/tokensregex/MapNodePattern.java b/src/edu/stanford/nlp/ling/tokensregex/MapNodePattern.java deleted file mode 100644 index 07777c1a94..0000000000 --- a/src/edu/stanford/nlp/ling/tokensregex/MapNodePattern.java +++ /dev/null @@ -1,37 +0,0 @@ -package edu.stanford.nlp.ling.tokensregex; - -import edu.stanford.nlp.util.Pair; - -import java.util.List; -import java.util.Map; -import java.util.function.BiFunction; - -/** - * Pattern for matching a Map from keys K to objects - * - * @author Angel Chang - */ -public class MapNodePattern, K> extends ComplexNodePattern { - - private static , K> BiFunction createGetter() { - return new BiFunction() { - @Override - public Object apply(M m, K k) { - return m.get(k); - } - }; - } - - public MapNodePattern(List> annotationPatterns) { - super(createGetter(), annotationPatterns); - } - - public MapNodePattern(Pair... annotationPatterns) { - super(createGetter(), annotationPatterns); - } - - public MapNodePattern(K key, NodePattern pattern) { - super(createGetter(), key, pattern); - } - -} diff --git a/src/edu/stanford/nlp/ling/tokensregex/MatchedExpression.java b/src/edu/stanford/nlp/ling/tokensregex/MatchedExpression.java index 0dd140c785..ec3e15dd7e 100644 --- a/src/edu/stanford/nlp/ling/tokensregex/MatchedExpression.java +++ b/src/edu/stanford/nlp/ling/tokensregex/MatchedExpression.java @@ -3,7 +3,7 @@ import edu.stanford.nlp.ling.CoreAnnotations; import edu.stanford.nlp.ling.tokensregex.types.Value; import edu.stanford.nlp.pipeline.ChunkAnnotationUtils; -import edu.stanford.nlp.pipeline.CoreMapAggregator; +import edu.stanford.nlp.pipeline.CoreMapAttributeAggregator; import edu.stanford.nlp.util.Comparators; import edu.stanford.nlp.util.CoreMap; import java.util.function.Function; @@ -39,7 +39,6 @@ public class MatchedExpression { // TODO: Should we keep some context from the source so we can perform more complex evaluation? 
/** Function indicating how to extract an value from annotation built from this expression */ - protected Object context; // Some context to help to extract value from annotation protected SingleAnnotationExtractor extractFunc; public Value value; @@ -65,9 +64,8 @@ public static class SingleAnnotationExtractor implements Function public Class resultNestedAnnotationField; // Annotation field for child/nested annotations public boolean includeNested = false; public Function valueExtractor; - public Function expressionToValue; public Function resultAnnotationExtractor; - public CoreMapAggregator tokensAggregator; + public Map tokensAggregators; @Override public Value apply(CoreMap in) { @@ -111,11 +109,7 @@ public void annotate(MatchedExpression matchedExpression, List charOffsets, Interval tokenO public boolean extractAnnotation(Env env, CoreMap sourceAnnotation) { - return extractAnnotation(sourceAnnotation, extractFunc.tokensAggregator); + return extractAnnotation(sourceAnnotation, extractFunc.tokensAggregators); } private boolean extractAnnotation(CoreMap sourceAnnotation, - CoreMapAggregator aggregator) + Map aggregators) { Class tokensAnnotationKey = extractFunc.tokensAnnotationField; if (chunkOffsets != null) { - annotation = aggregator.merge((List) sourceAnnotation.get(tokensAnnotationKey), - chunkOffsets.getBegin(), chunkOffsets.getEnd()); + annotation = ChunkAnnotationUtils.getMergedChunk((List) sourceAnnotation.get(tokensAnnotationKey), + chunkOffsets.getBegin(), chunkOffsets.getEnd(), aggregators ); if (sourceAnnotation.containsKey(CoreAnnotations.TextAnnotation.class)) { ChunkAnnotationUtils.annotateChunkText(annotation, sourceAnnotation); } @@ -215,8 +209,8 @@ private boolean extractAnnotation(CoreMap sourceAnnotation, chunkOffsets = ChunkAnnotationUtils.getChunkOffsetsUsingCharOffsets((List) sourceAnnotation.get(tokensAnnotationKey), charOffsets.getBegin() + baseCharOffset, charOffsets.getEnd() + baseCharOffset); - CoreMap annotation2 = aggregator.merge((List) sourceAnnotation.get(tokensAnnotationKey), - chunkOffsets.getBegin(), chunkOffsets.getEnd()); + CoreMap annotation2 = ChunkAnnotationUtils.getMergedChunk((List) sourceAnnotation.get(tokensAnnotationKey), + chunkOffsets.getBegin(), chunkOffsets.getEnd(), aggregators ); annotation = ChunkAnnotationUtils.getAnnotatedChunkUsingCharOffsets(sourceAnnotation, charOffsets.getBegin(), charOffsets.getEnd()); tokenOffsets = Interval.toInterval(annotation.get(CoreAnnotations.TokenBeginAnnotation.class), @@ -230,12 +224,12 @@ private boolean extractAnnotation(CoreMap sourceAnnotation, public boolean extractAnnotation(Env env, List source) { - return extractAnnotation(source, CoreMapAggregator.getDefaultAggregator()); + return extractAnnotation(source, CoreMapAttributeAggregator.getDefaultAggregators()); } - protected boolean extractAnnotation(List source, CoreMapAggregator aggregator) + protected boolean extractAnnotation(List source, Map chunkAggregators) { - annotation = aggregator.merge(source, chunkOffsets.getBegin(), chunkOffsets.getEnd()); + annotation = ChunkAnnotationUtils.getMergedChunk(source, chunkOffsets.getBegin(), chunkOffsets.getEnd(), chunkAggregators); charOffsets = Interval.toInterval(annotation.get(CoreAnnotations.CharacterOffsetBeginAnnotation.class), annotation.get(CoreAnnotations.CharacterOffsetEndAnnotation.class), Interval.INTERVAL_OPEN_END); tokenOffsets = Interval.toInterval(annotation.get(CoreAnnotations.TokenBeginAnnotation.class), diff --git 
a/src/edu/stanford/nlp/ling/tokensregex/MultiCoreMapNodePattern.java b/src/edu/stanford/nlp/ling/tokensregex/MultiCoreMapNodePattern.java index 325060d869..734602a64e 100644 --- a/src/edu/stanford/nlp/ling/tokensregex/MultiCoreMapNodePattern.java +++ b/src/edu/stanford/nlp/ling/tokensregex/MultiCoreMapNodePattern.java @@ -43,7 +43,7 @@ protected Collection> match(List nodes, int maxEnd = maxNodes + start; } for (int end = minEnd; end <= maxEnd; end++) { - CoreMap chunk = ChunkAnnotationUtils.getMergedChunk(nodes, start, end, aggregators, null); + CoreMap chunk = ChunkAnnotationUtils.getMergedChunk(nodes, start, end, aggregators); if (nodePattern.match(chunk)) { matched.add(Interval.toInterval(start, end)); } diff --git a/src/edu/stanford/nlp/ling/tokensregex/MultiNodePattern.java b/src/edu/stanford/nlp/ling/tokensregex/MultiNodePattern.java index 8880032b06..71595d3a02 100644 --- a/src/edu/stanford/nlp/ling/tokensregex/MultiNodePattern.java +++ b/src/edu/stanford/nlp/ling/tokensregex/MultiNodePattern.java @@ -13,7 +13,7 @@ public abstract class MultiNodePattern { int minNodes = 1; int maxNodes = -1; // Set the max number of nodes this pattern can match - boolean greedyMatch = true; + /** * Tries to match sequence of nodes starting of start @@ -39,14 +39,6 @@ public void setMaxNodes(int maxNodes) { this.maxNodes = maxNodes; } - public boolean isGreedyMatch() { - return greedyMatch; - } - - public void setGreedyMatch(boolean greedyMatch) { - this.greedyMatch = greedyMatch; - } - protected static class IntersectMultiNodePattern extends MultiNodePattern { List> nodePatterns; diff --git a/src/edu/stanford/nlp/ling/tokensregex/PhraseTable.java b/src/edu/stanford/nlp/ling/tokensregex/PhraseTable.java index 7141d89c00..4f56f16fa4 100644 --- a/src/edu/stanford/nlp/ling/tokensregex/PhraseTable.java +++ b/src/edu/stanford/nlp/ling/tokensregex/PhraseTable.java @@ -972,7 +972,7 @@ public String toString() } public final static Comparator PHRASEMATCH_LENGTH_ENDPOINTS_COMPARATOR = - Comparators.chain(HasInterval.LENGTH_GT_COMPARATOR, HasInterval.ENDPOINTS_COMPARATOR); + Comparators.chain(HasInterval.LENGTH_COMPARATOR, HasInterval.ENDPOINTS_COMPARATOR); /** * Represents a matched phrase diff --git a/src/edu/stanford/nlp/ling/tokensregex/SequenceMatchResult.java b/src/edu/stanford/nlp/ling/tokensregex/SequenceMatchResult.java index fef5fa0875..9d6189753d 100644 --- a/src/edu/stanford/nlp/ling/tokensregex/SequenceMatchResult.java +++ b/src/edu/stanford/nlp/ling/tokensregex/SequenceMatchResult.java @@ -354,18 +354,16 @@ public Interval apply(MR in) { * @param */ public final static class MatchedGroupInfo { - public final String text; - public final List nodes; - public final List matchResults; - public final Object value; - public final String varName; + public String text; + public List nodes; + public List matchResults; + public Object value; - public MatchedGroupInfo(String text, List nodes, List matchResults, Object value, String varName) { + public MatchedGroupInfo(String text, List nodes, List matchResults, Object value) { this.text = text; this.nodes = nodes; this.matchResults = matchResults; this.value = value; - this.varName = varName; } } } diff --git a/src/edu/stanford/nlp/ling/tokensregex/SequenceMatchRules.java b/src/edu/stanford/nlp/ling/tokensregex/SequenceMatchRules.java index 05866443f0..dbb840c2db 100644 --- a/src/edu/stanford/nlp/ling/tokensregex/SequenceMatchRules.java +++ b/src/edu/stanford/nlp/ling/tokensregex/SequenceMatchRules.java @@ -240,7 +240,7 @@ public void update(Env env, Map 
attributes) { if (annoKey instanceof Class) { annotationField = (Class) annoKey; } else if (annoKey instanceof String) { - annotationField = EnvLookup.lookupAnnotationKeyWithClassname(env, (String) annoKey); + annotationField = EnvLookup.lookupAnnotationKey(env, (String) annoKey); } else if (annotationField == null) { annotationField = CoreMap.class; } else { @@ -411,18 +411,19 @@ public AnnotationExtractRule create(Env env, Map attributes) { } public static MatchedExpression.SingleAnnotationExtractor createAnnotationExtractor(Env env, AnnotationExtractRule r) { - MatchedExpression.SingleAnnotationExtractor extractor = new MatchedExpression.SingleAnnotationExtractor(); - extractor.name = r.name; - extractor.tokensAnnotationField = r.tokensAnnotationField; - extractor.tokensResultAnnotationField = r.tokensResultAnnotationField; - extractor.resultAnnotationField = r.resultAnnotationField; - extractor.resultNestedAnnotationField = r.resultNestedAnnotationField; - extractor.priority = r.priority; - extractor.weight = r.weight; - extractor.includeNested = r.includeNested; - extractor.resultAnnotationExtractor = EnvLookup.getDefaultResultAnnotationExtractor(env); - extractor.tokensAggregator = EnvLookup.getDefaultTokensAggregator(env); - return extractor; + MatchedExpression.SingleAnnotationExtractor valueExtractor = + new MatchedExpression.SingleAnnotationExtractor(); + valueExtractor.name = r.name; + valueExtractor.tokensAnnotationField = r.tokensAnnotationField; + valueExtractor.tokensResultAnnotationField = r.tokensResultAnnotationField; + valueExtractor.resultAnnotationField = r.resultAnnotationField; + valueExtractor.resultNestedAnnotationField = r.resultNestedAnnotationField; + valueExtractor.priority = r.priority; + valueExtractor.weight = r.weight; + valueExtractor.includeNested = r.includeNested; + valueExtractor.resultAnnotationExtractor = EnvLookup.getDefaultResultAnnotationExtractor(env); + valueExtractor.tokensAggregators = EnvLookup.getDefaultTokensAggregators(env); + return valueExtractor; } public static class CompositeExtractRuleCreator extends AnnotationExtractRuleCreator { @@ -442,21 +443,16 @@ protected void updateExtractRule(AnnotationExtractRule r, Expression action, Expression result) { - MatchedExpression.SingleAnnotationExtractor annotationExtractor = createAnnotationExtractor(env, r); - SequenceMatchResultExtractor valueExtractor = new SequenceMatchResultExtractor<>(env, action, result); - SequencePatternExtractRule valueExtractRule = new SequencePatternExtractRule<>(pattern, valueExtractor, r.matchFindType, r.matchWithResults); - SequenceMatchedExpressionExtractor exprExtractor = new SequenceMatchedExpressionExtractor( annotationExtractor, r.matchedExpressionGroup ); - SequencePatternExtractRule exprExtractRule = - new SequencePatternExtractRule<>(pattern, exprExtractor, r.matchFindType, r.matchWithResults); - - annotationExtractor.expressionToValue = matched -> { - if (matched != null && matched.context != null && matched.context instanceof SequenceMatchResult ) { - return valueExtractor.apply( (SequenceMatchResult) matched.context); - } else return null; - }; - annotationExtractor.valueExtractor = new CoreMapFunctionApplier<>(env, r.annotationField, valueExtractRule); - r.extractRule = exprExtractRule; - r.filterRule = new AnnotationMatchedFilter(annotationExtractor); + MatchedExpression.SingleAnnotationExtractor valueExtractor = createAnnotationExtractor(env, r); + valueExtractor.valueExtractor = + new CoreMapFunctionApplier< List, Value>( + env, r.annotationField, 
+ new SequencePatternExtractRule( + pattern, + new SequenceMatchResultExtractor(env, action, result), r.matchFindType, r.matchWithResults)); + r.extractRule = new SequencePatternExtractRule(pattern, + new SequenceMatchedExpressionExtractor( valueExtractor, r.matchedExpressionGroup), r.matchFindType, r.matchWithResults); + r.filterRule = new AnnotationMatchedFilter(valueExtractor); r.pattern = pattern; r.result = result; pattern.weight = r.weight; @@ -507,26 +503,30 @@ protected void updateExtractRule(AnnotationExtractRule r, Expression action, Expression result) { - MatchedExpression.SingleAnnotationExtractor annotationExtractor = createAnnotationExtractor(env, r); - SequenceMatchResultExtractor valueExtractor = new SequenceMatchResultExtractor<>(env, action, result); - SequencePatternExtractRule valueExtractRule = new SequencePatternExtractRule<>(pattern, valueExtractor, r.matchFindType, r.matchWithResults); - SequenceMatchedExpressionExtractor exprExtractor = new SequenceMatchedExpressionExtractor( annotationExtractor, r.matchedExpressionGroup ); - SequencePatternExtractRule exprExtractRule = - new SequencePatternExtractRule<>(pattern, exprExtractor, r.matchFindType, r.matchWithResults); - - annotationExtractor.expressionToValue = matched -> { - if (matched != null && matched.context != null && matched.context instanceof SequenceMatchResult ) { - return valueExtractor.apply( (SequenceMatchResult) matched.context); - } else return null; - }; + MatchedExpression.SingleAnnotationExtractor valueExtractor = createAnnotationExtractor(env, r); if (r.annotationField != null && r.annotationField != CoreMap.class) { - annotationExtractor.valueExtractor = new CoreMapFunctionApplier<>(env, r.annotationField, valueExtractRule); - r.extractRule = new CoreMapExtractRule<>(env, r.annotationField, exprExtractRule); + valueExtractor.valueExtractor = + new CoreMapFunctionApplier< List, Value >( + env, r.annotationField, + new SequencePatternExtractRule( + pattern, + new SequenceMatchResultExtractor(env, action, result), r.matchFindType, r.matchWithResults)); + r.extractRule = new CoreMapExtractRule< List, MatchedExpression >( + env, r.annotationField, + new SequencePatternExtractRule(pattern, + new SequenceMatchedExpressionExtractor( valueExtractor, r.matchedExpressionGroup), r.matchFindType, r.matchWithResults)); } else { - annotationExtractor.valueExtractor = new CoreMapToListFunctionApplier<>(env, valueExtractRule); - r.extractRule = new CoreMapToListExtractRule<>(exprExtractRule); + valueExtractor.valueExtractor = + new CoreMapToListFunctionApplier< Value >( + env, new SequencePatternExtractRule( + pattern, + new SequenceMatchResultExtractor(env, action, result), r.matchFindType, r.matchWithResults)); + r.extractRule = new CoreMapToListExtractRule< MatchedExpression >( + new SequencePatternExtractRule(pattern, + new SequenceMatchedExpressionExtractor( valueExtractor, r.matchedExpressionGroup), r.matchFindType, r.matchWithResults)); + } - r.filterRule = new AnnotationMatchedFilter(annotationExtractor); + r.filterRule = new AnnotationMatchedFilter(valueExtractor); r.pattern = pattern; r.result = result; pattern.weight = r.weight; @@ -563,26 +563,30 @@ protected void updateExtractRule(AnnotationExtractRule r, Expression action, Expression result) { - MatchedExpression.SingleAnnotationExtractor annotationExtractor = createAnnotationExtractor(env, r); - SequenceMatchResultExtractor valueExtractor = new SequenceMatchResultExtractor<>(env, action, result); - MultiSequencePatternExtractRule valueExtractRule = 
new MultiSequencePatternExtractRule<>(pattern, valueExtractor); - SequenceMatchedExpressionExtractor exprExtractor = new SequenceMatchedExpressionExtractor( annotationExtractor, r.matchedExpressionGroup ); - MultiSequencePatternExtractRule exprExtractRule = - new MultiSequencePatternExtractRule<>(pattern, exprExtractor); - - annotationExtractor.expressionToValue = matched -> { - if (matched != null && matched.context != null && matched.context instanceof SequenceMatchResult ) { - return valueExtractor.apply( (SequenceMatchResult) matched.context); - } else return null; - }; + MatchedExpression.SingleAnnotationExtractor valueExtractor = createAnnotationExtractor(env, r); if (r.annotationField != null && r.annotationField != CoreMap.class) { - annotationExtractor.valueExtractor = new CoreMapFunctionApplier<>(env, r.annotationField, valueExtractRule); - r.extractRule = new CoreMapExtractRule<>(env, r.annotationField, exprExtractRule); + valueExtractor.valueExtractor = + new CoreMapFunctionApplier< List, Value >( + env, r.annotationField, + new MultiSequencePatternExtractRule( + pattern, + new SequenceMatchResultExtractor(env, action, result))); + r.extractRule = new CoreMapExtractRule< List, MatchedExpression >( + env, r.annotationField, + new MultiSequencePatternExtractRule(pattern, + new SequenceMatchedExpressionExtractor( valueExtractor, r.matchedExpressionGroup))); } else { - annotationExtractor.valueExtractor = new CoreMapToListFunctionApplier<>(env, valueExtractRule); - r.extractRule = new CoreMapToListExtractRule<>(exprExtractRule); + valueExtractor.valueExtractor = + new CoreMapToListFunctionApplier< Value >( + env, new MultiSequencePatternExtractRule( + pattern, + new SequenceMatchResultExtractor(env, action, result))); + r.extractRule = new CoreMapToListExtractRule< MatchedExpression >( + new MultiSequencePatternExtractRule(pattern, + new SequenceMatchedExpressionExtractor( valueExtractor, r.matchedExpressionGroup))); + } - r.filterRule = new AnnotationMatchedFilter(annotationExtractor); + r.filterRule = new AnnotationMatchedFilter(valueExtractor); r.pattern = pattern; r.result = result; } @@ -628,16 +632,19 @@ protected void updateExtractRule(AnnotationExtractRule r, Expression action, Expression result) { - final MatchedExpression.SingleAnnotationExtractor annotationExtractor = createAnnotationExtractor(env, r); + final MatchedExpression.SingleAnnotationExtractor valueExtractor = createAnnotationExtractor(env, r); Pattern pattern = env.getStringPattern(expr); - StringMatchResultExtractor valueExtractor = new StringMatchResultExtractor(env, action, result); - StringPatternExtractRule valueExtractRule = new StringPatternExtractRule<>(pattern, valueExtractor); - StringMatchedExpressionExtractor exprExtractor = new StringMatchedExpressionExtractor( annotationExtractor, r.matchedExpressionGroup ); - StringPatternExtractRule exprExtractRule = new StringPatternExtractRule<>(pattern, exprExtractor); - - annotationExtractor.valueExtractor = new CoreMapFunctionApplier<>(env, r.annotationField, valueExtractRule); - r.extractRule = new CoreMapExtractRule<>(env, r.annotationField, exprExtractRule); - r.filterRule = new AnnotationMatchedFilter(annotationExtractor); + valueExtractor.valueExtractor = + new CoreMapFunctionApplier< String, Value >( + env, r.annotationField, + new StringPatternExtractRule( + pattern, + new StringMatchResultExtractor(env, action, result))); + r.extractRule = new CoreMapExtractRule< String, MatchedExpression >( + env, r.annotationField, + new 
StringPatternExtractRule(pattern, + new StringMatchedExpressionExtractor( valueExtractor, r.matchedExpressionGroup))); + r.filterRule = new AnnotationMatchedFilter(valueExtractor); r.pattern = pattern; r.result = result; } @@ -1071,10 +1078,6 @@ public MatchedExpression apply(SequenceMatchResult matched) { if (Double.isNaN(te.weight)) { te.weight = matched.score(); } - if (this.group != 0) { - // Save context so value evaluation can happen - te.context = matched.toBasicSequenceMatchResult(); - } return te; } } diff --git a/src/edu/stanford/nlp/ling/tokensregex/SequencePattern.java b/src/edu/stanford/nlp/ling/tokensregex/SequencePattern.java index 59ea36e5d4..d8e79845f2 100644 --- a/src/edu/stanford/nlp/ling/tokensregex/SequencePattern.java +++ b/src/edu/stanford/nlp/ling/tokensregex/SequencePattern.java @@ -94,8 +94,8 @@ public class SequencePattern implements Serializable { // 1. Validate backref capture groupid // 2. Actions // 3. Inconsistent templating with T - // 4. Update TokensSequenceParser to handle backref of other attributes (\9{attr1,attr2,...}) - // 5. Improve nested capture groups (in matchresult) for other node types such as conjunctions/disjunctions + // 4. Match sequence begin/end (update TokensSequenceParser to map ^ => SEQ_BEGIN_PATTERN_EXPR, and $ to SEQ_END_PATTERN_EXPR) + // 5. Update TokensSequenceParser to handle backref of other attributes (\9{attr1,attr2,...}) private String patternStr; private PatternExpr patternExpr; private SequenceMatchAction action; @@ -227,7 +227,7 @@ public OUT findNodePattern(Function, OUT> filter) { return null; } - public Collection findNodePatterns(Function, OUT> filter, boolean allowOptional, boolean allowBranching) { + public Collection findNodePatterns(Function, OUT> filter) { List outList = new ArrayList(); Queue todo = new LinkedList(); Set seen = new HashSet(); @@ -235,7 +235,7 @@ public Collection findNodePatterns(Function, OUT> filt seen.add(root); while (!todo.isEmpty()) { State state = todo.poll(); - if ((allowOptional || !state.isOptional) && (state instanceof NodePatternState)) { + if (state instanceof NodePatternState) { NodePattern pattern = ((NodePatternState) state).pattern; OUT res = filter.apply(pattern); if (res != null) { @@ -243,14 +243,8 @@ public Collection findNodePatterns(Function, OUT> filt } } if (state.next != null) { - boolean addNext = allowBranching || state.next.size() == 1; - if (addNext) { - for (State s : state.next) { - if (!seen.contains(s)) { - seen.add(s); - todo.add(s); - } - } + for (State s: state.next) { + if (!seen.contains(s)) { seen.add(s); todo.add(s); } } } } @@ -797,9 +791,6 @@ protected Frag build() f.add(curOut); } } - if (minMatch == 0) { - f.start.markOptional(true); - } return f; } else { // More general but more expensive matching (when branching, need to keep state explicitly) @@ -881,7 +872,6 @@ protected Frag build() // Add child NFA out (unlinked) states to out (unlinked) states of this fragment frag.add(f.out); } - frag.start.markOptional(true); return frag; } @@ -1191,7 +1181,6 @@ static class State { */ Set next; boolean hasSavedValue; - boolean isOptional; // is this state optional protected State() {} @@ -1284,27 +1273,6 @@ public Object value(int bid, SequenceMatcher.MatchedStates matchedStates) } return null; } - - public void markOptional(boolean propagate) { - this.isOptional = true; - if (propagate && next != null) { - Stack todo = new Stack(); - Set seen = new HashSet(); - todo.addAll(next); - while (!todo.empty()) { - State s = todo.pop(); - s.isOptional = true; - 
seen.add(s); - if (next != null) { - for (State n : next) { - if (!seen.contains(n)) { - todo.push(n); - } - } - } - } - } - } } /** @@ -1404,15 +1372,6 @@ protected boolean match(int bid, SequenceMatcher.MatchedStates matchedSta List nodes = matchedStates.elements(); // TODO: Fix type checking Collection> matched = pattern.match(nodes, cur); - // Order matches - if (pattern.isGreedyMatch()) { - // Sort from long to short - matched = CollectionUtils.sorted(matched, Interval.LENGTH_GT_COMPARATOR); - } else { - // Sort from short to long - matched = CollectionUtils.sorted(matched, Interval.LENGTH_LT_COMPARATOR); - } - // TODO: Check intervals are valid? Start at cur and ends after? if (matched != null && matched.size() > 0) { int nBranches = matched.size(); @@ -1474,7 +1433,6 @@ public RepeatState(State start, int minMatch, int maxMatch, boolean greedyMatch) if (maxMatch >= 0 && minMatch > maxMatch) { throw new IllegalArgumentException("Invalid minMatch=" + minMatch + ", maxMatch=" + maxMatch); } - this.isOptional = this.minMatch <= 0; } @Override diff --git a/src/edu/stanford/nlp/ling/tokensregex/parser/TokenSequenceParser.java b/src/edu/stanford/nlp/ling/tokensregex/parser/TokenSequenceParser.java index 5b081977c9..94834ef3c8 100644 --- a/src/edu/stanford/nlp/ling/tokensregex/parser/TokenSequenceParser.java +++ b/src/edu/stanford/nlp/ling/tokensregex/parser/TokenSequenceParser.java @@ -859,34 +859,8 @@ final public String StringRegex(Env env) throws ParseException { final public SequencePattern.PatternExpr SeqRegex(Env env) throws ParseException { SequencePattern.PatternExpr expr; - boolean hasStart = false; - boolean hasEnd = false; - switch ((jj_ntk==-1)?jj_ntk():jj_ntk) { - case 40: - jj_consume_token(40); - hasStart = true; - break; - default: - jj_la1[24] = jj_gen; - ; - } expr = SeqRegexDisjConj(env); - switch ((jj_ntk==-1)?jj_ntk():jj_ntk) { - case 41: - jj_consume_token(41); - hasEnd = true; - break; - default: - jj_la1[25] = jj_gen; - ; - } - if (hasStart) { - expr = new SequencePattern.SequencePatternExpr(SequencePattern.SEQ_BEGIN_PATTERN_EXPR, expr); - } - if (hasEnd) { - expr = new SequencePattern.SequencePatternExpr(expr, SequencePattern.SEQ_END_PATTERN_EXPR); - } - {if (true) return expr;} + {if (true) return expr;} throw new Error("Missing return statement in function"); } @@ -911,7 +885,7 @@ final public Object StringNumberValue(Env env) throws ParseException { {if (true) return Double.valueOf(tok.image);} break; default: - jj_la1[26] = jj_gen; + jj_la1[24] = jj_gen; jj_consume_token(-1); throw new ParseException(); } @@ -942,7 +916,7 @@ final public SequencePattern.PatternExpr SeqRegexBasic(Env env) throws ParseExce node = CoreMapWordPattern(env); expr = new SequencePattern.NodePatternExpr(node); break; - case 52: + case 50: multiNode = MultiNodePattern(env); expr = new SequencePattern.MultiNodePatternExpr(multiNode); break; @@ -956,19 +930,19 @@ final public SequencePattern.PatternExpr SeqRegexBasic(Env env) throws ParseExce expr = SeqBackRef(env); break; default: - jj_la1[27] = jj_gen; + jj_la1[25] = jj_gen; jj_consume_token(-1); throw new ParseException(); } switch ((jj_ntk==-1)?jj_ntk():jj_ntk) { case 22: + case 40: + case 41: case 42: - case 43: - case 44: expr = SeqRegexRepeatTimes(env, expr); break; default: - jj_la1[28] = jj_gen; + jj_la1[26] = jj_gen; ; } children.add(expr); @@ -985,11 +959,11 @@ final public SequencePattern.PatternExpr SeqRegexBasic(Env env) throws ParseExce case STRSIMPLE: case 25: case 31: - case 52: + case 50: ; break; default: - jj_la1[29] = 
jj_gen; + jj_la1[27] = jj_gen; break label_12; } } @@ -999,7 +973,7 @@ final public SequencePattern.PatternExpr SeqRegexBasic(Env env) throws ParseExce value = Expression(env); break; default: - jj_la1[30] = jj_gen; + jj_la1[28] = jj_gen; ; } if (children.size() != 1) { @@ -1019,20 +993,20 @@ final public SequencePattern.PatternExpr SeqRegexRepeatTimes(Env env, SequencePa int max = -1; boolean greedy = true; switch ((jj_ntk==-1)?jj_ntk():jj_ntk) { - case 42: - value = jj_consume_token(42); + case 40: + value = jj_consume_token(40); min = 0; max = -1; break; - case 43: - value = jj_consume_token(43); + case 41: + value = jj_consume_token(41); min = 0; max = 1; break; - case 44: - value = jj_consume_token(44); + case 42: + value = jj_consume_token(42); min = 1; max = -1; break; default: - jj_la1[31] = jj_gen; + jj_la1[29] = jj_gen; if (jj_2_27(3)) { jj_consume_token(22); value = jj_consume_token(NONNEGINT); @@ -1055,19 +1029,19 @@ final public SequencePattern.PatternExpr SeqRegexRepeatTimes(Env env, SequencePa min = Integer.parseInt(value.image); max = Integer.parseInt(v2.image); break; default: - jj_la1[32] = jj_gen; + jj_la1[30] = jj_gen; jj_consume_token(-1); throw new ParseException(); } } } switch ((jj_ntk==-1)?jj_ntk():jj_ntk) { - case 43: - jj_consume_token(43); + case 41: + jj_consume_token(41); greedy = false; break; default: - jj_la1[33] = jj_gen; + jj_la1[31] = jj_gen; ; } {if (true) return new SequencePattern.RepeatPatternExpr(expr, min, max, greedy);} @@ -1083,22 +1057,22 @@ final public SequencePattern.PatternExpr SeqRegexDisj(Env env) throws ParseExcep while (true) { switch ((jj_ntk==-1)?jj_ntk():jj_ntk) { case 37: - case 45: + case 43: ; break; default: - jj_la1[34] = jj_gen; + jj_la1[32] = jj_gen; break label_13; } switch ((jj_ntk==-1)?jj_ntk():jj_ntk) { - case 45: - jj_consume_token(45); + case 43: + jj_consume_token(43); break; case 37: jj_consume_token(37); break; default: - jj_la1[35] = jj_gen; + jj_la1[33] = jj_gen; jj_consume_token(-1); throw new ParseException(); } @@ -1122,20 +1096,20 @@ final public SequencePattern.PatternExpr SeqRegexDisjConj(Env env) throws ParseE switch ((jj_ntk==-1)?jj_ntk():jj_ntk) { case 36: case 37: - case 45: - case 46: + case 43: + case 44: ; break; default: - jj_la1[36] = jj_gen; + jj_la1[34] = jj_gen; break label_14; } switch ((jj_ntk==-1)?jj_ntk():jj_ntk) { - case 46: - op = jj_consume_token(46); + case 44: + op = jj_consume_token(44); break; - case 45: - op = jj_consume_token(45); + case 43: + op = jj_consume_token(43); break; case 36: op = jj_consume_token(36); @@ -1144,7 +1118,7 @@ final public SequencePattern.PatternExpr SeqRegexDisjConj(Env env) throws ParseE op = jj_consume_token(37); break; default: - jj_la1[37] = jj_gen; + jj_la1[35] = jj_gen; jj_consume_token(-1); throw new ParseException(); } @@ -1180,26 +1154,26 @@ final public SequencePattern.PatternExpr SeqRegexGroup(Env env) throws ParseExce Token var; jj_consume_token(25); switch ((jj_ntk==-1)?jj_ntk():jj_ntk) { - case 43: - case 47: + case 41: + case 45: switch ((jj_ntk==-1)?jj_ntk():jj_ntk) { - case 47: - jj_consume_token(47); + case 45: + jj_consume_token(45); capturing = false; break; - case 43: - jj_consume_token(43); + case 41: + jj_consume_token(41); var = jj_consume_token(REGEXVAR); varname = var.image; break; default: - jj_la1[38] = jj_gen; + jj_la1[36] = jj_gen; jj_consume_token(-1); throw new ParseException(); } break; default: - jj_la1[39] = jj_gen; + jj_la1[37] = jj_gen; ; } expr = SeqRegex(env); @@ -1226,7 +1200,7 @@ final public NodePattern BracketedNode(Env 
env) throws ParseException { jj_consume_token(32); break; default: - jj_la1[40] = jj_gen; + jj_la1[38] = jj_gen; jj_consume_token(-1); throw new ParseException(); } @@ -1267,12 +1241,12 @@ final public NodePattern Node(Env env) throws ParseException { case STR: case 22: case 25: - case 48: - case 49: + case 46: + case 47: node = NodeGroup(env); break; default: - jj_la1[41] = jj_gen; + jj_la1[39] = jj_gen; jj_consume_token(-1); throw new ParseException(); } @@ -1289,22 +1263,22 @@ final public NodePattern NodeDisj(Env env) throws ParseException { while (true) { switch ((jj_ntk==-1)?jj_ntk():jj_ntk) { case 37: - case 45: + case 43: ; break; default: - jj_la1[42] = jj_gen; + jj_la1[40] = jj_gen; break label_15; } switch ((jj_ntk==-1)?jj_ntk():jj_ntk) { - case 45: - jj_consume_token(45); + case 43: + jj_consume_token(43); break; case 37: jj_consume_token(37); break; default: - jj_la1[43] = jj_gen; + jj_la1[41] = jj_gen; jj_consume_token(-1); throw new ParseException(); } @@ -1327,22 +1301,22 @@ final public NodePattern NodeConj(Env env) throws ParseException { while (true) { switch ((jj_ntk==-1)?jj_ntk():jj_ntk) { case 36: - case 46: + case 44: ; break; default: - jj_la1[44] = jj_gen; + jj_la1[42] = jj_gen; break label_16; } switch ((jj_ntk==-1)?jj_ntk():jj_ntk) { - case 46: - jj_consume_token(46); + case 44: + jj_consume_token(44); break; case 36: jj_consume_token(36); break; default: - jj_la1[45] = jj_gen; + jj_la1[43] = jj_gen; jj_consume_token(-1); throw new ParseException(); } @@ -1368,20 +1342,20 @@ final public NodePattern NodeDisjConj(Env env) throws ParseException { switch ((jj_ntk==-1)?jj_ntk():jj_ntk) { case 36: case 37: - case 45: - case 46: + case 43: + case 44: ; break; default: - jj_la1[46] = jj_gen; + jj_la1[44] = jj_gen; break label_17; } switch ((jj_ntk==-1)?jj_ntk():jj_ntk) { - case 46: - op = jj_consume_token(46); + case 44: + op = jj_consume_token(44); break; - case 45: - op = jj_consume_token(45); + case 43: + op = jj_consume_token(43); break; case 36: op = jj_consume_token(36); @@ -1390,7 +1364,7 @@ final public NodePattern NodeDisjConj(Env env) throws ParseException { op = jj_consume_token(37); break; default: - jj_la1[47] = jj_gen; + jj_la1[45] = jj_gen; jj_consume_token(-1); throw new ParseException(); } @@ -1428,7 +1402,7 @@ final public NodePattern NodeGroup(Env env) throws ParseException { node = NodeDisjConj(env); jj_consume_token(26); } else if (jj_2_32(2)) { - jj_consume_token(48); + jj_consume_token(46); jj_consume_token(25); node = NodeDisjConj(env); jj_consume_token(26); @@ -1444,8 +1418,8 @@ final public NodePattern NodeGroup(Env env) throws ParseException { final public NodePattern NodeBasic(Env env) throws ParseException { NodePattern child; switch ((jj_ntk==-1)?jj_ntk():jj_ntk) { - case 48: - jj_consume_token(48); + case 46: + jj_consume_token(46); child = CoreMapNode(env); {if (true) return new NodePattern.NegateNodePattern(child);} break; @@ -1454,12 +1428,12 @@ final public NodePattern NodeBasic(Env env) throws ParseException { case REGEX: case STR: case 22: - case 49: + case 47: child = CoreMapNode(env); {if (true) return child;} break; default: - jj_la1[48] = jj_gen; + jj_la1[46] = jj_gen; jj_consume_token(-1); throw new ParseException(); } @@ -1487,7 +1461,7 @@ final public NodePattern CoreMapNode(Env env) throws ParseException { ; break; default: - jj_la1[49] = jj_gen; + jj_la1[47] = jj_gen; break label_18; } switch ((jj_ntk==-1)?jj_ntk():jj_ntk) { @@ -1498,7 +1472,7 @@ final public NodePattern CoreMapNode(Env env) throws ParseException { 
jj_consume_token(30); break; default: - jj_la1[50] = jj_gen; + jj_la1[48] = jj_gen; jj_consume_token(-1); throw new ParseException(); } @@ -1507,7 +1481,7 @@ final public NodePattern CoreMapNode(Env env) throws ParseException { jj_consume_token(24); break; default: - jj_la1[52] = jj_gen; + jj_la1[50] = jj_gen; if (jj_2_33(2)) { AttrValue(env, attributes); } else { @@ -1524,14 +1498,14 @@ final public NodePattern CoreMapNode(Env env) throws ParseException { value = jj_consume_token(REGEX); break; default: - jj_la1[51] = jj_gen; + jj_la1[49] = jj_gen; jj_consume_token(-1); throw new ParseException(); } attributes.put("word", value.image); break; default: - jj_la1[53] = jj_gen; + jj_la1[51] = jj_gen; jj_consume_token(-1); throw new ParseException(); } @@ -1544,14 +1518,14 @@ final public NodePattern CoreMapNode(Env env) throws ParseException { pat = CoreMapVarNodePattern(env); {if (true) return pat;} break; - case 49: - jj_consume_token(49); + case 47: + jj_consume_token(47); pat = CoreMapExprNodePattern(env); - jj_consume_token(50); + jj_consume_token(48); {if (true) return pat;} break; default: - jj_la1[54] = jj_gen; + jj_la1[52] = jj_gen; jj_consume_token(-1); throw new ParseException(); } @@ -1581,13 +1555,13 @@ final public Map AttrValue(Env env, Map attributes str = CoreMapVarValue(env); break; default: - jj_la1[55] = jj_gen; + jj_la1[53] = jj_gen; jj_consume_token(-1); throw new ParseException(); } break; - case 51: - tok = jj_consume_token(51); + case 49: + tok = jj_consume_token(49); value = jj_consume_token(IDENTIFIER); break; case NUMCMP: @@ -1602,13 +1576,13 @@ final public Map AttrValue(Env env, Map attributes str = CoreMapVarValue(env); break; default: - jj_la1[56] = jj_gen; + jj_la1[54] = jj_gen; jj_consume_token(-1); throw new ParseException(); } break; default: - jj_la1[57] = jj_gen; + jj_la1[55] = jj_gen; jj_consume_token(-1); throw new ParseException(); } @@ -1654,7 +1628,7 @@ final public NodePattern CoreMapWordPattern(Env env) throws ParseException { value = jj_consume_token(STRSIMPLE); break; default: - jj_la1[58] = jj_gen; + jj_la1[56] = jj_gen; jj_consume_token(-1); throw new ParseException(); } @@ -1669,8 +1643,7 @@ final public MultiNodePattern MultiNodePattern(Env env) throws ParseException { MultiNodePattern mp; Token v1, v2; int min = 1, max = -1; - boolean greedy = true; - jj_consume_token(52); + jj_consume_token(50); switch ((jj_ntk==-1)?jj_ntk():jj_ntk) { case 22: if (jj_2_34(3)) { @@ -1697,30 +1670,20 @@ final public MultiNodePattern MultiNodePattern(Env env) throws ParseException { max = Integer.parseInt(v2.image); break; default: - jj_la1[59] = jj_gen; + jj_la1[57] = jj_gen; jj_consume_token(-1); throw new ParseException(); } } break; default: - jj_la1[60] = jj_gen; - ; - } - switch ((jj_ntk==-1)?jj_ntk():jj_ntk) { - case 43: - jj_consume_token(43); - greedy = false; - break; - default: - jj_la1[61] = jj_gen; + jj_la1[58] = jj_gen; ; } pat = CoreMapWordPattern(env); mp = new MultiCoreMapNodePattern(pat); mp.setMinNodes(min); mp.setMaxNodes(max); - mp.setGreedyMatch(greedy); {if (true) return mp;} throw new Error("Missing return statement in function"); } @@ -1751,11 +1714,11 @@ final public Pair> Seq SequenceMatchAction action = null; expr = SeqRegex(env); switch ((jj_ntk==-1)?jj_ntk():jj_ntk) { - case 53: + case 51: action = Action(env); break; default: - jj_la1[62] = jj_gen; + jj_la1[59] = jj_gen; ; } {if (true) return new Pair>(expr,action);} @@ -1764,7 +1727,7 @@ final public Pair> Seq final public SequenceMatchAction Action(Env env) throws 
ParseException { SequenceMatchAction action; - jj_consume_token(53); + jj_consume_token(51); action = AnnotateAction(env); {if (true) return action;} throw new Error("Missing return statement in function"); @@ -1772,7 +1735,7 @@ final public SequenceMatchAction Action(Env env) throws ParseException final public SequenceMatchAction AnnotateAction(Env env) throws ParseException { Map attributes; - jj_consume_token(54); + jj_consume_token(52); jj_consume_token(25); // group = "," attributes = SetAttrValues(env); @@ -1793,7 +1756,7 @@ final public Map SetAttrValues(Env env) throws ParseException { ; break; default: - jj_la1[63] = jj_gen; + jj_la1[60] = jj_gen; break label_19; } switch ((jj_ntk==-1)?jj_ntk():jj_ntk) { @@ -1804,7 +1767,7 @@ final public Map SetAttrValues(Env env) throws ParseException { jj_consume_token(30); break; default: - jj_la1[64] = jj_gen; + jj_la1[61] = jj_gen; jj_consume_token(-1); throw new ParseException(); } @@ -1832,7 +1795,7 @@ final public Map SetAttrValue(Env env, Map attribu value = NumberToken(); break; default: - jj_la1[65] = jj_gen; + jj_la1[62] = jj_gen; jj_consume_token(-1); throw new ParseException(); } @@ -1860,7 +1823,7 @@ final public Token NumberToken() throws ParseException { value = jj_consume_token(REAL); break; default: - jj_la1[66] = jj_gen; + jj_la1[63] = jj_gen; jj_consume_token(-1); throw new ParseException(); } @@ -1878,7 +1841,7 @@ final public Token IntegerToken() throws ParseException { value = jj_consume_token(INT); break; default: - jj_la1[67] = jj_gen; + jj_la1[64] = jj_gen; jj_consume_token(-1); throw new ParseException(); } @@ -1896,7 +1859,7 @@ final public Token CmpToken() throws ParseException { value = jj_consume_token(NUMCMP); break; default: - jj_la1[68] = jj_gen; + jj_la1[65] = jj_gen; jj_consume_token(-1); throw new ParseException(); } @@ -1914,7 +1877,7 @@ final public Token RelaxedStringToken() throws ParseException { value = jj_consume_token(IDENTIFIER); break; default: - jj_la1[69] = jj_gen; + jj_la1[66] = jj_gen; jj_consume_token(-1); throw new ParseException(); } @@ -1934,7 +1897,7 @@ final public String RelaxedString() throws ParseException { {if (true) return value.image;} break; default: - jj_la1[70] = jj_gen; + jj_la1[67] = jj_gen; jj_consume_token(-1); throw new ParseException(); } @@ -2209,7 +2172,7 @@ private boolean jj_3_14() { return false; } - private boolean jj_3R_73() { + private boolean jj_3R_72() { Token xsp; xsp = jj_scanpos; if (jj_3_14()) { @@ -2222,88 +2185,81 @@ private boolean jj_3R_73() { return false; } - private boolean jj_3R_51() { - if (jj_scan_token(IDENTIFIER)) return true; + private boolean jj_3R_114() { + if (jj_scan_token(31)) return true; + if (jj_3R_36()) return true; + if (jj_scan_token(32)) return true; return false; } - private boolean jj_3R_129() { - if (jj_scan_token(43)) return true; - if (jj_scan_token(REGEXVAR)) return true; + private boolean jj_3_29() { + if (jj_scan_token(31)) return true; + if (jj_scan_token(32)) return true; return false; } - private boolean jj_3R_50() { - if (jj_scan_token(STR)) return true; + private boolean jj_3R_60() { + if (jj_3R_71()) return true; + Token xsp; + while (true) { + xsp = jj_scanpos; + if (jj_3R_72()) { jj_scanpos = xsp; break; } + } return false; } - private boolean jj_3R_32() { + private boolean jj_3R_107() { Token xsp; xsp = jj_scanpos; - if (jj_3R_50()) { + if (jj_3_29()) { jj_scanpos = xsp; - if (jj_3R_51()) return true; + if (jj_3R_114()) return true; } return false; } - private boolean jj_3R_60() { - if (jj_3R_72()) return true; - Token 
xsp; - while (true) { - xsp = jj_scanpos; - if (jj_3R_73()) { jj_scanpos = xsp; break; } - } + private boolean jj_3R_89() { + if (jj_3R_31()) return true; return false; } - private boolean jj_3R_128() { - if (jj_scan_token(47)) return true; + private boolean jj_3R_126() { + if (jj_scan_token(41)) return true; + if (jj_scan_token(REGEXVAR)) return true; return false; } - private boolean jj_3R_121() { - Token xsp; - xsp = jj_scanpos; - if (jj_3R_128()) { - jj_scanpos = xsp; - if (jj_3R_129()) return true; - } + private boolean jj_3R_51() { + if (jj_scan_token(IDENTIFIER)) return true; return false; } - private boolean jj_3R_112() { - if (jj_scan_token(25)) return true; - Token xsp; - xsp = jj_scanpos; - if (jj_3R_121()) jj_scanpos = xsp; - if (jj_3R_59()) return true; - if (jj_scan_token(26)) return true; + private boolean jj_3R_125() { + if (jj_scan_token(45)) return true; return false; } - private boolean jj_3R_108() { + private boolean jj_3R_118() { Token xsp; xsp = jj_scanpos; - if (jj_scan_token(18)) { + if (jj_3R_125()) { jj_scanpos = xsp; - if (jj_scan_token(7)) return true; + if (jj_3R_126()) return true; } return false; } - private boolean jj_3R_90() { - if (jj_3R_31()) return true; + private boolean jj_3R_50() { + if (jj_scan_token(STR)) return true; return false; } - private boolean jj_3R_52() { + private boolean jj_3R_32() { Token xsp; xsp = jj_scanpos; - if (jj_scan_token(20)) { + if (jj_3R_50()) { jj_scanpos = xsp; - if (jj_scan_token(19)) return true; + if (jj_3R_51()) return true; } return false; } @@ -2313,49 +2269,46 @@ private boolean jj_3R_33() { if (jj_scan_token(25)) return true; Token xsp; xsp = jj_scanpos; - if (jj_3R_90()) jj_scanpos = xsp; + if (jj_3R_89()) jj_scanpos = xsp; if (jj_scan_token(26)) return true; return false; } - private boolean jj_3R_49() { - Token xsp; - xsp = jj_scanpos; - if (jj_scan_token(13)) { - jj_scanpos = xsp; - if (jj_scan_token(14)) return true; - } - return false; - } - private boolean jj_3R_66() { if (jj_scan_token(REGEXMRGROUP)) return true; return false; } - private boolean jj_3R_65() { - if (jj_scan_token(REGEXMRVAR)) return true; + private boolean jj_3R_110() { + if (jj_scan_token(25)) return true; + Token xsp; + xsp = jj_scanpos; + if (jj_3R_118()) jj_scanpos = xsp; + if (jj_3R_59()) return true; + if (jj_scan_token(26)) return true; return false; } - private boolean jj_3R_64() { - if (jj_scan_token(REGEXGROUP)) return true; + private boolean jj_3R_65() { + if (jj_scan_token(REGEXMRVAR)) return true; return false; } - private boolean jj_3R_135() { + private boolean jj_3R_106() { Token xsp; xsp = jj_scanpos; - if (jj_scan_token(13)) { - jj_scanpos = xsp; - if (jj_scan_token(14)) { + if (jj_scan_token(18)) { jj_scanpos = xsp; - if (jj_scan_token(16)) return true; - } + if (jj_scan_token(7)) return true; } return false; } + private boolean jj_3R_64() { + if (jj_scan_token(REGEXGROUP)) return true; + return false; + } + private boolean jj_3R_63() { if (jj_scan_token(REGEXVAR)) return true; return false; @@ -2385,76 +2338,99 @@ private boolean jj_3R_62() { return false; } - private boolean jj_3R_107() { + private boolean jj_3R_52() { Token xsp; xsp = jj_scanpos; - if (jj_scan_token(46)) { - jj_scanpos = xsp; - if (jj_scan_token(45)) { - jj_scanpos = xsp; - if (jj_scan_token(36)) { + if (jj_scan_token(20)) { jj_scanpos = xsp; - if (jj_scan_token(37)) return true; - } - } + if (jj_scan_token(19)) return true; } - if (jj_3R_87()) return true; return false; } - private boolean jj_3R_89() { + private boolean jj_3R_88() { if 
(jj_scan_token(REGEXVAR)) return true; return false; } - private boolean jj_3R_72() { + private boolean jj_3R_71() { Token xsp; xsp = jj_scanpos; - if (jj_3R_88()) { + if (jj_3R_87()) { jj_scanpos = xsp; - if (jj_3R_89()) return true; + if (jj_3R_88()) return true; } return false; } - private boolean jj_3R_88() { + private boolean jj_3R_87() { if (jj_scan_token(IDENTIFIER)) return true; return false; } - private boolean jj_3R_71() { - if (jj_3R_87()) return true; + private boolean jj_3R_49() { Token xsp; - while (true) { - xsp = jj_scanpos; - if (jj_3R_107()) { jj_scanpos = xsp; break; } + xsp = jj_scanpos; + if (jj_scan_token(13)) { + jj_scanpos = xsp; + if (jj_scan_token(14)) return true; } return false; } - private boolean jj_3R_80() { + private boolean jj_3R_132() { + Token xsp; + xsp = jj_scanpos; + if (jj_scan_token(13)) { + jj_scanpos = xsp; + if (jj_scan_token(14)) { + jj_scanpos = xsp; + if (jj_scan_token(16)) return true; + } + } + return false; + } + + private boolean jj_3R_79() { if (jj_scan_token(25)) return true; if (jj_3R_59()) return true; if (jj_scan_token(26)) return true; return false; } - private boolean jj_3R_79() { + private boolean jj_3R_78() { if (jj_scan_token(REAL)) return true; return false; } - private boolean jj_3R_78() { - if (jj_scan_token(LONGINT)) return true; + private boolean jj_3R_105() { + Token xsp; + xsp = jj_scanpos; + if (jj_scan_token(44)) { + jj_scanpos = xsp; + if (jj_scan_token(43)) { + jj_scanpos = xsp; + if (jj_scan_token(36)) { + jj_scanpos = xsp; + if (jj_scan_token(37)) return true; + } + } + } + if (jj_3R_86()) return true; return false; } private boolean jj_3R_77() { - if (jj_3R_49()) return true; + if (jj_scan_token(LONGINT)) return true; return false; } private boolean jj_3R_76() { + if (jj_3R_49()) return true; + return false; + } + + private boolean jj_3R_75() { if (jj_scan_token(STR)) return true; return false; } @@ -2462,6 +2438,8 @@ private boolean jj_3R_76() { private boolean jj_3R_67() { Token xsp; xsp = jj_scanpos; + if (jj_3R_74()) { + jj_scanpos = xsp; if (jj_3R_75()) { jj_scanpos = xsp; if (jj_3R_76()) { @@ -2470,9 +2448,7 @@ private boolean jj_3R_67() { jj_scanpos = xsp; if (jj_3R_78()) { jj_scanpos = xsp; - if (jj_3R_79()) { - jj_scanpos = xsp; - if (jj_3R_80()) return true; + if (jj_3R_79()) return true; } } } @@ -2481,24 +2457,22 @@ private boolean jj_3R_67() { return false; } - private boolean jj_3R_75() { + private boolean jj_3R_74() { if (jj_scan_token(REGEX)) return true; return false; } - private boolean jj_3R_126() { - if (jj_scan_token(43)) return true; - return false; - } - - private boolean jj_3R_125() { - if (jj_scan_token(22)) return true; - if (jj_scan_token(NONNEGINT)) return true; - if (jj_scan_token(33)) return true; + private boolean jj_3R_70() { + if (jj_3R_86()) return true; + Token xsp; + while (true) { + xsp = jj_scanpos; + if (jj_3R_105()) { jj_scanpos = xsp; break; } + } return false; } - private boolean jj_3R_81() { + private boolean jj_3R_80() { Token xsp; xsp = jj_scanpos; if (jj_scan_token(33)) { @@ -2515,6 +2489,18 @@ private boolean jj_3R_39() { return false; } + private boolean jj_3R_123() { + if (jj_scan_token(41)) return true; + return false; + } + + private boolean jj_3R_122() { + if (jj_scan_token(22)) return true; + if (jj_scan_token(NONNEGINT)) return true; + if (jj_scan_token(33)) return true; + return false; + } + private boolean jj_3_28() { if (jj_scan_token(22)) return true; if (jj_scan_token(NONNEGINT)) return true; @@ -2523,8 +2509,8 @@ private boolean jj_3_28() { return false; } - 
private boolean jj_3R_124() { - if (jj_scan_token(44)) return true; + private boolean jj_3R_121() { + if (jj_scan_token(42)) return true; return false; } @@ -2535,115 +2521,121 @@ private boolean jj_3_27() { return false; } - private boolean jj_3R_123() { - if (jj_scan_token(43)) return true; + private boolean jj_3R_120() { + if (jj_scan_token(41)) return true; return false; } - private boolean jj_3R_122() { - if (jj_scan_token(42)) return true; + private boolean jj_3R_22() { + if (jj_scan_token(22)) return true; + if (jj_3R_39()) return true; + Token xsp; + while (true) { + xsp = jj_scanpos; + if (jj_3R_80()) { jj_scanpos = xsp; break; } + } + if (jj_scan_token(24)) return true; return false; } - private boolean jj_3R_118() { + private boolean jj_3R_119() { + if (jj_scan_token(40)) return true; + return false; + } + + private boolean jj_3R_116() { Token xsp; xsp = jj_scanpos; - if (jj_3R_122()) { + if (jj_3R_119()) { jj_scanpos = xsp; - if (jj_3R_123()) { + if (jj_3R_120()) { jj_scanpos = xsp; - if (jj_3R_124()) { + if (jj_3R_121()) { jj_scanpos = xsp; if (jj_3_27()) { jj_scanpos = xsp; if (jj_3_28()) { jj_scanpos = xsp; - if (jj_3R_125()) return true; + if (jj_3R_122()) return true; } } } } } xsp = jj_scanpos; - if (jj_3R_126()) jj_scanpos = xsp; - return false; - } - - private boolean jj_3R_22() { - if (jj_scan_token(22)) return true; - if (jj_3R_39()) return true; - Token xsp; - while (true) { - xsp = jj_scanpos; - if (jj_3R_81()) { jj_scanpos = xsp; break; } - } - if (jj_scan_token(24)) return true; + if (jj_3R_123()) jj_scanpos = xsp; return false; } - private boolean jj_3R_96() { - if (jj_3R_68()) return true; + private boolean jj_3R_45() { + if (jj_3R_22()) return true; return false; } - private boolean jj_3R_115() { - if (jj_scan_token(23)) return true; - if (jj_3R_31()) return true; + private boolean jj_3R_44() { + if (jj_3R_67()) return true; return false; } - private boolean jj_3R_95() { - if (jj_scan_token(REGEXVAR)) return true; + private boolean jj_3R_26() { + Token xsp; + xsp = jj_scanpos; + if (jj_3R_44()) { + jj_scanpos = xsp; + if (jj_3R_45()) return true; + } return false; } - private boolean jj_3R_117() { - if (jj_3R_118()) return true; + private boolean jj_3R_113() { + if (jj_scan_token(23)) return true; + if (jj_3R_31()) return true; return false; } - private boolean jj_3R_134() { - if (jj_scan_token(REGEXVAR)) return true; + private boolean jj_3R_95() { + if (jj_3R_68()) return true; return false; } - private boolean jj_3R_45() { - if (jj_3R_22()) return true; + private boolean jj_3R_73() { + if (jj_scan_token(33)) return true; + if (jj_3R_31()) return true; return false; } - private boolean jj_3R_131() { - if (jj_3R_134()) return true; + private boolean jj_3R_61() { + if (jj_3R_31()) return true; + Token xsp; + while (true) { + xsp = jj_scanpos; + if (jj_3R_73()) { jj_scanpos = xsp; break; } + } return false; } - private boolean jj_3R_106() { - if (jj_3R_114()) return true; + private boolean jj_3R_94() { + if (jj_scan_token(REGEXVAR)) return true; return false; } - private boolean jj_3R_44() { - if (jj_3R_67()) return true; + private boolean jj_3R_115() { + if (jj_3R_116()) return true; return false; } - private boolean jj_3R_105() { - if (jj_3R_113()) return true; + private boolean jj_3R_128() { + if (jj_3R_131()) return true; return false; } - private boolean jj_3R_26() { - Token xsp; - xsp = jj_scanpos; - if (jj_3R_44()) { - jj_scanpos = xsp; - if (jj_3R_45()) return true; - } + private boolean jj_3R_104() { + if (jj_3R_112()) return true; return false; } - 
private boolean jj_3R_104() { - if (jj_3R_112()) return true; + private boolean jj_3R_131() { + if (jj_scan_token(REGEXVAR)) return true; return false; } @@ -2652,9 +2644,13 @@ private boolean jj_3R_103() { return false; } - private boolean jj_3R_74() { - if (jj_scan_token(33)) return true; - if (jj_3R_31()) return true; + private boolean jj_3R_40() { + if (jj_scan_token(IDENTIFIER)) return true; + if (jj_scan_token(25)) return true; + Token xsp; + xsp = jj_scanpos; + if (jj_3R_61()) jj_scanpos = xsp; + if (jj_scan_token(26)) return true; return false; } @@ -2668,147 +2664,101 @@ private boolean jj_3R_101() { return false; } - private boolean jj_3R_61() { - if (jj_3R_31()) return true; - Token xsp; - while (true) { - xsp = jj_scanpos; - if (jj_3R_74()) { jj_scanpos = xsp; break; } - } + private boolean jj_3R_100() { + if (jj_3R_108()) return true; return false; } - private boolean jj_3R_97() { + private boolean jj_3R_99() { + if (jj_3R_107()) return true; + return false; + } + + private boolean jj_3R_96() { Token xsp; xsp = jj_scanpos; + if (jj_3R_99()) { + jj_scanpos = xsp; + if (jj_3R_100()) { + jj_scanpos = xsp; if (jj_3R_101()) { jj_scanpos = xsp; if (jj_3R_102()) { jj_scanpos = xsp; if (jj_3R_103()) { jj_scanpos = xsp; - if (jj_3R_104()) { - jj_scanpos = xsp; - if (jj_3R_105()) { - jj_scanpos = xsp; - if (jj_3R_106()) return true; + if (jj_3R_104()) return true; } } } } } xsp = jj_scanpos; - if (jj_3R_117()) jj_scanpos = xsp; + if (jj_3R_115()) jj_scanpos = xsp; return false; } - private boolean jj_3R_120() { - if (jj_scan_token(43)) return true; + private boolean jj_3R_30() { + if (jj_scan_token(31)) return true; + if (jj_3R_49()) return true; + if (jj_scan_token(32)) return true; return false; } - private boolean jj_3R_87() { + private boolean jj_3R_86() { Token xsp; - if (jj_3R_97()) return true; + if (jj_3R_96()) return true; while (true) { xsp = jj_scanpos; - if (jj_3R_97()) { jj_scanpos = xsp; break; } + if (jj_3R_96()) { jj_scanpos = xsp; break; } } xsp = jj_scanpos; - if (jj_3R_115()) jj_scanpos = xsp; + if (jj_3R_113()) jj_scanpos = xsp; return false; } - private boolean jj_3R_40() { - if (jj_scan_token(IDENTIFIER)) return true; - if (jj_scan_token(25)) return true; - Token xsp; - xsp = jj_scanpos; - if (jj_3R_61()) jj_scanpos = xsp; - if (jj_scan_token(26)) return true; - return false; - } - - private boolean jj_3R_127() { - if (jj_scan_token(22)) return true; - if (jj_scan_token(NONNEGINT)) return true; - if (jj_scan_token(33)) return true; + private boolean jj_3_13() { + if (jj_3R_29()) return true; return false; } - private boolean jj_3_35() { + private boolean jj_3R_124() { if (jj_scan_token(22)) return true; if (jj_scan_token(NONNEGINT)) return true; if (jj_scan_token(33)) return true; - if (jj_scan_token(24)) return true; return false; } - private boolean jj_3R_30() { - if (jj_scan_token(31)) return true; - if (jj_3R_49()) return true; - if (jj_scan_token(32)) return true; + private boolean jj_3_12() { + if (jj_3R_28()) return true; return false; } - private boolean jj_3R_133() { - if (jj_3R_134()) return true; + private boolean jj_3_11() { + if (jj_3R_27()) return true; return false; } - private boolean jj_3R_119() { - Token xsp; - xsp = jj_scanpos; - if (jj_3_34()) { - jj_scanpos = xsp; - if (jj_3_35()) { - jj_scanpos = xsp; - if (jj_3R_127()) return true; - } - } + private boolean jj_3_10() { + if (jj_3R_26()) return true; return false; } - private boolean jj_3_34() { + private boolean jj_3_35() { if (jj_scan_token(22)) return true; if (jj_scan_token(NONNEGINT)) 
return true; + if (jj_scan_token(33)) return true; if (jj_scan_token(24)) return true; return false; } - private boolean jj_3R_111() { - if (jj_scan_token(52)) return true; - Token xsp; - xsp = jj_scanpos; - if (jj_3R_119()) jj_scanpos = xsp; - xsp = jj_scanpos; - if (jj_3R_120()) jj_scanpos = xsp; - if (jj_3R_110()) return true; - return false; - } - - private boolean jj_3_13() { - if (jj_3R_29()) return true; - return false; - } - - private boolean jj_3_12() { - if (jj_3R_28()) return true; - return false; - } - - private boolean jj_3_11() { - if (jj_3R_27()) return true; - return false; - } - - private boolean jj_3_10() { - if (jj_3R_26()) return true; + private boolean jj_3_9() { + if (jj_3R_25()) return true; return false; } - private boolean jj_3_9() { - if (jj_3R_25()) return true; + private boolean jj_3R_130() { + if (jj_3R_131()) return true; return false; } @@ -2817,36 +2767,23 @@ private boolean jj_3_8() { return false; } - private boolean jj_3R_110() { + private boolean jj_3R_117() { Token xsp; xsp = jj_scanpos; - if (jj_scan_token(18)) { - jj_scanpos = xsp; - if (jj_scan_token(17)) { - jj_scanpos = xsp; - if (jj_scan_token(7)) { - jj_scanpos = xsp; - if (jj_scan_token(13)) { - jj_scanpos = xsp; - if (jj_scan_token(14)) { - jj_scanpos = xsp; - if (jj_scan_token(15)) { + if (jj_3_34()) { jj_scanpos = xsp; - if (jj_scan_token(16)) { + if (jj_3_35()) { jj_scanpos = xsp; - if (jj_scan_token(21)) return true; - } - } - } - } - } + if (jj_3R_124()) return true; } } return false; } - private boolean jj_3R_98() { - if (jj_scan_token(41)) return true; + private boolean jj_3_34() { + if (jj_scan_token(22)) return true; + if (jj_scan_token(NONNEGINT)) return true; + if (jj_scan_token(24)) return true; return false; } @@ -2872,23 +2809,17 @@ private boolean jj_3R_31() { return false; } - private boolean jj_3R_70() { - if (jj_scan_token(40)) return true; - return false; - } - - private boolean jj_3R_132() { - if (jj_3R_135()) return true; + private boolean jj_3R_109() { + if (jj_scan_token(50)) return true; + Token xsp; + xsp = jj_scanpos; + if (jj_3R_117()) jj_scanpos = xsp; + if (jj_3R_108()) return true; return false; } private boolean jj_3R_59() { - Token xsp; - xsp = jj_scanpos; - if (jj_3R_70()) jj_scanpos = xsp; - if (jj_3R_71()) return true; - xsp = jj_scanpos; - if (jj_3R_98()) jj_scanpos = xsp; + if (jj_3R_70()) return true; return false; } @@ -2897,30 +2828,12 @@ private boolean jj_3R_38() { return false; } - private boolean jj_3R_58() { - if (jj_scan_token(NUMCMP)) return true; - Token xsp; - xsp = jj_scanpos; - if (jj_3R_132()) { - jj_scanpos = xsp; - if (jj_3R_133()) return true; - } - return false; - } - - private boolean jj_3R_57() { - if (jj_scan_token(51)) return true; - if (jj_scan_token(IDENTIFIER)) return true; - return false; - } - private boolean jj_3R_23() { if (jj_scan_token(REGEX)) return true; return false; } - private boolean jj_3R_56() { - if (jj_scan_token(34)) return true; + private boolean jj_3R_108() { Token xsp; xsp = jj_scanpos; if (jj_scan_token(18)) { @@ -2929,7 +2842,19 @@ private boolean jj_3R_56() { jj_scanpos = xsp; if (jj_scan_token(7)) { jj_scanpos = xsp; - if (jj_3R_131()) return true; + if (jj_scan_token(13)) { + jj_scanpos = xsp; + if (jj_scan_token(14)) { + jj_scanpos = xsp; + if (jj_scan_token(15)) { + jj_scanpos = xsp; + if (jj_scan_token(16)) { + jj_scanpos = xsp; + if (jj_scan_token(21)) return true; + } + } + } + } } } } @@ -2943,24 +2868,15 @@ private boolean jj_3R_21() { return false; } - private boolean jj_3R_48() { - if (jj_3R_68()) 
return true; - if (jj_scan_token(23)) return true; - if (jj_3R_31()) return true; + private boolean jj_3R_129() { + if (jj_3R_132()) return true; return false; - } - - private boolean jj_3R_37() { - if (jj_scan_token(IDENTIFIER)) return true; - Token xsp; - xsp = jj_scanpos; - if (jj_3R_56()) { - jj_scanpos = xsp; - if (jj_3R_57()) { - jj_scanpos = xsp; - if (jj_3R_58()) return true; - } - } + } + + private boolean jj_3R_48() { + if (jj_3R_68()) return true; + if (jj_scan_token(23)) return true; + if (jj_3R_31()) return true; return false; } @@ -2971,6 +2887,17 @@ private boolean jj_3_7() { return false; } + private boolean jj_3R_58() { + if (jj_scan_token(NUMCMP)) return true; + Token xsp; + xsp = jj_scanpos; + if (jj_3R_129()) { + jj_scanpos = xsp; + if (jj_3R_130()) return true; + } + return false; + } + private boolean jj_3_6() { if (jj_scan_token(22)) return true; if (jj_scan_token(27)) return true; @@ -2978,6 +2905,12 @@ private boolean jj_3_6() { return false; } + private boolean jj_3R_57() { + if (jj_scan_token(49)) return true; + if (jj_scan_token(IDENTIFIER)) return true; + return false; + } + private boolean jj_3R_29() { if (jj_scan_token(38)) return true; if (jj_scan_token(22)) return true; @@ -2990,6 +2923,23 @@ private boolean jj_3R_29() { return false; } + private boolean jj_3R_56() { + if (jj_scan_token(34)) return true; + Token xsp; + xsp = jj_scanpos; + if (jj_scan_token(18)) { + jj_scanpos = xsp; + if (jj_scan_token(17)) { + jj_scanpos = xsp; + if (jj_scan_token(7)) { + jj_scanpos = xsp; + if (jj_3R_128()) return true; + } + } + } + return false; + } + private boolean jj_3_5() { if (jj_scan_token(22)) return true; if (jj_scan_token(25)) return true; @@ -2997,11 +2947,6 @@ private boolean jj_3_5() { return false; } - private boolean jj_3R_100() { - if (jj_3R_108()) return true; - return false; - } - private boolean jj_3R_20() { Token xsp; xsp = jj_scanpos; @@ -3025,40 +2970,47 @@ private boolean jj_3_4() { return false; } - private boolean jj_3R_94() { + private boolean jj_3R_37() { + if (jj_scan_token(IDENTIFIER)) return true; Token xsp; xsp = jj_scanpos; - if (jj_3R_100()) { + if (jj_3R_56()) { jj_scanpos = xsp; - if (jj_scan_token(17)) return true; + if (jj_3R_57()) { + jj_scanpos = xsp; + if (jj_3R_58()) return true; + } } return false; } - private boolean jj_3_33() { - if (jj_3R_37()) return true; + private boolean jj_3_3() { + if (jj_3R_22()) return true; return false; } - private boolean jj_3R_86() { - if (jj_scan_token(49)) return true; - if (jj_3R_96()) return true; - if (jj_scan_token(50)) return true; + private boolean jj_3R_98() { + if (jj_3R_106()) return true; return false; } - private boolean jj_3R_85() { - if (jj_3R_95()) return true; + private boolean jj_3_2() { + if (jj_3R_21()) return true; return false; } - private boolean jj_3_3() { - if (jj_3R_22()) return true; + private boolean jj_3R_93() { + Token xsp; + xsp = jj_scanpos; + if (jj_3R_98()) { + jj_scanpos = xsp; + if (jj_scan_token(17)) return true; + } return false; } - private boolean jj_3_2() { - if (jj_3R_21()) return true; + private boolean jj_3_33() { + if (jj_3R_37()) return true; return false; } @@ -3067,55 +3019,89 @@ private boolean jj_3_1() { return false; } - private boolean jj_3R_93() { - if (jj_scan_token(22)) return true; - if (jj_3R_37()) return true; + private boolean jj_3R_85() { + if (jj_scan_token(47)) return true; + if (jj_3R_95()) return true; + if (jj_scan_token(48)) return true; return false; } - private boolean jj_3R_83() { + private boolean jj_3R_82() { Token xsp; xsp = 
jj_scanpos; if (jj_scan_token(36)) { jj_scanpos = xsp; if (jj_scan_token(37)) return true; } - if (jj_3R_82()) return true; + if (jj_3R_81()) return true; + return false; + } + + private boolean jj_3R_84() { + if (jj_3R_94()) return true; + return false; + } + + private boolean jj_3R_68() { + if (jj_3R_81()) return true; + Token xsp; + while (true) { + xsp = jj_scanpos; + if (jj_3R_82()) { jj_scanpos = xsp; break; } + } + return false; + } + + private boolean jj_3R_92() { + if (jj_scan_token(22)) return true; + if (jj_3R_37()) return true; return false; } private boolean jj_3R_69() { Token xsp; xsp = jj_scanpos; - if (jj_3R_84()) { + if (jj_3R_83()) { jj_scanpos = xsp; - if (jj_3R_85()) { + if (jj_3R_84()) { jj_scanpos = xsp; - if (jj_3R_86()) return true; + if (jj_3R_85()) return true; } } return false; } - private boolean jj_3R_84() { + private boolean jj_3R_83() { Token xsp; xsp = jj_scanpos; - if (jj_3R_93()) { + if (jj_3R_92()) { jj_scanpos = xsp; if (jj_3_33()) { jj_scanpos = xsp; - if (jj_3R_94()) return true; + if (jj_3R_93()) return true; } } return false; } - private boolean jj_3R_68() { - if (jj_3R_82()) return true; + private boolean jj_3R_91() { + if (jj_scan_token(25)) return true; + if (jj_3R_68()) return true; + if (jj_scan_token(26)) return true; + return false; + } + + private boolean jj_3R_90() { + if (jj_3R_97()) return true; + return false; + } + + private boolean jj_3R_81() { Token xsp; - while (true) { - xsp = jj_scanpos; - if (jj_3R_83()) { jj_scanpos = xsp; break; } + xsp = jj_scanpos; + if (jj_3R_90()) { + jj_scanpos = xsp; + if (jj_3R_91()) return true; } return false; } @@ -3125,8 +3111,13 @@ private boolean jj_3R_54() { return false; } + private boolean jj_3_26() { + if (jj_3R_24()) return true; + return false; + } + private boolean jj_3R_53() { - if (jj_scan_token(48)) return true; + if (jj_scan_token(46)) return true; if (jj_3R_69()) return true; return false; } @@ -3141,37 +3132,34 @@ private boolean jj_3R_35() { return false; } - private boolean jj_3R_92() { - if (jj_scan_token(25)) return true; - if (jj_3R_68()) return true; - if (jj_scan_token(26)) return true; + private boolean jj_3R_34() { + if (jj_3R_52()) return true; + if (jj_3R_31()) return true; return false; } - private boolean jj_3R_91() { - if (jj_3R_99()) return true; + private boolean jj_3_32() { + if (jj_scan_token(46)) return true; + if (jj_scan_token(25)) return true; + if (jj_3R_36()) return true; return false; } - private boolean jj_3R_82() { + private boolean jj_3_25() { + if (jj_3R_25()) return true; Token xsp; xsp = jj_scanpos; - if (jj_3R_91()) { - jj_scanpos = xsp; - if (jj_3R_92()) return true; - } - return false; - } - - private boolean jj_3_32() { - if (jj_scan_token(48)) return true; - if (jj_scan_token(25)) return true; - if (jj_3R_36()) return true; + if (jj_3R_34()) jj_scanpos = xsp; return false; } - private boolean jj_3_26() { - if (jj_3R_24()) return true; + private boolean jj_3R_97() { + Token xsp; + xsp = jj_scanpos; + if (jj_3_25()) { + jj_scanpos = xsp; + if (jj_3_26()) return true; + } return false; } @@ -3187,12 +3175,6 @@ private boolean jj_3_30() { return false; } - private boolean jj_3R_34() { - if (jj_3R_52()) return true; - if (jj_3R_31()) return true; - return false; - } - private boolean jj_3R_55() { Token xsp; xsp = jj_scanpos; @@ -3206,36 +3188,36 @@ private boolean jj_3R_55() { return false; } - private boolean jj_3_25() { - if (jj_3R_25()) return true; - Token xsp; - xsp = jj_scanpos; - if (jj_3R_34()) jj_scanpos = xsp; + private boolean jj_3R_47() { + if 
(jj_scan_token(33)) return true; + if (jj_3R_31()) return true; return false; } - private boolean jj_3R_99() { + private boolean jj_3R_28() { + if (jj_scan_token(31)) return true; + if (jj_3R_31()) return true; Token xsp; - xsp = jj_scanpos; - if (jj_3_25()) { - jj_scanpos = xsp; - if (jj_3_26()) return true; + while (true) { + xsp = jj_scanpos; + if (jj_3R_47()) { jj_scanpos = xsp; break; } } + if (jj_scan_token(32)) return true; return false; } - private boolean jj_3R_47() { + private boolean jj_3R_46() { if (jj_scan_token(33)) return true; if (jj_3R_31()) return true; return false; } - private boolean jj_3R_130() { + private boolean jj_3R_127() { Token xsp; xsp = jj_scanpos; - if (jj_scan_token(46)) { + if (jj_scan_token(44)) { jj_scanpos = xsp; - if (jj_scan_token(45)) { + if (jj_scan_token(43)) { jj_scanpos = xsp; if (jj_scan_token(36)) { jj_scanpos = xsp; @@ -3247,15 +3229,15 @@ private boolean jj_3R_130() { return false; } - private boolean jj_3R_28() { - if (jj_scan_token(31)) return true; + private boolean jj_3R_27() { + if (jj_scan_token(25)) return true; if (jj_3R_31()) return true; Token xsp; while (true) { xsp = jj_scanpos; - if (jj_3R_47()) { jj_scanpos = xsp; break; } + if (jj_3R_46()) { jj_scanpos = xsp; break; } } - if (jj_scan_token(32)) return true; + if (jj_scan_token(26)) return true; return false; } @@ -3264,26 +3246,8 @@ private boolean jj_3R_36() { Token xsp; while (true) { xsp = jj_scanpos; - if (jj_3R_130()) { jj_scanpos = xsp; break; } - } - return false; - } - - private boolean jj_3R_46() { - if (jj_scan_token(33)) return true; - if (jj_3R_31()) return true; - return false; - } - - private boolean jj_3R_27() { - if (jj_scan_token(25)) return true; - if (jj_3R_31()) return true; - Token xsp; - while (true) { - xsp = jj_scanpos; - if (jj_3R_46()) { jj_scanpos = xsp; break; } + if (jj_3R_127()) { jj_scanpos = xsp; break; } } - if (jj_scan_token(26)) return true; return false; } @@ -3343,11 +3307,6 @@ private boolean jj_3_20() { return false; } - private boolean jj_3R_114() { - if (jj_scan_token(BACKREF)) return true; - return false; - } - private boolean jj_3_19() { if (jj_scan_token(35)) return true; if (jj_3R_33()) return true; @@ -3382,8 +3341,8 @@ private boolean jj_3_17() { return false; } - private boolean jj_3R_113() { - if (jj_scan_token(REGEXVAR)) return true; + private boolean jj_3R_112() { + if (jj_scan_token(BACKREF)) return true; return false; } @@ -3397,32 +3356,14 @@ private boolean jj_3R_25() { return false; } - private boolean jj_3R_116() { - if (jj_scan_token(31)) return true; - if (jj_3R_36()) return true; - if (jj_scan_token(32)) return true; - return false; - } - - private boolean jj_3_29() { - if (jj_scan_token(31)) return true; - if (jj_scan_token(32)) return true; - return false; - } - private boolean jj_3_16() { if (jj_scan_token(35)) return true; if (jj_3R_32()) return true; return false; } - private boolean jj_3R_109() { - Token xsp; - xsp = jj_scanpos; - if (jj_3_29()) { - jj_scanpos = xsp; - if (jj_3R_116()) return true; - } + private boolean jj_3R_111() { + if (jj_scan_token(REGEXVAR)) return true; return false; } @@ -3443,7 +3384,7 @@ private boolean jj_3_15() { private Token jj_scanpos, jj_lastpos; private int jj_la; private int jj_gen; - final private int[] jj_la1 = new int[71]; + final private int[] jj_la1 = new int[68]; static private int[] jj_la1_0; static private int[] jj_la1_1; static { @@ -3451,10 +3392,10 @@ private boolean jj_3_15() { jj_la1_init_1(); } private static void jj_la1_init_0() { - jj_la1_0 = new int[] 
{0x400180,0x40000000,0x0,0x8247ef80,0x247e000,0x40000000,0x40000000,0x207e000,0x180,0xf80,0x0,0x8247ef80,0x80000000,0x80000000,0x80000000,0x0,0x0,0x180000,0x2000f80,0x0,0x0,0x40000000,0x2000f80,0x0,0x0,0x0,0x5e000,0x8227f180,0x400000,0x8227f180,0x800000,0x0,0x400000,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x80000000,0x82460180,0x0,0x0,0x0,0x0,0x0,0x0,0x460180,0x40000000,0x40000000,0x60080,0x400000,0x60080,0x460180,0x60180,0x16100,0x80000,0x27e080,0x400000,0x400000,0x0,0x0,0x40000000,0x40000000,0x56080,0x16000,0x6000,0x180000,0x40080,0x40080,}; + jj_la1_0 = new int[] {0x400180,0x40000000,0x0,0x8247ef80,0x247e000,0x40000000,0x40000000,0x207e000,0x180,0xf80,0x0,0x8247ef80,0x80000000,0x80000000,0x80000000,0x0,0x0,0x180000,0x2000f80,0x0,0x0,0x40000000,0x2000f80,0x0,0x5e000,0x8227f180,0x400000,0x8227f180,0x800000,0x0,0x400000,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x80000000,0x82460180,0x0,0x0,0x0,0x0,0x0,0x0,0x460180,0x40000000,0x40000000,0x60080,0x400000,0x60080,0x460180,0x60180,0x16100,0x80000,0x27e080,0x400000,0x400000,0x0,0x40000000,0x40000000,0x56080,0x16000,0x6000,0x180000,0x40080,0x40080,}; } private static void jj_la1_init_1() { - jj_la1_1 = new int[] {0x0,0x0,0x2,0x40,0x0,0x2,0x2,0x0,0x0,0x0,0x2,0x40,0x8,0x8,0x8,0x2,0x2,0x0,0x0,0x30,0x30,0x2,0x0,0x80,0x100,0x200,0x0,0x100000,0x1c00,0x100000,0x0,0x1c00,0x0,0x800,0x2020,0x2020,0x6030,0x6030,0x8800,0x8800,0x0,0x30000,0x2020,0x2020,0x4010,0x4010,0x6030,0x6030,0x30000,0x2,0x2,0x0,0x0,0x0,0x20000,0x0,0x0,0x80004,0x0,0x0,0x0,0x800,0x200000,0x2,0x2,0x0,0x0,0x0,0x0,0x0,0x0,}; + jj_la1_1 = new int[] {0x0,0x0,0x2,0x40,0x0,0x2,0x2,0x0,0x0,0x0,0x2,0x40,0x8,0x8,0x8,0x2,0x2,0x0,0x0,0x30,0x30,0x2,0x0,0x80,0x0,0x40000,0x700,0x40000,0x0,0x700,0x0,0x200,0x820,0x820,0x1830,0x1830,0x2200,0x2200,0x0,0xc000,0x820,0x820,0x1010,0x1010,0x1830,0x1830,0xc000,0x2,0x2,0x0,0x0,0x0,0x8000,0x0,0x0,0x20004,0x0,0x0,0x0,0x80000,0x2,0x2,0x0,0x0,0x0,0x0,0x0,0x0,}; } final private JJCalls[] jj_2_rtns = new JJCalls[35]; private boolean jj_rescan = false; @@ -3471,7 +3412,7 @@ public TokenSequenceParser(java.io.InputStream stream, String encoding) { token = new Token(); jj_ntk = -1; jj_gen = 0; - for (int i = 0; i < 71; i++) jj_la1[i] = -1; + for (int i = 0; i < 68; i++) jj_la1[i] = -1; for (int i = 0; i < jj_2_rtns.length; i++) jj_2_rtns[i] = new JJCalls(); } @@ -3486,7 +3427,7 @@ public void ReInit(java.io.InputStream stream, String encoding) { token = new Token(); jj_ntk = -1; jj_gen = 0; - for (int i = 0; i < 71; i++) jj_la1[i] = -1; + for (int i = 0; i < 68; i++) jj_la1[i] = -1; for (int i = 0; i < jj_2_rtns.length; i++) jj_2_rtns[i] = new JJCalls(); } @@ -3497,7 +3438,7 @@ public TokenSequenceParser(java.io.Reader stream) { token = new Token(); jj_ntk = -1; jj_gen = 0; - for (int i = 0; i < 71; i++) jj_la1[i] = -1; + for (int i = 0; i < 68; i++) jj_la1[i] = -1; for (int i = 0; i < jj_2_rtns.length; i++) jj_2_rtns[i] = new JJCalls(); } @@ -3508,7 +3449,7 @@ public void ReInit(java.io.Reader stream) { token = new Token(); jj_ntk = -1; jj_gen = 0; - for (int i = 0; i < 71; i++) jj_la1[i] = -1; + for (int i = 0; i < 68; i++) jj_la1[i] = -1; for (int i = 0; i < jj_2_rtns.length; i++) jj_2_rtns[i] = new JJCalls(); } @@ -3518,7 +3459,7 @@ public TokenSequenceParser(TokenSequenceParserTokenManager tm) { token = new Token(); jj_ntk = -1; jj_gen = 0; - for (int i = 0; i < 71; i++) jj_la1[i] = -1; + for (int i = 0; i < 68; i++) jj_la1[i] = -1; for (int i = 0; i < jj_2_rtns.length; i++) jj_2_rtns[i] = new JJCalls(); } @@ -3528,7 +3469,7 @@ public void ReInit(TokenSequenceParserTokenManager tm) { token = 
new Token(); jj_ntk = -1; jj_gen = 0; - for (int i = 0; i < 71; i++) jj_la1[i] = -1; + for (int i = 0; i < 68; i++) jj_la1[i] = -1; for (int i = 0; i < jj_2_rtns.length; i++) jj_2_rtns[i] = new JJCalls(); } @@ -3640,12 +3581,12 @@ private void jj_add_error_token(int kind, int pos) { /** Generate ParseException. */ public ParseException generateParseException() { jj_expentries.clear(); - boolean[] la1tokens = new boolean[55]; + boolean[] la1tokens = new boolean[53]; if (jj_kind >= 0) { la1tokens[jj_kind] = true; jj_kind = -1; } - for (int i = 0; i < 71; i++) { + for (int i = 0; i < 68; i++) { if (jj_la1[i] == jj_gen) { for (int j = 0; j < 32; j++) { if ((jj_la1_0[i] & (1< 9) - kind = 9; - jjCheckNAdd(36); - } - else if (curChar == 36) - jjCheckNAdd(42); - if (curChar == 36) - jjstateSet[jjnewStateCnt++] = 39; - break; case 50: case 28: if (curChar == 61 && kind > 19) @@ -755,14 +737,6 @@ else if (curChar == 92) if (curChar == 69) jjCheckNAddTwoStates(15, 16); break; - case 34: - case 35: - if ((0x7fffffe87fffffeL & l) == 0L) - break; - if (kind > 8) - kind = 8; - jjCheckNAdd(35); - break; case 50: if (curChar == 126 && kind > 20) kind = 20; @@ -824,6 +798,14 @@ else if (curChar == 92) kind = 21; jjCheckNAdd(32); break; + case 34: + case 35: + if ((0x7fffffe87fffffeL & l) == 0L) + break; + if (kind > 8) + kind = 8; + jjCheckNAdd(35); + break; case 39: case 40: if ((0x7fffffe87fffffeL & l) == 0L) @@ -842,7 +824,7 @@ else if (curChar == 92) } else { - int hiByte = ((int) curChar) >> 8; + int hiByte = (int)(curChar >> 8); int i1 = hiByte >> 6; long l1 = 1L << (hiByte & 077); int i2 = (curChar & 0xff) >> 6; @@ -907,16 +889,15 @@ private static final boolean jjCanMove_0(int hiByte, int i1, int i2, long l1, lo null, null, null, null, null, null, null, null, null, "\173", "\75\76", "\175", "\50", "\51", "\164\157\153\145\156\163\72", "\164\145\170\164\72", "\75", "\73", "\133", "\135", "\54", "\72", "\56", "\46\46", "\174\174", "\72\143\141\163\145", -"\72\145\154\163\145", "\136", "\44", "\52", "\77", "\53", "\174", "\46", "\77\72", "\41", -"\173\173", "\175\175", "\72\72", "\50\77\155\51", "\75\75\76", -"\46\141\156\156\157\164\141\164\145", }; +"\72\145\154\163\145", "\52", "\77", "\53", "\174", "\46", "\77\72", "\41", "\173\173", "\175\175", +"\72\72", "\50\77\155\51", "\75\75\76", "\46\141\156\156\157\164\141\164\145", }; /** Lexer state names. */ public static final String[] lexStateNames = { "DEFAULT", }; static final long[] jjtoToken = { - 0x7fffffffffff81L, + 0x1fffffffffff81L, }; static final long[] jjtoSkip = { 0x7eL, diff --git a/src/edu/stanford/nlp/loglinear/inference/CliqueTree.java b/src/edu/stanford/nlp/loglinear/inference/CliqueTree.java index a76d80ca01..fb39dccfd8 100644 --- a/src/edu/stanford/nlp/loglinear/inference/CliqueTree.java +++ b/src/edu/stanford/nlp/loglinear/inference/CliqueTree.java @@ -7,808 +7,808 @@ /** * Created by keenon on 8/11/15. - *

+ * * This is instantiated once per model, so that it can keep caches of important stuff like messages and * local factors during many game playing sample steps. It assumes that the model that is passed in is by-reference, * and that it can change between inference calls in small ways, so that cacheing of some results is worthwhile. */ public class CliqueTree { - GraphicalModel model; - ConcatVector weights; - - // This is the metadata key for the model to store an observed value for a variable, as an int - public static final String VARIABLE_OBSERVED_VALUE = "inference.CliqueTree.VARIABLE_OBSERVED_VALUE"; - - private static final boolean CACHE_MESSAGES = true; - - /** - * Create an Inference object for a given set of weights, and a model. - *

- * The object is around to facilitate cacheing as an eventual optimization, when models are changing in minor ways - * and inference is required several times. Work is done lazily, so is left until actual inference is requested. - * - * @param model the model to be computed over, subject to change in the future - * @param weights the weights to dot product with model features to get log-linear factors, is cloned internally so - * that no changes to the weights vector will be reflected by the CliqueTree. If you want to change - * the weights, you must create a new CliqueTree. - */ - public CliqueTree(GraphicalModel model, ConcatVector weights) { - this.model = model; - this.weights = weights.deepClone(); - } - - /** - * Little data structure for passing around the results of marginal computations. - */ - public static class MarginalResult { - public double[][] marginals; - public double partitionFunction; - public Map jointMarginals; - - public MarginalResult(double[][] marginals, double partitionFunction, Map jointMarginals) { - this.marginals = marginals; - this.partitionFunction = partitionFunction; - this.jointMarginals = jointMarginals; + GraphicalModel model; + ConcatVector weights; + + // This is the metadata key for the model to store an observed value for a variable, as an int + public static final String VARIABLE_OBSERVED_VALUE = "inference.CliqueTree.VARIABLE_OBSERVED_VALUE"; + + private static final boolean CACHE_MESSAGES = true; + + /** + * Create an Inference object for a given set of weights, and a model. + *

+ * The object is around to facilitate cacheing as an eventual optimization, when models are changing in minor ways + * and inference is required several times. Work is done lazily, so is left until actual inference is requested. + * + * @param model the model to be computed over, subject to change in the future + * @param weights the weights to dot product with model features to get log-linear factors, is cloned internally so + * that no changes to the weights vector will be reflected by the CliqueTree. If you want to change + * the weights, you must create a new CliqueTree. + */ + public CliqueTree(GraphicalModel model, ConcatVector weights) { + this.model = model; + this.weights = weights.deepClone(); } - } - - /** - * This assumes that factors represent joint probabilities. - * - * @return global marginals - */ - public MarginalResult calculateMarginals() { - return messagePassing(MarginalizationMethod.SUM, true); - } - - /** - * This will calculate marginals, but skip the stuff that is created for gradient descent: joint marginals and - * partition functions. This makes it much faster. It is thus appropriate for gameplayer style work, where many - * samples need to be drawn with the same marginals. - * - * @return an array, indexed first by variable, then by variable assignment, of global probability - */ - public double[][] calculateMarginalsJustSingletons() { - MarginalResult result = messagePassing(MarginalizationMethod.SUM, false); - return result.marginals; - } - - /** - * This assumes that factors represent joint probabilities. - * - * @return an array, indexed by variable, of maximum likelihood assignments - */ - public int[] calculateMAP() { - double[][] mapMarginals = messagePassing(MarginalizationMethod.MAX, false).marginals; - int[] result = new int[mapMarginals.length]; - for (int i = 0; i < result.length; i++) { - if (mapMarginals[i] != null) { - for (int j = 0; j < mapMarginals[i].length; j++) { - if (mapMarginals[i][j] > mapMarginals[i][result[i]]) { - result[i] = j; - } - } - } - // If there is no factor touching an observed variable, the resulting MAP won't reference the variable - // observation since message passing won't touch the variable index - if (model.getVariableMetaDataByReference(i).containsKey(VARIABLE_OBSERVED_VALUE)) { - result[i] = Integer.parseInt(model.getVariableMetaDataByReference(i).get(VARIABLE_OBSERVED_VALUE)); - } - } - return result; - } - - //////////////////////////////////////////////////////////////////////////// - // PRIVATE IMPLEMENTATION - //////////////////////////////////////////////////////////////////////////// - - private enum MarginalizationMethod { - SUM, - MAX - } - - // OPTIMIZATION: - // cache the creation of TableFactors, to avoid redundant dot products - - private IdentityHashMap cachedFactors = new IdentityHashMap<>(); - - private static class CachedFactorWithObservations { - TableFactor cachedFactor; - int[] observations; - boolean impossibleObservation; - } - - // OPTIMIZATION: - // cache the last list of factors, and the last set of messages passed, in case we can recycle some - - private TableFactor[] cachedCliqueList; - private TableFactor[][] cachedMessages; - private boolean[][] cachedBackwardPassedMessages; - - /** - * Does tree shaped message passing. The algorithm calls for first passing down to the leaves, then passing back up - * to the root. 
- * - * @param marginalize the method for marginalization, controls MAP or marginals - * @return the marginal messages - */ - private MarginalResult messagePassing(MarginalizationMethod marginalize, boolean includeJointMarginalsAndPartition) { - - // Using the behavior of brute force factor multiplication as ground truth, the desired - // outcome of marginal calculation with an impossible factor is a uniform probability dist., - // since we have a resulting factor of all 0s. That is of course assuming that normalizing - // all 0s gives you uniform, which is not real math, but that's a useful tolerance to include, so we do. - - boolean impossibleObservationMade = false; - - // Message passing will look at fully observed cliques as non-entities, but their - // log-likelihood (the log-likelihood of the single observed value) is still relevant for the - // partition function. - - double partitionFunction = 1.0; - - if (includeJointMarginalsAndPartition) { - outer: - for (GraphicalModel.Factor f : model.factors) { - for (int n : f.neigborIndices) { - if (!model.getVariableMetaDataByReference(n).containsKey(VARIABLE_OBSERVED_VALUE)) continue outer; - } - int[] assignment = new int[f.neigborIndices.length]; - for (int i = 0; i < f.neigborIndices.length; i++) { - assignment[i] = Integer.parseInt(model.getVariableMetaDataByReference(f.neigborIndices[i]).get(VARIABLE_OBSERVED_VALUE)); - } + /** + * Little data structure for passing around the results of marginal computations. + */ + public static class MarginalResult { + public double[][] marginals; + public double partitionFunction; + public Map jointMarginals; - double assignmentValue = f.featuresTable.getAssignmentValue(assignment).get().dotProduct(weights); - if (Double.isInfinite(assignmentValue)) { - impossibleObservationMade = true; - } else { - partitionFunction *= Math.exp(assignmentValue); + public MarginalResult(double[][] marginals, double partitionFunction, Map jointMarginals) { + this.marginals = marginals; + this.partitionFunction = partitionFunction; + this.jointMarginals = jointMarginals; } - } } - // Create the cliques by multiplying out table factors - // TODO:OPT This could be made more efficient by observing first, then dot product - - List cliquesList = new ArrayList<>(); - Map cliqueToFactor = new HashMap<>(); - - int numFactorsCached = 0; - - for (GraphicalModel.Factor f : model.factors) { - boolean allObserved = true; - int maxVar = 0; - for (int n : f.neigborIndices) { - if (!model.getVariableMetaDataByReference(n).containsKey(VARIABLE_OBSERVED_VALUE)) allObserved = false; - if (n > maxVar) maxVar = n; - } - if (allObserved) continue; - - TableFactor clique = null; - - // Retrieve cache if exists and none of the observations have changed - - if (cachedFactors.containsKey(f)) { - CachedFactorWithObservations obs = cachedFactors.get(f); - boolean allConsistent = true; - for (int i = 0; i < f.neigborIndices.length; i++) { - int n = f.neigborIndices[i]; - if (model.getVariableMetaDataByReference(n).containsKey(VARIABLE_OBSERVED_VALUE) && - (obs.observations[i] == -1 || - Integer.parseInt(model.getVariableMetaDataByReference(n).get(VARIABLE_OBSERVED_VALUE)) != obs.observations[i])) { - allConsistent = false; - break; - } - // NOTE: This disqualifies lots of stuff for some reason... 
- if (!model.getVariableMetaDataByReference(n).containsKey(VARIABLE_OBSERVED_VALUE) && (obs.observations[i] != -1)) { - allConsistent = false; - break; - } - } - if (allConsistent) { - clique = obs.cachedFactor; - numFactorsCached++; - if (obs.impossibleObservation) { - impossibleObservationMade = true; - } - } - } - - // Otherwise make a new cache - - if (clique == null) { - int[] observations = new int[f.neigborIndices.length]; - for (int i = 0; i < observations.length; i++) { - Map metadata = model.getVariableMetaDataByReference(f.neigborIndices[i]); - if (metadata.containsKey(VARIABLE_OBSERVED_VALUE)) { - int value = Integer.parseInt(metadata.get(VARIABLE_OBSERVED_VALUE)); - observations[i] = value; - } else { - observations[i] = -1; - } - } - - clique = new TableFactor(weights, f, observations); + /** + * This assumes that factors represent joint probabilities. + * + * @return global marginals + */ + public MarginalResult calculateMarginals() { + return messagePassing(MarginalizationMethod.SUM, true); + } - CachedFactorWithObservations cache = new CachedFactorWithObservations(); - cache.cachedFactor = clique; - cache.observations = observations; + /** + * This will calculate marginals, but skip the stuff that is created for gradient descent: joint marginals and + * partition functions. This makes it much faster. It is thus appropriate for gameplayer style work, where many + * samples need to be drawn with the same marginals. + * + * @return an array, indexed first by variable, then by variable assignment, of global probability + */ + public double[][] calculateMarginalsJustSingletons() { + MarginalResult result = messagePassing(MarginalizationMethod.SUM, false); + return result.marginals; + } - // Check for an impossible observation - boolean nonZeroValue = false; - for (int[] assignment : clique) { - if (clique.getAssignmentValue(assignment) > 0) { - nonZeroValue = true; - break; - } - } - if (!nonZeroValue) { - impossibleObservationMade = true; - cache.impossibleObservation = true; + /** + * This assumes that factors represent joint probabilities. 
+ * + * @return an array, indexed by variable, of maximum likelihood assignments + */ + public int[] calculateMAP() { + double[][] mapMarginals = messagePassing(MarginalizationMethod.MAX, false).marginals; + int[] result = new int[mapMarginals.length]; + for (int i = 0; i < result.length; i++) { + if (mapMarginals[i] != null) { + for (int j = 0; j < mapMarginals[i].length; j++) { + if (mapMarginals[i][j] > mapMarginals[i][result[i]]) { + result[i] = j; + } + } + } + // If there is no factor touching an observed variable, the resulting MAP won't reference the variable + // observation since message passing won't touch the variable index + if (model.getVariableMetaDataByReference(i).containsKey(VARIABLE_OBSERVED_VALUE)) { + result[i] = Integer.parseInt(model.getVariableMetaDataByReference(i).get(VARIABLE_OBSERVED_VALUE)); + } } + return result; + } - cachedFactors.put(f, cache); - } + //////////////////////////////////////////////////////////////////////////// + // PRIVATE IMPLEMENTATION + //////////////////////////////////////////////////////////////////////////// - cliqueToFactor.put(cliquesList.size(), f); - cliquesList.add(clique); + private enum MarginalizationMethod { + SUM, + MAX } - TableFactor[] cliques = cliquesList.toArray(new TableFactor[cliquesList.size()]); + // OPTIMIZATION: + // cache the creation of TableFactors, to avoid redundant dot products - // If we made any impossible observations, we can just return a uniform distribution for all the variables that - // weren't observed, since that's the semantically correct thing to do (our 'probability' is broken at this - // point). + private IdentityHashMap cachedFactors = new IdentityHashMap<>(); + private static class CachedFactorWithObservations { + TableFactor cachedFactor; + int[] observations; + boolean impossibleObservation; + } - if (impossibleObservationMade) { - int maxVar = 0; - for (TableFactor c : cliques) { - for (int i : c.neighborIndices) if (i > maxVar) maxVar = i; - } + // OPTIMIZATION: + // cache the last list of factors, and the last set of messages passed, in case we can recycle some + + private TableFactor[] cachedCliqueList; + private TableFactor[][] cachedMessages; + private boolean[][] cachedBackwardPassedMessages; + + /** + * Does tree shaped message passing. The algorithm calls for first passing down to the leaves, then passing back up + * to the root. + * + * @param marginalize the method for marginalization, controls MAP or marginals + * @return the marginal messages + */ + private MarginalResult messagePassing(MarginalizationMethod marginalize, boolean includeJointMarginalsAndPartition) { + + // Using the behavior of brute force factor multiplication as ground truth, the desired + // outcome of marginal calculation with an impossible factor is a uniform probability dist., + // since we have a resulting factor of all 0s. That is of course assuming that normalizing + // all 0s gives you uniform, which is not real math, but that's a useful tolerance to include, so we do. + + boolean impossibleObservationMade = false; + + // Message passing will look at fully observed cliques as non-entities, but their + // log-likelihood (the log-likelihood of the single observed value) is still relevant for the + // partition function. 
+ + double partitionFunction = 1.0; + + if (includeJointMarginalsAndPartition) { + outer: + for (GraphicalModel.Factor f : model.factors) { + for (int n : f.neigborIndices) { + if (!model.getVariableMetaDataByReference(n).containsKey(VARIABLE_OBSERVED_VALUE)) continue outer; + } - double[][] result = new double[maxVar + 1][]; + int[] assignment = new int[f.neigborIndices.length]; + for (int i = 0; i < f.neigborIndices.length; i++) { + assignment[i] = Integer.parseInt(model.getVariableMetaDataByReference(f.neigborIndices[i]).get(VARIABLE_OBSERVED_VALUE)); + } - for (TableFactor c : cliques) { - for (int i = 0; i < c.neighborIndices.length; i++) { - result[c.neighborIndices[i]] = new double[c.getDimensions()[i]]; - for (int j = 0; j < result[c.neighborIndices[i]].length; j++) { - result[c.neighborIndices[i]][j] = 1.0 / result[c.neighborIndices[i]].length; - } + double assignmentValue = f.featuresTable.getAssignmentValue(assignment).get().dotProduct(weights); + if (Double.isInfinite(assignmentValue)) { + impossibleObservationMade = true; + } else { + partitionFunction *= Math.exp(assignmentValue); + } + } } - } - // Create a bunch of uniform joint marginals, constrained by observations, and fill up the joint marginals - // with them + // Create the cliques by multiplying out table factors + // TODO:OPT This could be made more efficient by observing first, then dot product + + List cliquesList = new ArrayList<>(); + Map cliqueToFactor = new HashMap<>(); + + int numFactorsCached = 0; - Map jointMarginals = new IdentityHashMap<>(); - if (includeJointMarginalsAndPartition) { for (GraphicalModel.Factor f : model.factors) { - TableFactor uniformZero = new TableFactor(f.neigborIndices, f.featuresTable.getDimensions()); + boolean allObserved = true; + int maxVar = 0; + for (int n : f.neigborIndices) { + if (!model.getVariableMetaDataByReference(n).containsKey(VARIABLE_OBSERVED_VALUE)) allObserved = false; + if (n > maxVar) maxVar = n; + } + if (allObserved) continue; + + TableFactor clique = null; + + // Retrieve cache if exists and none of the observations have changed + + if (cachedFactors.containsKey(f)) { + CachedFactorWithObservations obs = cachedFactors.get(f); + boolean allConsistent = true; + for (int i = 0; i < f.neigborIndices.length; i++) { + int n = f.neigborIndices[i]; + if (model.getVariableMetaDataByReference(n).containsKey(VARIABLE_OBSERVED_VALUE) && + (obs.observations[i] == -1 || + Integer.parseInt(model.getVariableMetaDataByReference(n).get(VARIABLE_OBSERVED_VALUE)) != obs.observations[i])) { + allConsistent = false; + break; + } + // NOTE: This disqualifies lots of stuff for some reason... 
+ if (!model.getVariableMetaDataByReference(n).containsKey(VARIABLE_OBSERVED_VALUE) && (obs.observations[i] != -1)) { + allConsistent = false; + break; + } + } + if (allConsistent) { + clique = obs.cachedFactor; + numFactorsCached++; + if (obs.impossibleObservation) { + impossibleObservationMade = true; + } + } + } - for (int[] assignment : uniformZero) { - uniformZero.setAssignmentValue(assignment, 0.0); - } + // Otherwise make a new cache + + if (clique == null) { + int[] observations = new int[f.neigborIndices.length]; + for (int i = 0; i < observations.length; i++) { + Map metadata = model.getVariableMetaDataByReference(f.neigborIndices[i]); + if (metadata.containsKey(VARIABLE_OBSERVED_VALUE)) { + int value = Integer.parseInt(metadata.get(VARIABLE_OBSERVED_VALUE)); + observations[i] = value; + } + else { + observations[i] = -1; + } + } + + clique = new TableFactor(weights, f, observations); + + CachedFactorWithObservations cache = new CachedFactorWithObservations(); + cache.cachedFactor = clique; + cache.observations = observations; + + // Check for an impossible observation + boolean nonZeroValue = false; + for (int[] assignment : clique) { + if (clique.getAssignmentValue(assignment) > 0) { + nonZeroValue = true; + break; + } + } + if (!nonZeroValue) { + impossibleObservationMade = true; + cache.impossibleObservation = true; + } + + cachedFactors.put(f, cache); + } - jointMarginals.put(f, uniformZero); + cliqueToFactor.put(cliquesList.size(), f); + cliquesList.add(clique); } - } - return new MarginalResult(result, 1.0, jointMarginals); - } + TableFactor[] cliques = cliquesList.toArray(new TableFactor[cliquesList.size()]); - // Find the largest contained variable, so that we can size arrays appropriately + // If we made any impossible observations, we can just return a uniform distribution for all the variables that + // weren't observed, since that's the semantically correct thing to do (our 'probability' is broken at this + // point). - int maxVar = 0; - for (GraphicalModel.Factor fac : model.factors) { - for (int i : fac.neigborIndices) if (i > maxVar) maxVar = i; - } + if (impossibleObservationMade) { + int maxVar = 0; + for (TableFactor c : cliques) { + for (int i : c.neighborIndices) if (i > maxVar) maxVar = i; + } + double[][] result = new double[maxVar + 1][]; - // Indexed by (start-clique, end-clique), this array will remain mostly null in most graphs + for (TableFactor c : cliques) { + for (int i = 0; i < c.neighborIndices.length; i++) { + result[c.neighborIndices[i]] = new double[c.getDimensions()[i]]; + for (int j = 0; j < result[c.neighborIndices[i]].length; j++) { + result[c.neighborIndices[i]][j] = 1.0 / result[c.neighborIndices[i]].length; + } + } + } - TableFactor[][] messages = new TableFactor[cliques.length][cliques.length]; + // Create a bunch of uniform joint marginals, constrained by observations, and fill up the joint marginals + // with them - // OPTIMIZATION: - // check if we've only added one factor since the last time we ran marginal inference. If that's the case, we - // can use the new factor as the root, all the messages passed in from the leaves will not have changed. That - // means we can cut message passing computation in half. 
- - boolean[][] backwardPassedMessages = new boolean[cliques.length][cliques.length]; - - int forceRootForCachedMessagePassing = -1; - int[] cachedCliquesBackPointers = null; - if (CACHE_MESSAGES && (numFactorsCached == cliques.length - 1) && (numFactorsCached > 0)) { - cachedCliquesBackPointers = new int[cliques.length]; - - // Calculate the correspondence between the old cliques list and the new cliques list - - for (int i = 0; i < cliques.length; i++) { - cachedCliquesBackPointers[i] = -1; - for (int j = 0; j < cachedCliqueList.length; j++) { - if (cliques[i] == cachedCliqueList[j]) { - cachedCliquesBackPointers[i] = j; - break; - } - } - if (cachedCliquesBackPointers[i] == -1) { - assert (forceRootForCachedMessagePassing == -1); - forceRootForCachedMessagePassing = i; + Map jointMarginals = new IdentityHashMap<>(); + if (includeJointMarginalsAndPartition) { + for (GraphicalModel.Factor f : model.factors) { + TableFactor uniformZero = new TableFactor(f.neigborIndices, f.featuresTable.getDimensions()); + + for (int[] assignment : uniformZero) { + uniformZero.setAssignmentValue(assignment, 0.0); + } + + jointMarginals.put(f, uniformZero); + } + } + + return new MarginalResult(result, 1.0, jointMarginals); } - } - assert (forceRootForCachedMessagePassing != -1); - } - // Create the data structures to hold the tree pattern + // Find the largest contained variable, so that we can size arrays appropriately - boolean[] visited = new boolean[cliques.length]; - int numVisited = 0; - int[] visitedOrder = new int[cliques.length]; + int maxVar = 0; + for (GraphicalModel.Factor fac : model.factors) { + for (int i : fac.neigborIndices) if (i > maxVar) maxVar = i; + } - int[] parent = new int[cliques.length]; - for (int i = 0; i < parent.length; i++) parent[i] = -1; - // Figure out which cliques are connected to which trees. This is important for calculating the partition - // function later, since each tree will converge to its own partition function by multiplication, and we will - // need to multiply the partition function of each of the trees to get the global one. - int[] trees = new int[cliques.length]; - // Forward pass, record a BFS forest pattern that we can use for message passing + // Indexed by (start-clique, end-clique), this array will remain mostly null in most graphs - int treeIndex = -1; - boolean[] seenVariable = new boolean[maxVar + 1]; - while (numVisited < cliques.length) { - treeIndex++; + TableFactor[][] messages = new TableFactor[cliques.length][cliques.length]; - // Pick the largest connected graph remaining as the root for message passing + // OPTIMIZATION: + // check if we've only added one factor since the last time we ran marginal inference. If that's the case, we + // can use the new factor as the root, all the messages passed in from the leaves will not have changed. That + // means we can cut message passing computation in half. 
- int root = -1; + boolean[][] backwardPassedMessages = new boolean[cliques.length][cliques.length]; - // OPTIMIZATION: if there's a forced root for message passing (a node that we just added) then make it the - // root + int forceRootForCachedMessagePassing = -1; + int[] cachedCliquesBackPointers = null; + if (CACHE_MESSAGES && (numFactorsCached == cliques.length-1) && (numFactorsCached > 0)) { + cachedCliquesBackPointers = new int[cliques.length]; - if (CACHE_MESSAGES && forceRootForCachedMessagePassing != -1 && !visited[forceRootForCachedMessagePassing]) { - root = forceRootForCachedMessagePassing; - } else { - for (int i = 0; i < cliques.length; i++) { - if (!visited[i] && - (root == -1 || cliques[i].neighborIndices.length > cliques[root].neighborIndices.length)) { - root = i; - } - } - } - assert (root != -1); - - Queue toVisit = new ArrayDeque<>(); - toVisit.add(root); - boolean[] toVisitArray = new boolean[cliques.length]; - toVisitArray[root] = true; - - while (toVisit.size() > 0) { - int cursor = toVisit.poll(); - // toVisitArray[cursor] = false; - trees[cursor] = treeIndex; - if (visited[cursor]) { - System.err.println("Visited contains: " + cursor); - System.err.println("Visited: " + Arrays.toString(visited)); - System.err.println("To visit: " + toVisit); - } - assert (!visited[cursor]); - visited[cursor] = true; - visitedOrder[numVisited] = cursor; - for (int i : cliques[cursor].neighborIndices) seenVariable[i] = true; - numVisited++; - - childLoop: - for (int i = 0; i < cliques.length; i++) { - if (i == cursor) continue; - if (i == parent[cursor]) continue; - if (domainsOverlap(cliques[cursor], cliques[i])) { - - // Make sure that for every variable that we've already seen somewhere in the graph, if it's - // in the child, it's in the parent. Otherwise we'll break the property of continuous - // transmission of information about variables through messages. 
- - childNeighborLoop: - for (int child : cliques[i].neighborIndices) { - if (seenVariable[child]) { - for (int j : cliques[cursor].neighborIndices) { - if (j == child) { - continue childNeighborLoop; - } - } - // If we get here it means that this clique is not good as a child, since we can't pass - // it all the information it needs from other elements of the tree - continue childLoop; - } - } + // Calculate the correspondence between the old cliques list and the new cliques list - if (parent[i] == -1 && !visited[i]) { - if (!toVisitArray[i]) { - toVisit.add(i); - toVisitArray[i] = true; - for (int j : cliques[i].neighborIndices) seenVariable[j] = true; - } - parent[i] = cursor; + for (int i = 0; i < cliques.length; i++) { + cachedCliquesBackPointers[i] = -1; + for (int j = 0; j < cachedCliqueList.length; j++) { + if (cliques[i] == cachedCliqueList[j]) { + cachedCliquesBackPointers[i] = j; + break; + } + } + if (cachedCliquesBackPointers[i] == -1) { + assert(forceRootForCachedMessagePassing == -1); + forceRootForCachedMessagePassing = i; + } } - } + assert(forceRootForCachedMessagePassing != -1); } - } - // No cycles in the tree - assert (parent[root] == -1); - } - assert (numVisited == cliques.length); + // Create the data structures to hold the tree pattern - // Backward pass, run the visited list in reverse + boolean[] visited = new boolean[cliques.length]; + int numVisited = 0; + int[] visitedOrder = new int[cliques.length]; - for (int i = numVisited - 1; i >= 0; i--) { - int cursor = visitedOrder[i]; - if (parent[cursor] == -1) continue; + int[] parent = new int[cliques.length]; + for (int i = 0; i < parent.length; i++) parent[i] = -1; + // Figure out which cliques are connected to which trees. This is important for calculating the partition + // function later, since each tree will converge to its own partition function by multiplication, and we will + // need to multiply the partition function of each of the trees to get the global one. 
+ int[] trees = new int[cliques.length]; - backwardPassedMessages[cursor][parent[cursor]] = true; + // Forward pass, record a BFS forest pattern that we can use for message passing - // OPTIMIZATION: - // if these conditions are met we can avoid calculating the message, and instead retrieve from the cache, - // since they should be the same + int treeIndex = -1; + boolean[] seenVariable = new boolean[maxVar+1]; + while (numVisited < cliques.length) { + treeIndex++; - if (CACHE_MESSAGES - && forceRootForCachedMessagePassing != -1 - && cachedCliquesBackPointers[cursor] != -1 - && cachedCliquesBackPointers[parent[cursor]] != -1 - && cachedMessages[cachedCliquesBackPointers[cursor]][cachedCliquesBackPointers[parent[cursor]]] != null - && cachedBackwardPassedMessages[cachedCliquesBackPointers[cursor]][cachedCliquesBackPointers[parent[cursor]]]) { - messages[cursor][parent[cursor]] = - cachedMessages[cachedCliquesBackPointers[cursor]][cachedCliquesBackPointers[parent[cursor]]]; - } else { + // Pick the largest connected graph remaining as the root for message passing - // Calculate the message to the clique's parent, given all incoming messages so far + int root = -1; - TableFactor message = cliques[cursor]; - for (int k = 0; k < cliques.length; k++) { - if (k == parent[cursor]) continue; - if (messages[k][cursor] != null) { - message = message.multiply(messages[k][cursor]); - } - } - - messages[cursor][parent[cursor]] = marginalizeMessage(message, cliques[parent[cursor]].neighborIndices, marginalize); + // OPTIMIZATION: if there's a forced root for message passing (a node that we just added) then make it the + // root - // Invalidate any cached outgoing messages - if (CACHE_MESSAGES - && forceRootForCachedMessagePassing != -1 - && cachedCliquesBackPointers[parent[cursor]] != -1) { - for (int k = 0; k < cachedCliqueList.length; k++) { - cachedMessages[cachedCliquesBackPointers[parent[cursor]]][k] = null; - } + if (CACHE_MESSAGES && forceRootForCachedMessagePassing != -1 && !visited[forceRootForCachedMessagePassing]) { + root = forceRootForCachedMessagePassing; + } + else { + for (int i = 0; i < cliques.length; i++) { + if (!visited[i] && + (root == -1 || cliques[i].neighborIndices.length > cliques[root].neighborIndices.length)) { + root = i; + } + } + } + assert (root != -1); + + Queue toVisit = new ArrayDeque<>(); + toVisit.add(root); + boolean[] toVisitArray = new boolean[cliques.length]; + toVisitArray[root] = true; + + while (toVisit.size() > 0) { + int cursor = toVisit.poll(); + // toVisitArray[cursor] = false; + trees[cursor] = treeIndex; + if (visited[cursor]) { + System.err.println("Visited contains: " + cursor); + System.err.println("Visited: " + Arrays.toString(visited)); + System.err.println("To visit: " + toVisit); + } + assert (!visited[cursor]); + visited[cursor] = true; + visitedOrder[numVisited] = cursor; + for (int i : cliques[cursor].neighborIndices) seenVariable[i] = true; + numVisited++; + + childLoop: for (int i = 0; i < cliques.length; i++) { + if (i == cursor) continue; + if (i == parent[cursor]) continue; + if (domainsOverlap(cliques[cursor], cliques[i])) { + + // Make sure that for every variable that we've already seen somewhere in the graph, if it's + // in the child, it's in the parent. Otherwise we'll break the property of continuous + // transmission of information about variables through messages. 
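A minimal sketch of the admissibility test described in the comment above (hypothetical class and method names, not part of the package): a candidate child clique may only be attached if every variable it shares with the already-visited portion of the graph also appears in the prospective parent, which is what keeps information about a variable flowing continuously along the messages. The childNeighborLoop below performs the same check inline.

// Standalone version of the attachment check.
final class RunningIntersectionSketch {
  static boolean canAttachAsChild(int[] childVars, int[] parentVars, boolean[] seenVariable) {
    outer:
    for (int v : childVars) {
      if (!seenVariable[v]) continue;       // not yet seen anywhere: no constraint on this variable
      for (int p : parentVars) {
        if (p == v) continue outer;         // also in the parent: information can still flow
      }
      return false;                         // seen elsewhere but missing from the parent: reject
    }
    return true;
  }
}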
+ + childNeighborLoop: for (int child : cliques[i].neighborIndices) { + if (seenVariable[child]) { + for (int j : cliques[cursor].neighborIndices) { + if (j == child) { + continue childNeighborLoop; + } + } + // If we get here it means that this clique is not good as a child, since we can't pass + // it all the information it needs from other elements of the tree + continue childLoop; + } + } + + if (parent[i] == -1 && !visited[i]) { + if (!toVisitArray[i]) { + toVisit.add(i); + toVisitArray[i] = true; + for (int j : cliques[i].neighborIndices) seenVariable[j] = true; + } + parent[i] = cursor; + } + } + } + } + // No cycles in the tree + assert(parent[root] == -1); } - } - } - // Forward pass, run the visited list forward + assert (numVisited == cliques.length); - for (int i = 0; i < numVisited; i++) { - int cursor = visitedOrder[i]; - for (int j = 0; j < cliques.length; j++) { - if (parent[j] != cursor) continue; + // Backward pass, run the visited list in reverse - TableFactor message = cliques[cursor]; - for (int k = 0; k < cliques.length; k++) { - if (k == j) continue; - if (messages[k][cursor] != null) { - message = message.multiply(messages[k][cursor]); - } - } + for (int i = numVisited - 1; i >= 0; i--) { + int cursor = visitedOrder[i]; + if (parent[cursor] == -1) continue; - messages[cursor][j] = marginalizeMessage(message, cliques[j].neighborIndices, marginalize); - } - } + backwardPassedMessages[cursor][parent[cursor]] = true; - // OPTIMIZATION: - // cache the messages, and the current list of cliques + // OPTIMIZATION: + // if these conditions are met we can avoid calculating the message, and instead retrieve from the cache, + // since they should be the same - if (CACHE_MESSAGES) { - cachedCliqueList = cliques; - cachedMessages = messages; - cachedBackwardPassedMessages = backwardPassedMessages; - } + if (CACHE_MESSAGES + && forceRootForCachedMessagePassing != -1 + && cachedCliquesBackPointers[cursor] != -1 + && cachedCliquesBackPointers[parent[cursor]] != -1 + && cachedMessages[cachedCliquesBackPointers[cursor]][cachedCliquesBackPointers[parent[cursor]]] != null + && cachedBackwardPassedMessages[cachedCliquesBackPointers[cursor]][cachedCliquesBackPointers[parent[cursor]]]) { + messages[cursor][parent[cursor]] = + cachedMessages[cachedCliquesBackPointers[cursor]][cachedCliquesBackPointers[parent[cursor]]]; + } + else { - // Calculate final marginals for each variable + // Calculate the message to the clique's parent, given all incoming messages so far - double[][] marginals = new double[maxVar + 1][]; + TableFactor message = cliques[cursor]; + for (int k = 0; k < cliques.length; k++) { + if (k == parent[cursor]) continue; + if (messages[k][cursor] != null) { + message = message.multiply(messages[k][cursor]); + } + } - // Include observed variables as deterministic + messages[cursor][parent[cursor]] = marginalizeMessage(message, cliques[parent[cursor]].neighborIndices, marginalize); - for (GraphicalModel.Factor fac : model.factors) { - for (int i = 0; i < fac.neigborIndices.length; i++) { - int n = fac.neigborIndices[i]; - if (model.getVariableMetaDataByReference(n).containsKey(VARIABLE_OBSERVED_VALUE)) { - double[] deterministic = new double[fac.featuresTable.getDimensions()[i]]; - int assignment = Integer.parseInt(model.getVariableMetaDataByReference(n).get(VARIABLE_OBSERVED_VALUE)); - if (assignment > deterministic.length) { - throw new IllegalStateException("Variable " + n + ": Can't have as assignment (" + assignment + ") that is out of bounds for dimension size (" + 
deterministic.length + ")"); - } - deterministic[assignment] = 1.0; - marginals[n] = deterministic; + // Invalidate any cached outgoing messages + if (CACHE_MESSAGES + && forceRootForCachedMessagePassing != -1 + && cachedCliquesBackPointers[parent[cursor]] != -1) { + for (int k = 0; k < cachedCliqueList.length; k++) { + cachedMessages[cachedCliquesBackPointers[parent[cursor]]][k] = null; + } + } + } } - } - } - Map jointMarginals = new IdentityHashMap<>(); + // Forward pass, run the visited list forward - if (marginalize == MarginalizationMethod.SUM && includeJointMarginalsAndPartition) { - boolean[] partitionIncludesTrees = new boolean[treeIndex + 1]; - double[] treePartitionFunctions = new double[treeIndex + 1]; + for (int i = 0; i < numVisited; i++) { + int cursor = visitedOrder[i]; + for (int j = 0; j < cliques.length; j++) { + if (parent[j] != cursor) continue; - for (int i = 0; i < cliques.length; i++) { - TableFactor convergedClique = cliques[i]; + TableFactor message = cliques[cursor]; + for (int k = 0; k < cliques.length; k++) { + if (k == j) continue; + if (messages[k][cursor] != null) { + message = message.multiply(messages[k][cursor]); + } + } - for (int j = 0; j < cliques.length; j++) { - if (i == j) continue; - if (messages[j][i] == null) continue; - convergedClique = convergedClique.multiply(messages[j][i]); + messages[cursor][j] = marginalizeMessage(message, cliques[j].neighborIndices, marginalize); + } } - // Calculate the partition function when we're calculating marginals - // We need one contribution per tree in our forest graph - - if (!partitionIncludesTrees[trees[i]]) { - partitionIncludesTrees[trees[i]] = true; - treePartitionFunctions[trees[i]] = convergedClique.valueSum(); - partitionFunction *= treePartitionFunctions[trees[i]]; - } else { - - // This is all just an elaborate assert - // Check that our partition function is the same as the trees we're attached to, or with %.1, for numerical reasons. 
- // Sometimes the partition function will explode in value, which can make a non-%-based assert worthless here - - if (assertsEnabled()) { - double valueSum = convergedClique.valueSum(); - if (Double.isFinite(valueSum) && Double.isFinite(treePartitionFunctions[trees[i]])) { - if (Math.abs(treePartitionFunctions[trees[i]] - valueSum) >= 1.0e-3 * treePartitionFunctions[trees[i]]) { - System.err.println("Different partition functions for tree " + trees[i] + ": "); - System.err.println("Pre-existing for tree: " + treePartitionFunctions[trees[i]]); - System.err.println("This clique for tree: " + valueSum); - } - assert (Math.abs(treePartitionFunctions[trees[i]] - valueSum) < 1.0e-3 * treePartitionFunctions[trees[i]]); - } - } + // OPTIMIZATION: + // cache the messages, and the current list of cliques + + if (CACHE_MESSAGES) { + cachedCliqueList = cliques; + cachedMessages = messages; + cachedBackwardPassedMessages = backwardPassedMessages; } - // Calculate the factor this clique corresponds to, and put in an entry for joint marginals + // Calculate final marginals for each variable - GraphicalModel.Factor f = cliqueToFactor.get(i); - assert (f != null); - if (!jointMarginals.containsKey(f)) { - int[] observedAssignments = getObservedAssignments(f); + double[][] marginals = new double[maxVar + 1][]; - // Collect back pointers and check if this factor matches the clique we're using + // Include observed variables as deterministic - int[] backPointers = new int[observedAssignments.length]; - int cursor = 0; - for (int j = 0; j < observedAssignments.length; j++) { - if (observedAssignments[j] == -1) { - backPointers[j] = cursor; - cursor++; - } - // This is not strictly necessary but will trigger array OOB exception if things go wrong, so is nice - else backPointers[j] = -1; - } - - double sum = convergedClique.valueSum(); - - TableFactor jointMarginal = new TableFactor(f.neigborIndices, f.featuresTable.getDimensions()); - - // OPTIMIZATION: - // Rather than use the standard iterator, which creates lots of int[] arrays on the heap, which need to be GC'd, - // we use the fast version that just mutates one array. Since this is read once for us here, this is ideal. 
- Iterator fastPassByReferenceIterator = convergedClique.fastPassByReferenceIterator(); - int[] assignment = fastPassByReferenceIterator.next(); - while (true) { - if (backPointers.length == assignment.length) { - jointMarginal.setAssignmentValue(assignment, convergedClique.getAssignmentValue(assignment) / sum); - } else { - int[] jointAssignment = new int[backPointers.length]; - for (int j = 0; j < jointAssignment.length; j++) { - if (observedAssignments[j] != -1) jointAssignment[j] = observedAssignments[j]; - else jointAssignment[j] = assignment[backPointers[j]]; - } - jointMarginal.setAssignmentValue(jointAssignment, convergedClique.getAssignmentValue(assignment) / sum); + for (GraphicalModel.Factor fac : model.factors) { + for (int i = 0; i < fac.neigborIndices.length; i++) { + int n = fac.neigborIndices[i]; + if (model.getVariableMetaDataByReference(n).containsKey(VARIABLE_OBSERVED_VALUE)) { + double[] deterministic = new double[fac.featuresTable.getDimensions()[i]]; + int assignment = Integer.parseInt(model.getVariableMetaDataByReference(n).get(VARIABLE_OBSERVED_VALUE)); + if (assignment > deterministic.length) { + throw new IllegalStateException("Variable "+n+": Can't have as assignment ("+assignment+") that is out of bounds for dimension size ("+deterministic.length+")"); + } + deterministic[assignment] = 1.0; + marginals[n] = deterministic; + } } + } - // Set the assignment arrays correctly - if (fastPassByReferenceIterator.hasNext()) fastPassByReferenceIterator.next(); - else break; - } + Map jointMarginals = new IdentityHashMap<>(); - jointMarginals.put(f, jointMarginal); - } + if (marginalize == MarginalizationMethod.SUM && includeJointMarginalsAndPartition) { + boolean[] partitionIncludesTrees = new boolean[treeIndex + 1]; + double[] treePartitionFunctions = new double[treeIndex + 1]; - boolean anyNull = false; - for (int j = 0; j < convergedClique.neighborIndices.length; j++) { - int k = convergedClique.neighborIndices[j]; - if (marginals[k] == null) { - anyNull = true; - } - } + for (int i = 0; i < cliques.length; i++) { + TableFactor convergedClique = cliques[i]; + + for (int j = 0; j < cliques.length; j++) { + if (i == j) continue; + if (messages[j][i] == null) continue; + convergedClique = convergedClique.multiply(messages[j][i]); + } + + // Calculate the partition function when we're calculating marginals + // We need one contribution per tree in our forest graph + + if (!partitionIncludesTrees[trees[i]]) { + partitionIncludesTrees[trees[i]] = true; + treePartitionFunctions[trees[i]] = convergedClique.valueSum(); + partitionFunction *= treePartitionFunctions[trees[i]]; + } else { + + // This is all just an elaborate assert + // Check that our partition function is the same as the trees we're attached to, or with %.1, for numerical reasons. 
+ // Sometimes the partition function will explode in value, which can make a non-%-based assert worthless here + + if (assertsEnabled()) { + double valueSum = convergedClique.valueSum(); + if (Double.isFinite(valueSum) && Double.isFinite(treePartitionFunctions[trees[i]])) { + if (Math.abs(treePartitionFunctions[trees[i]] - valueSum) >= 1.0e-3 * treePartitionFunctions[trees[i]]) { + System.err.println("Different partition functions for tree " + trees[i] + ": "); + System.err.println("Pre-existing for tree: " + treePartitionFunctions[trees[i]]); + System.err.println("This clique for tree: " + valueSum); + } + assert (Math.abs(treePartitionFunctions[trees[i]] - valueSum) < 1.0e-3 * treePartitionFunctions[trees[i]]); + } + } + } + + // Calculate the factor this clique corresponds to, and put in an entry for joint marginals + + GraphicalModel.Factor f = cliqueToFactor.get(i); + assert (f != null); + if (!jointMarginals.containsKey(f)) { + int[] observedAssignments = getObservedAssignments(f); + + // Collect back pointers and check if this factor matches the clique we're using + + int[] backPointers = new int[observedAssignments.length]; + int cursor = 0; + for (int j = 0; j < observedAssignments.length; j++) { + if (observedAssignments[j] == -1) { + backPointers[j] = cursor; + cursor++; + } + // This is not strictly necessary but will trigger array OOB exception if things go wrong, so is nice + else backPointers[j] = -1; + } + + double sum = convergedClique.valueSum(); + + TableFactor jointMarginal = new TableFactor(f.neigborIndices, f.featuresTable.getDimensions()); + + // OPTIMIZATION: + // Rather than use the standard iterator, which creates lots of int[] arrays on the heap, which need to be GC'd, + // we use the fast version that just mutates one array. Since this is read once for us here, this is ideal. 
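The fast pass-by-reference iteration referred to just above is essentially a mixed-radix odometer: one int[] assignment is advanced in place rather than allocating a fresh array per step. A rough standalone sketch of the idea (hypothetical class name, not the NDArrayDoubles implementation):

// Mutates a single assignment array through every combination of values, rightmost position varying fastest.
final class AssignmentOdometerSketch {
  // Returns false once every assignment has been visited.
  static boolean advance(int[] assignment, int[] dimensions) {
    for (int i = dimensions.length - 1; i >= 0; i--) {
      if (assignment[i] + 1 < dimensions[i]) {
        assignment[i]++;                  // advanced this position, no carry needed
        return true;
      }
      assignment[i] = 0;                  // carry into the position to the left
    }
    return false;
  }

  public static void main(String[] args) {
    int[] dims = {2, 3};
    int[] assignment = new int[dims.length];              // starts at all zeros
    do {
      System.out.println(java.util.Arrays.toString(assignment));
    } while (advance(assignment, dims));                  // prints all 6 assignments, reusing one array
  }
}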
+ Iterator fastPassByReferenceIterator = convergedClique.fastPassByReferenceIterator(); + int[] assignment = fastPassByReferenceIterator.next(); + while (true) { + if (backPointers.length == assignment.length) { + jointMarginal.setAssignmentValue(assignment, convergedClique.getAssignmentValue(assignment) / sum); + } else { + int[] jointAssignment = new int[backPointers.length]; + for (int j = 0; j < jointAssignment.length; j++) { + if (observedAssignments[j] != -1) jointAssignment[j] = observedAssignments[j]; + else jointAssignment[j] = assignment[backPointers[j]]; + } + jointMarginal.setAssignmentValue(jointAssignment, convergedClique.getAssignmentValue(assignment) / sum); + } + + // Set the assignment arrays correctly + if (fastPassByReferenceIterator.hasNext()) fastPassByReferenceIterator.next(); + else break; + } + + jointMarginals.put(f, jointMarginal); + } + + boolean anyNull = false; + for (int j = 0; j < convergedClique.neighborIndices.length; j++) { + int k = convergedClique.neighborIndices[j]; + if (marginals[k] == null) { + anyNull = true; + } + } - if (anyNull) { - double[][] cliqueMarginals = null; - switch (marginalize) { - case SUM: - cliqueMarginals = convergedClique.getSummedMarginals(); - break; - case MAX: - cliqueMarginals = convergedClique.getMaxedMarginals(); - break; - } - for (int j = 0; j < convergedClique.neighborIndices.length; j++) { - int k = convergedClique.neighborIndices[j]; - if (marginals[k] == null) { - marginals[k] = cliqueMarginals[j]; + if (anyNull) { + double[][] cliqueMarginals = null; + switch (marginalize) { + case SUM: + cliqueMarginals = convergedClique.getSummedMarginals(); + break; + case MAX: + cliqueMarginals = convergedClique.getMaxedMarginals(); + break; + } + for (int j = 0; j < convergedClique.neighborIndices.length; j++) { + int k = convergedClique.neighborIndices[j]; + if (marginals[k] == null) { + marginals[k] = cliqueMarginals[j]; + } + } + } } - } - } - } - } - // If we don't care about joint marginals, we can be careful about not calculating more cliques than we need to, - // by explicitly sorting by which cliques are most profitable to calculate over. In this way we can avoid, in - // the case of a chain CRF, calculating almost half the joint factors. - else { - // First do a pass where we only calculate all-null neighbors - for (int i = 0; i < cliques.length; i++) { - boolean allNull = true; - for (int k : cliques[i].neighborIndices) { - if (marginals[k] != null) allNull = false; } - if (allNull) { - TableFactor convergedClique = cliques[i]; - - for (int j = 0; j < cliques.length; j++) { - if (i == j) continue; - if (messages[j][i] == null) continue; - convergedClique = convergedClique.multiply(messages[j][i]); - } - - double[][] cliqueMarginals = null; - switch (marginalize) { - case SUM: - cliqueMarginals = convergedClique.getSummedMarginals(); - break; - case MAX: - cliqueMarginals = convergedClique.getMaxedMarginals(); - break; - } - for (int j = 0; j < convergedClique.neighborIndices.length; j++) { - int k = convergedClique.neighborIndices[j]; - if (marginals[k] == null) { - marginals[k] = cliqueMarginals[j]; + // If we don't care about joint marginals, we can be careful about not calculating more cliques than we need to, + // by explicitly sorting by which cliques are most profitable to calculate over. In this way we can avoid, in + // the case of a chain CRF, calculating almost half the joint factors. 
+ else { + // First do a pass where we only calculate all-null neighbors + for (int i = 0; i < cliques.length; i++) { + boolean allNull = true; + for (int k : cliques[i].neighborIndices) { + if (marginals[k] != null) allNull = false; + } + if (allNull) { + TableFactor convergedClique = cliques[i]; + + for (int j = 0; j < cliques.length; j++) { + if (i == j) continue; + if (messages[j][i] == null) continue; + convergedClique = convergedClique.multiply(messages[j][i]); + } + + double[][] cliqueMarginals = null; + switch (marginalize) { + case SUM: + cliqueMarginals = convergedClique.getSummedMarginals(); + break; + case MAX: + cliqueMarginals = convergedClique.getMaxedMarginals(); + break; + } + for (int j = 0; j < convergedClique.neighborIndices.length; j++) { + int k = convergedClique.neighborIndices[j]; + if (marginals[k] == null) { + marginals[k] = cliqueMarginals[j]; + } + } + } + } + // Now we calculate any remaining cliques with any non-null variables + for (int i = 0; i < cliques.length; i++) { + boolean anyNull = false; + for (int j = 0; j < cliques[i].neighborIndices.length; j++) { + int k = cliques[i].neighborIndices[j]; + if (marginals[k] == null) { + anyNull = true; + } + } + + if (anyNull) { + TableFactor convergedClique = cliques[i]; + + for (int j = 0; j < cliques.length; j++) { + if (i == j) continue; + if (messages[j][i] == null) continue; + convergedClique = convergedClique.multiply(messages[j][i]); + } + + double[][] cliqueMarginals = null; + switch (marginalize) { + case SUM: + cliqueMarginals = convergedClique.getSummedMarginals(); + break; + case MAX: + cliqueMarginals = convergedClique.getMaxedMarginals(); + break; + } + for (int j = 0; j < convergedClique.neighborIndices.length; j++) { + int k = convergedClique.neighborIndices[j]; + if (marginals[k] == null) { + marginals[k] = cliqueMarginals[j]; + } + } + } } - } - } - } - // Now we calculate any remaining cliques with any non-null variables - for (int i = 0; i < cliques.length; i++) { - boolean anyNull = false; - for (int j = 0; j < cliques[i].neighborIndices.length; j++) { - int k = cliques[i].neighborIndices[j]; - if (marginals[k] == null) { - anyNull = true; - } } - if (anyNull) { - TableFactor convergedClique = cliques[i]; - - for (int j = 0; j < cliques.length; j++) { - if (i == j) continue; - if (messages[j][i] == null) continue; - convergedClique = convergedClique.multiply(messages[j][i]); - } - - double[][] cliqueMarginals = null; - switch (marginalize) { - case SUM: - cliqueMarginals = convergedClique.getSummedMarginals(); - break; - case MAX: - cliqueMarginals = convergedClique.getMaxedMarginals(); - break; - } - for (int j = 0; j < convergedClique.neighborIndices.length; j++) { - int k = convergedClique.neighborIndices[j]; - if (marginals[k] == null) { - marginals[k] = cliqueMarginals[j]; + // Add any factors to the joint marginal map that were fully observed and so didn't get cliques + if (marginalize == MarginalizationMethod.SUM && includeJointMarginalsAndPartition) { + for (GraphicalModel.Factor f : model.factors) { + if (!jointMarginals.containsKey(f)) { + // This implies that every variable in the factor is observed. If that's the case, we need to construct + // a one hot TableFactor representing the deterministic distribution. 
+ TableFactor deterministicJointMarginal = new TableFactor(f.neigborIndices, f.featuresTable.getDimensions()); + int[] observedAssignment = getObservedAssignments(f); + for (int i : observedAssignment) assert (i != -1); + deterministicJointMarginal.setAssignmentValue(observedAssignment, 1.0); + + jointMarginals.put(f, deterministicJointMarginal); + } } - } } - } + + return new MarginalResult(marginals, partitionFunction, jointMarginals); } - // Add any factors to the joint marginal map that were fully observed and so didn't get cliques - if (marginalize == MarginalizationMethod.SUM && includeJointMarginalsAndPartition) { - for (GraphicalModel.Factor f : model.factors) { - if (!jointMarginals.containsKey(f)) { - // This implies that every variable in the factor is observed. If that's the case, we need to construct - // a one hot TableFactor representing the deterministic distribution. - TableFactor deterministicJointMarginal = new TableFactor(f.neigborIndices, f.featuresTable.getDimensions()); - int[] observedAssignment = getObservedAssignments(f); - for (int i : observedAssignment) assert (i != -1); - deterministicJointMarginal.setAssignmentValue(observedAssignment, 1.0); - - jointMarginals.put(f, deterministicJointMarginal); + private int[] getObservedAssignments(GraphicalModel.Factor f) { + int[] observedAssignments = new int[f.neigborIndices.length]; + for (int i = 0; i < observedAssignments.length; i++) { + if (model.getVariableMetaDataByReference(f.neigborIndices[i]).containsKey(VARIABLE_OBSERVED_VALUE)) { + observedAssignments[i] = Integer.parseInt(model.getVariableMetaDataByReference(f.neigborIndices[i]).get(VARIABLE_OBSERVED_VALUE)); + } else observedAssignments[i] = -1; } - } + return observedAssignments; } - return new MarginalResult(marginals, partitionFunction, jointMarginals); - } + /** + * This is a key step in message passing. When we are calculating a message, we want to marginalize out all variables + * not relevant to the recipient of the message. This function does that. + * + * @param message the message to marginalize + * @param relevant the variables that are relevant + * @param marginalize whether to use sum of max marginalization, for marginal or MAP inference + * @return the marginalized message + */ + private TableFactor marginalizeMessage(TableFactor message, int[] relevant, MarginalizationMethod marginalize) { + TableFactor result = message; + + for (int i : message.neighborIndices) { + boolean contains = false; + for (int j : relevant) { + if (i == j) { + contains = true; + break; + } + } + if (!contains) { + switch (marginalize) { + case SUM: + result = result.sumOut(i); + break; + case MAX: + result = result.maxOut(i); + break; + } + } + } - private int[] getObservedAssignments(GraphicalModel.Factor f) { - int[] observedAssignments = new int[f.neigborIndices.length]; - for (int i = 0; i < observedAssignments.length; i++) { - if (model.getVariableMetaDataByReference(f.neigborIndices[i]).containsKey(VARIABLE_OBSERVED_VALUE)) { - observedAssignments[i] = Integer.parseInt(model.getVariableMetaDataByReference(f.neigborIndices[i]).get(VARIABLE_OBSERVED_VALUE)); - } else observedAssignments[i] = -1; + return result; } - return observedAssignments; - } - - /** - * This is a key step in message passing. When we are calculating a message, we want to marginalize out all variables - * not relevant to the recipient of the message. This function does that. 
- * - * @param message the message to marginalize - * @param relevant the variables that are relevant - * @param marginalize whether to use sum of max marginalization, for marginal or MAP inference - * @return the marginalized message - */ - private TableFactor marginalizeMessage(TableFactor message, int[] relevant, MarginalizationMethod marginalize) { - TableFactor result = message; - - for (int i : message.neighborIndices) { - boolean contains = false; - for (int j : relevant) { - if (i == j) { - contains = true; - break; - } - } - if (!contains) { - switch (marginalize) { - case SUM: - result = result.sumOut(i); - break; - case MAX: - result = result.maxOut(i); - break; + + /** + * Just a quick inline to check if two factors have overlapping domains. Since factor neighbor sets are super small, + * this n^2 algorithm is fine. + * + * @param f1 first factor to compare + * @param f2 second factor to compare + * @return whether their domains overlap + */ + private boolean domainsOverlap(TableFactor f1, TableFactor f2) { + for (int n1 : f1.neighborIndices) { + for (int n2 : f2.neighborIndices) { + if (n1 == n2) return true; + } } - } + return false; } - return result; - } - - /** - * Just a quick inline to check if two factors have overlapping domains. Since factor neighbor sets are super small, - * this n^2 algorithm is fine. - * - * @param f1 first factor to compare - * @param f2 second factor to compare - * @return whether their domains overlap - */ - private boolean domainsOverlap(TableFactor f1, TableFactor f2) { - for (int n1 : f1.neighborIndices) { - for (int n2 : f2.neighborIndices) { - if (n1 == n2) return true; - } + @SuppressWarnings("*") + private boolean assertsEnabled() { + boolean assertsEnabled = false; + assert(assertsEnabled = true); // intentional side effect + return assertsEnabled; } - return false; - } - - @SuppressWarnings("*") - private boolean assertsEnabled() { - boolean assertsEnabled = false; - assert (assertsEnabled = true); // intentional side effect - return assertsEnabled; - } } diff --git a/src/edu/stanford/nlp/loglinear/inference/TableFactor.java b/src/edu/stanford/nlp/loglinear/inference/TableFactor.java index a863c0a8e1..5297ee21cf 100644 --- a/src/edu/stanford/nlp/loglinear/inference/TableFactor.java +++ b/src/edu/stanford/nlp/loglinear/inference/TableFactor.java @@ -11,669 +11,679 @@ /** * Created by keenon on 8/11/15. - *
+ * * Holds a factor populated by doubles that knows how to do all the important operations for PGM inference. Internally, * these are just different flavors of two basic data-flow operations: - *
+ * * - Factor product * - Factor marginalization - *
+ * * The output here is different ways to grow and shrink factors that turn out to be useful for downstream uses in PGMs. * Basically, we care about message passing, as that will be the primary operation. - *
+ * * Everything is represented as log-linear, because the primary use for TableFactor is in CliqueTree, and that is * intended for use with log-linear models. */ public class TableFactor extends NDArrayDoubles { - public int[] neighborIndices; - - /** - * Construct a TableFactor for inference within a model. This just copies the important bits from the model factor, - * and replaces the ConcatVectorTable with an internal datastructure that has done all the dotproducts with the - * weights out, and so stores only doubles. - *
- * Each element of the table is given by: t_i = exp(f_i*w) - * - * @param weights the vector to dot product with every element of the factor table - * @param factor the feature factor to be multiplied in - */ - public TableFactor(ConcatVector weights, GraphicalModel.Factor factor) { - super(factor.featuresTable.getDimensions()); - this.neighborIndices = factor.neigborIndices; - - // Calculate the factor residents by dot product with the weights - - // OPTIMIZATION: - // Rather than use the standard iterator, which creates lots of int[] arrays on the heap, which need to be GC'd, - // we use the fast version that just mutates one array. Since this is read once for us here, this is ideal. - Iterator fastPassByReferenceIterator = factor.featuresTable.fastPassByReferenceIterator(); - int[] assignment = fastPassByReferenceIterator.next(); - while (true) { - setAssignmentLogValue(assignment, factor.featuresTable.getAssignmentValue(assignment).get().dotProduct(weights)); - // This mutates the assignment[] array, rather than creating a new one - if (fastPassByReferenceIterator.hasNext()) fastPassByReferenceIterator.next(); - else break; - } - } - - /** - * Construct a TableFactor for inference within a model. This is the same as the other constructor, except that the - * table is observed out before any unnecessary dot products are done out, so hopefully we dramatically reduce the - * number of computations required to calculate the resulting table. - *
- * Each element of the table is given by: t_i = exp(f_i*w) - * - * @param weights the vector to dot product with every element of the factor table - * @param factor the feature factor to be multiplied in - */ - public TableFactor(ConcatVector weights, GraphicalModel.Factor factor, int[] observations) { - super(); - assert (observations.length == factor.neigborIndices.length); - - int size = 0; - for (int observation : observations) if (observation == -1) size++; - - neighborIndices = new int[size]; - dimensions = new int[size]; - int[] forwardPointers = new int[size]; - int[] factorAssignment = new int[factor.neigborIndices.length]; - - int cursor = 0; - for (int i = 0; i < factor.neigborIndices.length; i++) { - if (observations[i] == -1) { - neighborIndices[cursor] = factor.neigborIndices[i]; - dimensions[cursor] = factor.featuresTable.getDimensions()[i]; - forwardPointers[cursor] = i; - cursor++; - } else factorAssignment[i] = observations[i]; + public int[] neighborIndices; + + /** + * Construct a TableFactor for inference within a model. This just copies the important bits from the model factor, + * and replaces the ConcatVectorTable with an internal datastructure that has done all the dotproducts with the + * weights out, and so stores only doubles. + * + * Each element of the table is given by: t_i = exp(f_i*w) + * + * @param weights the vector to dot product with every element of the factor table + * @param factor the feature factor to be multiplied in + */ + public TableFactor(ConcatVector weights, GraphicalModel.Factor factor) { + super(factor.featuresTable.getDimensions()); + this.neighborIndices = factor.neigborIndices; + + // Calculate the factor residents by dot product with the weights + + // OPTIMIZATION: + // Rather than use the standard iterator, which creates lots of int[] arrays on the heap, which need to be GC'd, + // we use the fast version that just mutates one array. Since this is read once for us here, this is ideal. + Iterator fastPassByReferenceIterator = factor.featuresTable.fastPassByReferenceIterator(); + int[] assignment = fastPassByReferenceIterator.next(); + while (true) { + setAssignmentLogValue(assignment, factor.featuresTable.getAssignmentValue(assignment).get().dotProduct(weights)); + // This mutates the assignment[] array, rather than creating a new one + if (fastPassByReferenceIterator.hasNext()) fastPassByReferenceIterator.next(); + else break; + } } - assert (cursor == size); - - values = new double[combinatorialNeighborStatesCount()]; - for (int[] assn : this) { - for (int i = 0; i < assn.length; i++) { - factorAssignment[forwardPointers[i]] = assn[i]; - } - setAssignmentLogValue(assn, factor.featuresTable.getAssignmentValue(factorAssignment).get().dotProduct(weights)); - } - } - - /** - * FOR PRIVATE USE AND TESTING ONLY - */ - TableFactor(int[] neighborIndices, int[] dimensions) { - super(dimensions); - this.neighborIndices = neighborIndices; - for (int i = 0; i < values.length; i++) { - values[i] = Double.NEGATIVE_INFINITY; - } - } - - /** - * Remove a variable by observing it at a certain value, return a new factor without that variable. 
- * - * @param variable the variable to be observed - * @param value the value the variable takes when observed - * @return a new factor with 'variable' in it - */ - public TableFactor observe(int variable, final int value) { - return marginalize(variable, 0, (marginalizedVariableValue, assignment) -> { - if (marginalizedVariableValue == value) { - return (old, n) -> { - // This would mean that we're observing something with 0 probability, which will wonk up downstream - // stuff - // assert(n != 0); - return n; - }; - } else { - return (old, n) -> old; - } - }); - } - - /** - * Returns the summed marginals for each element in the factor. These are represented in log space, and are summed - * using the numerically stable variant, even though it's slightly slower. - * - * @return an array of doubles one-to-one with variable states for each variable - */ - public double[][] getSummedMarginals() { - double[][] results = new double[neighborIndices.length][]; - for (int i = 0; i < neighborIndices.length; i++) { - results[i] = new double[getDimensions()[i]]; - } + /** + * Construct a TableFactor for inference within a model. This is the same as the other constructor, except that the + * table is observed out before any unnecessary dot products are done out, so hopefully we dramatically reduce the + * number of computations required to calculate the resulting table. + * + * Each element of the table is given by: t_i = exp(f_i*w) + * + * @param weights the vector to dot product with every element of the factor table + * @param factor the feature factor to be multiplied in + */ + public TableFactor(ConcatVector weights, GraphicalModel.Factor factor, int[] observations) { + super(); + assert(observations.length == factor.neigborIndices.length); + + int size = 0; + for (int observation : observations) if (observation == -1) size++; + + neighborIndices = new int[size]; + dimensions = new int[size]; + int[] forwardPointers = new int[size]; + int[] factorAssignment = new int[factor.neigborIndices.length]; + + int cursor = 0; + for (int i = 0; i < factor.neigborIndices.length; i++) { + if (observations[i] == -1) { + neighborIndices[cursor] = factor.neigborIndices[i]; + dimensions[cursor] = factor.featuresTable.getDimensions()[i]; + forwardPointers[cursor] = i; + cursor++; + } + else factorAssignment[i] = observations[i]; + } + assert(cursor == size); - double[][] maxValues = new double[neighborIndices.length][]; - for (int i = 0; i < neighborIndices.length; i++) { - maxValues[i] = new double[getDimensions()[i]]; - for (int j = 0; j < maxValues[i].length; j++) maxValues[i][j] = Double.NEGATIVE_INFINITY; - } + values = new double[combinatorialNeighborStatesCount()]; - // Get max values - - // OPTIMIZATION: - // Rather than use the standard iterator, which creates lots of int[] arrays on the heap, which need to be GC'd, - // we use the fast version that just mutates one array. Since this is read once for us here, this is ideal. 
- - Iterator fastPassByReferenceIterator = fastPassByReferenceIterator(); - int[] assignment = fastPassByReferenceIterator.next(); - while (true) { - double v = getAssignmentLogValue(assignment); - for (int i = 0; i < neighborIndices.length; i++) { - if (maxValues[i][assignment[i]] < v) maxValues[i][assignment[i]] = v; - } - // This mutates the resultAssignment[] array, rather than creating a new one - if (fastPassByReferenceIterator.hasNext()) { - fastPassByReferenceIterator.next(); - } else break; + for (int[] assn : this) { + for (int i = 0; i < assn.length; i++) { + factorAssignment[forwardPointers[i]] = assn[i]; + } + setAssignmentLogValue(assn, factor.featuresTable.getAssignmentValue(factorAssignment).get().dotProduct(weights)); + } } - // Do the summation - - // OPTIMIZATION: - // Rather than use the standard iterator, which creates lots of int[] arrays on the heap, which need to be GC'd, - // we use the fast version that just mutates one array. Since this is read once for us here, this is ideal. - - Iterator secondFastPassByReferenceIterator = fastPassByReferenceIterator(); - assignment = secondFastPassByReferenceIterator.next(); - while (true) { - double v = getAssignmentLogValue(assignment); - for (int i = 0; i < neighborIndices.length; i++) { - results[i][assignment[i]] += Math.exp(v - maxValues[i][assignment[i]]); - } - // This mutates the resultAssignment[] array, rather than creating a new one - if (secondFastPassByReferenceIterator.hasNext()) { - secondFastPassByReferenceIterator.next(); - } else break; + /** + * Remove a variable by observing it at a certain value, return a new factor without that variable. + * + * @param variable the variable to be observed + * @param value the value the variable takes when observed + * @return a new factor with 'variable' in it + */ + public TableFactor observe(int variable, final int value) { + return marginalize(variable, 0, (marginalizedVariableValue, assignment) -> { + if (marginalizedVariableValue == value) { + return (old,n) -> { + // This would mean that we're observing something with 0 probability, which will wonk up downstream + // stuff + // assert(n != 0); + return n; + }; + } + else { + return (old,n) -> old; + } + }); } - // normalize results, and move to linear space - - for (int i = 0; i < neighborIndices.length; i++) { - double sum = 0.0; - for (int j = 0; j < results[i].length; j++) { - results[i][j] = Math.exp(maxValues[i][j]) * results[i][j]; - sum += results[i][j]; - } - if (Double.isInfinite(sum)) { - for (int j = 0; j < results[i].length; j++) { - results[i][j] = 1.0 / results[i].length; + /** + * Returns the summed marginals for each element in the factor. These are represented in log space, and are summed + * using the numerically stable variant, even though it's slightly slower. 
+ * + * @return an array of doubles one-to-one with variable states for each variable + */ + public double[][] getSummedMarginals() { + double[][] results = new double[neighborIndices.length][]; + for (int i = 0; i < neighborIndices.length; i++) { + results[i] = new double[getDimensions()[i]]; } - } else { - for (int j = 0; j < results[i].length; j++) { - results[i][j] /= sum; + + double[][] maxValues = new double[neighborIndices.length][]; + for (int i = 0; i < neighborIndices.length; i++) { + maxValues[i] = new double[getDimensions()[i]]; + for (int j = 0; j < maxValues[i].length; j++) maxValues[i][j] = Double.NEGATIVE_INFINITY; } - } - } - return results; - } - - /** - * Convenience function to max out all but one variable, and return the marginal array. - * - * @return an array of doubles one-to-one with variable states for each variable - */ - public double[][] getMaxedMarginals() { - double[][] maxValues = new double[neighborIndices.length][]; - for (int i = 0; i < neighborIndices.length; i++) { - maxValues[i] = new double[getDimensions()[i]]; - for (int j = 0; j < maxValues[i].length; j++) maxValues[i][j] = Double.NEGATIVE_INFINITY; - } + // Get max values - // Get max values - - // OPTIMIZATION: - // Rather than use the standard iterator, which creates lots of int[] arrays on the heap, which need to be GC'd, - // we use the fast version that just mutates one array. Since this is read once for us here, this is ideal. - - Iterator fastPassByReferenceIterator = fastPassByReferenceIterator(); - int[] assignment = fastPassByReferenceIterator.next(); - while (true) { - double v = getAssignmentLogValue(assignment); - for (int i = 0; i < neighborIndices.length; i++) { - if (maxValues[i][assignment[i]] < v) maxValues[i][assignment[i]] = v; - } - // This mutates the resultAssignment[] array, rather than creating a new one - if (fastPassByReferenceIterator.hasNext()) { - fastPassByReferenceIterator.next(); - } else break; - } + // OPTIMIZATION: + // Rather than use the standard iterator, which creates lots of int[] arrays on the heap, which need to be GC'd, + // we use the fast version that just mutates one array. Since this is read once for us here, this is ideal. - for (int i = 0; i < neighborIndices.length; i++) { - normalizeLogArr(maxValues[i]); - } + Iterator fastPassByReferenceIterator = fastPassByReferenceIterator(); + int[] assignment = fastPassByReferenceIterator.next(); + while (true) { + double v = getAssignmentLogValue(assignment); + for (int i = 0; i < neighborIndices.length; i++) { + if (maxValues[i][assignment[i]] < v) maxValues[i][assignment[i]] = v; + } + // This mutates the resultAssignment[] array, rather than creating a new one + if (fastPassByReferenceIterator.hasNext()) { + fastPassByReferenceIterator.next(); + } + else break; + } + + // Do the summation + + // OPTIMIZATION: + // Rather than use the standard iterator, which creates lots of int[] arrays on the heap, which need to be GC'd, + // we use the fast version that just mutates one array. Since this is read once for us here, this is ideal. - return maxValues; - } - - /** - * Marginalize out a variable by taking the max value. - * - * @param variable the variable to be maxed out. - * @return a table factor that will contain the largest value of the variable being marginalized out. - */ - public TableFactor maxOut(int variable) { - return marginalize(variable, Double.NEGATIVE_INFINITY, (marginalizedVariableValue, assignment) -> Math::max); - } - - /** - * Marginalize out a variable by taking a sum. 
- * - * @param variable the variable to be summed out - * @return a factor with variable removed - */ - public TableFactor sumOut(int variable) { - - // OPTIMIZATION: This is by far the most common case, for linear chain inference, and is worth making fast - // We can use closed loops, and not bother with using the basic iterator to loop through indices. - // If this special case doesn't trip, we fall back to the standard (but slower) algorithm for the general case - - if (getDimensions().length == 2) { - if (neighborIndices[0] == variable) { - TableFactor marginalized = new TableFactor(new int[]{neighborIndices[1]}, new int[]{getDimensions()[1]}); - - for (int i = 0; i < marginalized.values.length; i++) marginalized.values[i] = 0; - - // We use the stable log-sum-exp trick here, so first we calculate the max - - double[] max = new double[getDimensions()[1]]; - for (int j = 0; j < getDimensions()[1]; j++) { - max[j] = Double.NEGATIVE_INFINITY; - } - - for (int i = 0; i < getDimensions()[0]; i++) { - int k = i * getDimensions()[1]; - for (int j = 0; j < getDimensions()[1]; j++) { - int index = k + j; - if (values[index] > max[j]) { - max[j] = values[index]; + Iterator secondFastPassByReferenceIterator = fastPassByReferenceIterator(); + assignment = secondFastPassByReferenceIterator.next(); + while (true) { + double v = getAssignmentLogValue(assignment); + for (int i = 0; i < neighborIndices.length; i++) { + results[i][assignment[i]] += Math.exp(v - maxValues[i][assignment[i]]); } - } + // This mutates the resultAssignment[] array, rather than creating a new one + if (secondFastPassByReferenceIterator.hasNext()) { + secondFastPassByReferenceIterator.next(); + } + else break; } - // Then we take the sum, minus the max + // normalize results, and move to linear space - for (int i = 0; i < getDimensions()[0]; i++) { - int k = i * getDimensions()[1]; - for (int j = 0; j < getDimensions()[1]; j++) { - int index = k + j; - if (Double.isFinite(max[j])) { - marginalized.values[j] += Math.exp(values[index] - max[j]); + for (int i = 0; i < neighborIndices.length; i++) { + double sum = 0.0; + for (int j = 0; j < results[i].length; j++) { + results[i][j] = Math.exp(maxValues[i][j]) * results[i][j]; + sum += results[i][j]; + } + if (Double.isInfinite(sum)) { + for (int j = 0; j < results[i].length; j++) { + results[i][j] = 1.0 / results[i].length; + } + } + else { + for (int j = 0; j < results[i].length; j++) { + results[i][j] /= sum; + } } - } } - // And now we exponentiate, and add back in the values + return results; + } - for (int j = 0; j < getDimensions()[1]; j++) { - if (Double.isFinite(max[j])) { - marginalized.values[j] = max[j] + Math.log(marginalized.values[j]); - } else { - marginalized.values[j] = max[j]; - } + /** + * Convenience function to max out all but one variable, and return the marginal array. 
+ * + * @return an array of doubles one-to-one with variable states for each variable + */ + public double[][] getMaxedMarginals() { + double[][] maxValues = new double[neighborIndices.length][]; + for (int i = 0; i < neighborIndices.length; i++) { + maxValues[i] = new double[getDimensions()[i]]; + for (int j = 0; j < maxValues[i].length; j++) maxValues[i][j] = Double.NEGATIVE_INFINITY; } - return marginalized; - } else { - assert (neighborIndices[1] == variable); - TableFactor marginalized = new TableFactor(new int[]{neighborIndices[0]}, new int[]{getDimensions()[0]}); + // Get max values - for (int i = 0; i < marginalized.values.length; i++) marginalized.values[i] = 0; + // OPTIMIZATION: + // Rather than use the standard iterator, which creates lots of int[] arrays on the heap, which need to be GC'd, + // we use the fast version that just mutates one array. Since this is read once for us here, this is ideal. - // We use the stable log-sum-exp trick here, so first we calculate the max + Iterator fastPassByReferenceIterator = fastPassByReferenceIterator(); + int[] assignment = fastPassByReferenceIterator.next(); + while (true) { + double v = getAssignmentLogValue(assignment); + for (int i = 0; i < neighborIndices.length; i++) { + if (maxValues[i][assignment[i]] < v) maxValues[i][assignment[i]] = v; + } + // This mutates the resultAssignment[] array, rather than creating a new one + if (fastPassByReferenceIterator.hasNext()) { + fastPassByReferenceIterator.next(); + } + else break; + } - double[] max = new double[getDimensions()[0]]; - for (int i = 0; i < getDimensions()[0]; i++) { - max[i] = Double.NEGATIVE_INFINITY; + for (int i = 0; i < neighborIndices.length; i++) { + normalizeLogArr(maxValues[i]); } - for (int i = 0; i < getDimensions()[0]; i++) { - int k = i * getDimensions()[1]; - for (int j = 0; j < getDimensions()[1]; j++) { - int index = k + j; - if (values[index] > max[i]) { - max[i] = values[index]; + return maxValues; + } + + /** + * Marginalize out a variable by taking the max value. + * + * @param variable the variable to be maxed out. + * @return a table factor that will contain the largest value of the variable being marginalized out. + */ + public TableFactor maxOut(int variable) { + return marginalize(variable, Double.NEGATIVE_INFINITY, (marginalizedVariableValue, assignment) -> Math::max); + } + + /** + * Marginalize out a variable by taking a sum. + * + * @param variable the variable to be summed out + * @return a factor with variable removed + */ + public TableFactor sumOut(int variable) { + + // OPTIMIZATION: This is by far the most common case, for linear chain inference, and is worth making fast + // We can use closed loops, and not bother with using the basic iterator to loop through indices. 
+ // If this special case doesn't trip, we fall back to the standard (but slower) algorithm for the general case + + if (getDimensions().length == 2) { + if (neighborIndices[0] == variable) { + TableFactor marginalized = new TableFactor(new int[]{neighborIndices[1]}, new int[]{getDimensions()[1]}); + + for (int i = 0; i < marginalized.values.length; i++) marginalized.values[i] = 0; + + // We use the stable log-sum-exp trick here, so first we calculate the max + + double[] max = new double[getDimensions()[1]]; + for (int j = 0; j < getDimensions()[1]; j++) { + max[j] = Double.NEGATIVE_INFINITY; + } + + for (int i = 0; i < getDimensions()[0]; i++) { + int k = i * getDimensions()[1]; + for (int j = 0; j < getDimensions()[1]; j++) { + int index = k + j; + if (values[index] > max[j]) { + max[j] = values[index]; + } + } + } + + // Then we take the sum, minus the max + + for (int i = 0; i < getDimensions()[0]; i++) { + int k = i * getDimensions()[1]; + for (int j = 0; j < getDimensions()[1]; j++) { + int index = k + j; + if (Double.isFinite(max[j])) { + marginalized.values[j] += Math.exp(values[index] - max[j]); + } + } + } + + // And now we exponentiate, and add back in the values + + for (int j = 0; j < getDimensions()[1]; j++) { + if (Double.isFinite(max[j])) { + marginalized.values[j] = max[j] + Math.log(marginalized.values[j]); + } + else { + marginalized.values[j] = max[j]; + } + } + + return marginalized; + } + else { + assert(neighborIndices[1] == variable); + TableFactor marginalized = new TableFactor(new int[]{neighborIndices[0]}, new int[]{getDimensions()[0]}); + + for (int i = 0; i < marginalized.values.length; i++) marginalized.values[i] = 0; + + // We use the stable log-sum-exp trick here, so first we calculate the max + + double[] max = new double[getDimensions()[0]]; + for (int i = 0; i < getDimensions()[0]; i++) { + max[i] = Double.NEGATIVE_INFINITY; + } + + for (int i = 0; i < getDimensions()[0]; i++) { + int k = i * getDimensions()[1]; + for (int j = 0; j < getDimensions()[1]; j++) { + int index = k + j; + if (values[index] > max[i]) { + max[i] = values[index]; + } + } + } + + // Then we take the sum, minus the max + + for (int i = 0; i < getDimensions()[0]; i++) { + int k = i * getDimensions()[1]; + for (int j = 0; j < getDimensions()[1]; j++) { + int index = k + j; + if (Double.isFinite(max[i])) { + marginalized.values[i] += Math.exp(values[index] - max[i]); + } + } + } + + // And now we exponentiate, and add back in the values + + for (int i = 0; i < getDimensions()[0]; i++) { + if (Double.isFinite(max[i])) { + marginalized.values[i] = max[i] + Math.log(marginalized.values[i]); + } + else { + marginalized.values[i] = max[i]; + } + } + + return marginalized; } - } } + else { + // This is a little tricky because we need to use the stable log-sum-exp trick on top of our marginalize + // dataflow operation. 
+ + // First we calculate all the max values to use as pivots to prevent overflow + TableFactor maxValues = maxOut(variable); - // Then we take the sum, minus the max + // Then we do the sum against an offset from the pivots + TableFactor marginalized = marginalize(variable, 0, (marginalizedVariableValue, assignment) -> (a, b) -> a + Math.exp(b - maxValues.getAssignmentLogValue(assignment))); - for (int i = 0; i < getDimensions()[0]; i++) { - int k = i * getDimensions()[1]; - for (int j = 0; j < getDimensions()[1]; j++) { - int index = k + j; - if (Double.isFinite(max[i])) { - marginalized.values[i] += Math.exp(values[index] - max[i]); + // Then we factor the max values back in, and + for (int[] assignment : marginalized) { + marginalized.setAssignmentLogValue(assignment, maxValues.getAssignmentLogValue(assignment) + Math.log(marginalized.getAssignmentLogValue(assignment))); } - } + + return marginalized; } + } + + /** + * Product two factors, taking the multiplication at the intersections. + * @param other the other factor to be multiplied + * @return a factor containing the union of both variable sets + */ + public TableFactor multiply(TableFactor other) { - // And now we exponentiate, and add back in the values + // Calculate the result domain - for (int i = 0; i < getDimensions()[0]; i++) { - if (Double.isFinite(max[i])) { - marginalized.values[i] = max[i] + Math.log(marginalized.values[i]); - } else { - marginalized.values[i] = max[i]; - } + List domain = new ArrayList<>(); + List otherDomain = new ArrayList<>(); + List resultDomain = new ArrayList<>(); + + for (int n : neighborIndices) { + domain.add(n); + resultDomain.add(n); + } + for (int n : other.neighborIndices) { + otherDomain.add(n); + if (!resultDomain.contains(n)) resultDomain.add(n); } - return marginalized; - } - } else { - // This is a little tricky because we need to use the stable log-sum-exp trick on top of our marginalize - // dataflow operation. 
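The comments above and below apply the stable log-sum-exp trick: find the per-column max, sum exp(value - max), then add the max back after taking the log. As a standalone illustration of why the pivot is needed (not part of the patch itself, a minimal sketch only), the same idea on a plain array looks like this:

// Minimal, self-contained sketch of the log-sum-exp trick used by sumOut() (illustrative only).
public class LogSumExpSketch {

  /** Returns log(sum_i exp(logValues[i])) without overflowing for large inputs. */
  static double logSumExp(double[] logValues) {
    double max = Double.NEGATIVE_INFINITY;
    for (double v : logValues) {
      if (v > max) max = v;
    }
    if (!Double.isFinite(max)) return max;  // all entries were -infinity
    double sum = 0.0;
    for (double v : logValues) {
      sum += Math.exp(v - max);  // each term is <= 1, so nothing overflows
    }
    return max + Math.log(sum);
  }

  public static void main(String[] args) {
    // Computing log(exp(1000) + exp(1000)) naively overflows to infinity;
    // the pivoted version returns 1000 + log(2), roughly 1000.6931.
    System.out.println(logSumExp(new double[]{1000.0, 1000.0}));
  }
}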
+ // Create result TableFactor + + int[] resultNeighborIndices = new int[resultDomain.size()]; + int[] resultDimensions = new int[resultNeighborIndices.length]; + for (int i = 0; i < resultDomain.size(); i++) { + int var = resultDomain.get(i); + resultNeighborIndices[i] = var; + // assert consistency about variable size, we can't have the same variable with two different sizes + assert((getVariableSize(var) == 0 && other.getVariableSize(var) > 0) || + (getVariableSize(var) > 0 && other.getVariableSize(var) == 0) || + (getVariableSize(var) == other.getVariableSize(var))); + resultDimensions[i] = Math.max(getVariableSize(resultDomain.get(i)), other.getVariableSize(resultDomain.get(i))); + } + TableFactor result = new TableFactor(resultNeighborIndices, resultDimensions); + + // OPTIMIZATION: + // If we're a factor of size 2 receiving a message of size 1, then we can optimize that pretty heavily + // We could just use the general algorithm at the end of this set of special cases, but this is the fastest way + if (otherDomain.size() == 1 && (resultDomain.size() == domain.size()) && domain.size() == 2) { + int msgVar = otherDomain.get(0); + int msgIndex = resultDomain.indexOf(msgVar); + + if (msgIndex == 0) { + for (int i = 0; i < resultDimensions[0]; i++) { + double d = other.values[i]; + int k = i * resultDimensions[1]; + for (int j = 0; j < resultDimensions[1]; j++) { + int index = k + j; + result.values[index] = values[index] + d; + } + } + } + else if (msgIndex == 1) { + for (int i = 0; i < resultDimensions[0]; i++) { + int k = i * resultDimensions[1]; + for (int j = 0; j < resultDimensions[1]; j++) { + int index = k + j; + result.values[index] = values[index] + other.values[j]; + } + } + } + } + // OPTIMIZATION: + // The special case where we're a message of size 1, and the other factor is receiving the message, and of size 2 + else if (domain.size() == 1 && (resultDomain.size() == otherDomain.size()) && resultDomain.size() == 2) { + return other.multiply(this); + } + // Otherwise we follow the big comprehensive, slow general purpose algorithm + else { - // First we calculate all the max values to use as pivots to prevent overflow - TableFactor maxValues = maxOut(variable); + // Calculate back-pointers from the result domain indices to original indices - // Then we do the sum against an offset from the pivots - TableFactor marginalized = marginalize(variable, 0, (marginalizedVariableValue, assignment) -> (a, b) -> a + Math.exp(b - maxValues.getAssignmentLogValue(assignment))); + int[] mapping = new int[result.neighborIndices.length]; + int[] otherMapping = new int[result.neighborIndices.length]; + for (int i = 0; i < result.neighborIndices.length; i++) { + mapping[i] = domain.indexOf(result.neighborIndices[i]); + otherMapping[i] = otherDomain.indexOf(result.neighborIndices[i]); + } - // Then we factor the max values back in, and - for (int[] assignment : marginalized) { - marginalized.setAssignmentLogValue(assignment, maxValues.getAssignmentLogValue(assignment) + Math.log(marginalized.getAssignmentLogValue(assignment))); - } + // Do the actual joining operation between the two tables, applying 'join' for each result element. + + int[] assignment = new int[neighborIndices.length]; + int[] otherAssignment = new int[other.neighborIndices.length]; + + // OPTIMIZATION: + // Rather than use the standard iterator, which creates lots of int[] arrays on the heap, which need to be GC'd, + // we use the fast version that just mutates one array. Since this is read once for us here, this is ideal. 
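Because TableFactor stores values in log space, the size-2-by-size-1 special cases above reduce the factor product to adding the message's log value along the shared dimension. A small numeric sketch with plain arrays (illustrative only, not the TableFactor API) of the msgIndex == 1 branch:

// Illustrative only: in log space, a factor product is elementwise addition along the shared variable.
public class LogSpaceProductSketch {
  public static void main(String[] args) {
    double[][] factor = {{0.1, 0.2, 0.3}, {0.4, 0.5, 0.6}};  // log values over variables (A, B)
    double[] message = {1.0, 2.0, 3.0};                       // log values over variable B
    double[][] product = new double[2][3];
    for (int a = 0; a < 2; a++) {
      for (int b = 0; b < 3; b++) {
        product[a][b] = factor[a][b] + message[b];  // log(x * y) = log(x) + log(y)
      }
    }
    System.out.println(java.util.Arrays.deepToString(product));
  }
}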
+ Iterator fastPassByReferenceIterator = result.fastPassByReferenceIterator(); + int[] resultAssignment = fastPassByReferenceIterator.next(); + while (true) { + // Set the assignment arrays correctly + for (int i = 0; i < resultAssignment.length; i++) { + if (mapping[i] != -1) assignment[mapping[i]] = resultAssignment[i]; + if (otherMapping[i] != -1) otherAssignment[otherMapping[i]] = resultAssignment[i]; + } + result.setAssignmentLogValue(resultAssignment, getAssignmentLogValue(assignment) + other.getAssignmentLogValue(otherAssignment)); + // This mutates the resultAssignment[] array, rather than creating a new one + if (fastPassByReferenceIterator.hasNext()) fastPassByReferenceIterator.next(); + else break; + } + } - return marginalized; + return result; } - } - - /** - * Product two factors, taking the multiplication at the intersections. - * - * @param other the other factor to be multiplied - * @return a factor containing the union of both variable sets - */ - public TableFactor multiply(TableFactor other) { - // Calculate the result domain + /** + * This is useful for calculating the partition function, and is exposed here because when implemented internally + * we can do a much more numerically stable summation. + * + * @return the sum of all values for all assignments to the TableFactor + */ + public double valueSum() { + + // We want the exp(log-sum-exp), for stability + // This rearranges to exp(a)*(sum-exp) + + double max = 0.0; + for (int[] assignment : this) { + double v = getAssignmentLogValue(assignment); + if (v > max) { + max = v; + } + } - List domain = new ArrayList<>(); - List otherDomain = new ArrayList<>(); - List resultDomain = new ArrayList<>(); + double sumExp = 0.0; + for (int[] assignment : this) { + sumExp += Math.exp(getAssignmentLogValue(assignment) - max); + } - for (int n : neighborIndices) { - domain.add(n); - resultDomain.add(n); - } - for (int n : other.neighborIndices) { - otherDomain.add(n); - if (!resultDomain.contains(n)) resultDomain.add(n); + return sumExp * Math.exp(max); } - // Create result TableFactor - - int[] resultNeighborIndices = new int[resultDomain.size()]; - int[] resultDimensions = new int[resultNeighborIndices.length]; - for (int i = 0; i < resultDomain.size(); i++) { - int var = resultDomain.get(i); - resultNeighborIndices[i] = var; - // assert consistency about variable size, we can't have the same variable with two different sizes - assert ((getVariableSize(var) == 0 && other.getVariableSize(var) > 0) || - (getVariableSize(var) > 0 && other.getVariableSize(var) == 0) || - (getVariableSize(var) == other.getVariableSize(var))); - resultDimensions[i] = Math.max(getVariableSize(resultDomain.get(i)), other.getVariableSize(resultDomain.get(i))); - } - TableFactor result = new TableFactor(resultNeighborIndices, resultDimensions); - - // OPTIMIZATION: - // If we're a factor of size 2 receiving a message of size 1, then we can optimize that pretty heavily - // We could just use the general algorithm at the end of this set of special cases, but this is the fastest way - if (otherDomain.size() == 1 && (resultDomain.size() == domain.size()) && domain.size() == 2) { - int msgVar = otherDomain.get(0); - int msgIndex = resultDomain.indexOf(msgVar); - - if (msgIndex == 0) { - for (int i = 0; i < resultDimensions[0]; i++) { - double d = other.values[i]; - int k = i * resultDimensions[1]; - for (int j = 0; j < resultDimensions[1]; j++) { - int index = k + j; - result.values[index] = values[index] + d; - } - } - } else if (msgIndex == 1) { - for (int 
i = 0; i < resultDimensions[0]; i++) { - int k = i * resultDimensions[1]; - for (int j = 0; j < resultDimensions[1]; j++) { - int index = k + j; - result.values[index] = values[index] + other.values[j]; - } - } - } - } - // OPTIMIZATION: - // The special case where we're a message of size 1, and the other factor is receiving the message, and of size 2 - else if (domain.size() == 1 && (resultDomain.size() == otherDomain.size()) && resultDomain.size() == 2) { - return other.multiply(this); - } - // Otherwise we follow the big comprehensive, slow general purpose algorithm - else { - - // Calculate back-pointers from the result domain indices to original indices - - int[] mapping = new int[result.neighborIndices.length]; - int[] otherMapping = new int[result.neighborIndices.length]; - for (int i = 0; i < result.neighborIndices.length; i++) { - mapping[i] = domain.indexOf(result.neighborIndices[i]); - otherMapping[i] = otherDomain.indexOf(result.neighborIndices[i]); - } - - // Do the actual joining operation between the two tables, applying 'join' for each result element. - - int[] assignment = new int[neighborIndices.length]; - int[] otherAssignment = new int[other.neighborIndices.length]; - - // OPTIMIZATION: - // Rather than use the standard iterator, which creates lots of int[] arrays on the heap, which need to be GC'd, - // we use the fast version that just mutates one array. Since this is read once for us here, this is ideal. - Iterator fastPassByReferenceIterator = result.fastPassByReferenceIterator(); - int[] resultAssignment = fastPassByReferenceIterator.next(); - while (true) { - // Set the assignment arrays correctly - for (int i = 0; i < resultAssignment.length; i++) { - if (mapping[i] != -1) assignment[mapping[i]] = resultAssignment[i]; - if (otherMapping[i] != -1) otherAssignment[otherMapping[i]] = resultAssignment[i]; - } - result.setAssignmentLogValue(resultAssignment, getAssignmentLogValue(assignment) + other.getAssignmentLogValue(otherAssignment)); - // This mutates the resultAssignment[] array, rather than creating a new one - if (fastPassByReferenceIterator.hasNext()) fastPassByReferenceIterator.next(); - else break; - } + /** + * Just a pass through to the NDArray version, plus a Math.exp to ensure that to the outside world the TableFactor + * doesn't look like it's in log-space + * + * @param assignment a list of variable settings, in the same order as the neighbors array of the factor + * @return the value of the assignment + */ + @Override + public double getAssignmentValue(int[] assignment) { + double d = super.getAssignmentValue(assignment); + // if (d == null) d = Double.NEGATIVE_INFINITY; + return Math.exp(d); } - return result; - } - - /** - * This is useful for calculating the partition function, and is exposed here because when implemented internally - * we can do a much more numerically stable summation. 
- * - * @return the sum of all values for all assignments to the TableFactor - */ - public double valueSum() { - - // We want the exp(log-sum-exp), for stability - // This rearranges to exp(a)*(sum-exp) - - double max = 0.0; - for (int[] assignment : this) { - double v = getAssignmentLogValue(assignment); - if (v > max) { - max = v; - } + /** + * Just a pass through to the NDArray version, plus a Math.log to ensure that to the outside world the TableFactor + * doesn't look like it's in log-space + * + * @param assignment a list of variable settings, in the same order as the neighbors array of the factor + * @param value the value to put into the factor table + */ + @Override + public void setAssignmentValue(int[] assignment, double value) { + super.setAssignmentValue(assignment, Math.log(value)); } - double sumExp = 0.0; - for (int[] assignment : this) { - sumExp += Math.exp(getAssignmentLogValue(assignment) - max); + //////////////////////////////////////////////////////////////////////////// + // PRIVATE IMPLEMENTATION + //////////////////////////////////////////////////////////////////////////// + + private double getAssignmentLogValue(int[] assignment) { + return super.getAssignmentValue(assignment); } - return sumExp * Math.exp(max); - } - - /** - * Just a pass through to the NDArray version, plus a Math.exp to ensure that to the outside world the TableFactor - * doesn't look like it's in log-space - * - * @param assignment a list of variable settings, in the same order as the neighbors array of the factor - * @return the value of the assignment - */ - @Override - public double getAssignmentValue(int[] assignment) { - double d = super.getAssignmentValue(assignment); - // if (d == null) d = Double.NEGATIVE_INFINITY; - return Math.exp(d); - } - - //////////////////////////////////////////////////////////////////////////// - // PRIVATE IMPLEMENTATION - //////////////////////////////////////////////////////////////////////////// - - /** - * Just a pass through to the NDArray version, plus a Math.log to ensure that to the outside world the TableFactor - * doesn't look like it's in log-space - * - * @param assignment a list of variable settings, in the same order as the neighbors array of the factor - * @param value the value to put into the factor table - */ - @Override - public void setAssignmentValue(int[] assignment, double value) { - super.setAssignmentValue(assignment, Math.log(value)); - } - - private double getAssignmentLogValue(int[] assignment) { - return super.getAssignmentValue(assignment); - } - - private void setAssignmentLogValue(int[] assignment, double value) { - super.setAssignmentValue(assignment, value); - } - - /** - * Marginalizes out a variable by applying an associative join operation for each possible assignment to the - * marginalized variable. - * - * @param variable the variable (by 'name', not offset into neighborIndices) - * @param startingValue associativeJoin is basically a foldr over a table, and this is the initialization - * @param curriedFoldr the associative function to use when applying the join operation, taking first the - * assignment to the value being marginalized, and then a foldr operation - * @return a new TableFactor that doesn't contain 'variable', where values were gotten through associative - * marginalization. 
- */ - private TableFactor marginalize(int variable, double startingValue, BiFunction> curriedFoldr) { - // Can't marginalize the last variable - assert (getDimensions().length > 1); - - // Calculate the result domain - - List resultDomain = new ArrayList<>(); - for (int n : neighborIndices) { - if (n != variable) { - resultDomain.add(n); - } + private void setAssignmentLogValue(int[] assignment, double value) { + super.setAssignmentValue(assignment, value); } - // Create result TableFactor + /** + * Marginalizes out a variable by applying an associative join operation for each possible assignment to the + * marginalized variable. + * + * @param variable the variable (by 'name', not offset into neighborIndices) + * @param startingValue associativeJoin is basically a foldr over a table, and this is the initialization + * @param curriedFoldr the associative function to use when applying the join operation, taking first the + * assignment to the value being marginalized, and then a foldr operation + * @return a new TableFactor that doesn't contain 'variable', where values were gotten through associative + * marginalization. + */ + private TableFactor marginalize(int variable, double startingValue, BiFunction> curriedFoldr) { + // Can't marginalize the last variable + assert(getDimensions().length > 1); + + // Calculate the result domain + + List resultDomain = new ArrayList<>(); + for (int n : neighborIndices) { + if (n != variable) { + resultDomain.add(n); + } + } - int[] resultNeighborIndices = new int[resultDomain.size()]; - int[] resultDimensions = new int[resultNeighborIndices.length]; - for (int i = 0; i < resultDomain.size(); i++) { - int var = resultDomain.get(i); - resultNeighborIndices[i] = var; - resultDimensions[i] = getVariableSize(var); - } - TableFactor result = new TableFactor(resultNeighborIndices, resultDimensions); + // Create result TableFactor - // Calculate forward-pointers from the old domain to new domain + int[] resultNeighborIndices = new int[resultDomain.size()]; + int[] resultDimensions = new int[resultNeighborIndices.length]; + for (int i = 0; i < resultDomain.size(); i++) { + int var = resultDomain.get(i); + resultNeighborIndices[i] = var; + resultDimensions[i] = getVariableSize(var); + } + TableFactor result = new TableFactor(resultNeighborIndices, resultDimensions); - int[] mapping = new int[neighborIndices.length]; - for (int i = 0; i < neighborIndices.length; i++) { - mapping[i] = resultDomain.indexOf(neighborIndices[i]); - } + // Calculate forward-pointers from the old domain to new domain - // Initialize + int[] mapping = new int[neighborIndices.length]; + for (int i = 0; i < neighborIndices.length; i++) { + mapping[i] = resultDomain.indexOf(neighborIndices[i]); + } - for (int[] assignment : result) { - result.setAssignmentLogValue(assignment, startingValue); - } + // Initialize + + for (int[] assignment : result) { + result.setAssignmentLogValue(assignment, startingValue); + } - // Do the actual fold into the result - - int[] resultAssignment = new int[result.neighborIndices.length]; - int marginalizedVariableValue = 0; - - // OPTIMIZATION: - // Rather than use the standard iterator, which creates lots of int[] arrays on the heap, which need to be GC'd, - // we use the fast version that just mutates one array. Since this is read once for us here, this is ideal. 
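The OPTIMIZATION comments in this file repeatedly reach for fastPassByReferenceIterator(), which reuses a single int[] rather than allocating a fresh assignment array on every step. A hedged usage sketch against the public NDArrayDoubles API that appears later in this patch (clone the assignment if you intend to keep it):

import edu.stanford.nlp.loglinear.model.NDArrayDoubles;
import java.util.Iterator;

// Illustrative only: contrasts the safe iterator with the pass-by-reference iterator described above.
public class FastIteratorSketch {
  public static void main(String[] args) {
    NDArrayDoubles table = new NDArrayDoubles(new int[]{2, 3});

    // The standard iterator hands out a cloned assignment array, so it is safe to hold on to.
    for (int[] assignment : table) {
      table.setAssignmentValue(assignment, assignment[0] + assignment[1]);
    }

    // The fast iterator mutates one array in place; call clone() before storing a reference to it.
    double sum = 0.0;
    Iterator<int[]> fast = table.fastPassByReferenceIterator();
    int[] assignment = fast.next();
    while (true) {
      sum += table.getAssignmentValue(assignment);
      if (fast.hasNext()) fast.next();  // overwrites 'assignment' in place instead of allocating
      else break;
    }
    System.out.println(sum);  // 2x3 table of (i + j) values, so this should print 9.0
  }
}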
- Iterator fastPassByReferenceIterator = fastPassByReferenceIterator(); - int[] assignment = fastPassByReferenceIterator.next(); - while (true) { - // Set the assignment arrays correctly - for (int i = 0; i < assignment.length; i++) { - if (mapping[i] != -1) resultAssignment[mapping[i]] = assignment[i]; - else marginalizedVariableValue = assignment[i]; - } - result.setAssignmentLogValue(resultAssignment, curriedFoldr.apply(marginalizedVariableValue, resultAssignment) - .apply(result.getAssignmentLogValue(resultAssignment), getAssignmentLogValue(assignment))); - if (fastPassByReferenceIterator.hasNext()) fastPassByReferenceIterator.next(); - else break; + // Do the actual fold into the result + + int[] resultAssignment = new int[result.neighborIndices.length]; + int marginalizedVariableValue = 0; + + // OPTIMIZATION: + // Rather than use the standard iterator, which creates lots of int[] arrays on the heap, which need to be GC'd, + // we use the fast version that just mutates one array. Since this is read once for us here, this is ideal. + Iterator fastPassByReferenceIterator = fastPassByReferenceIterator(); + int[] assignment = fastPassByReferenceIterator.next(); + while (true) { + // Set the assignment arrays correctly + for (int i = 0; i < assignment.length; i++) { + if (mapping[i] != -1) resultAssignment[mapping[i]] = assignment[i]; + else marginalizedVariableValue = assignment[i]; + } + result.setAssignmentLogValue(resultAssignment, curriedFoldr.apply(marginalizedVariableValue, resultAssignment) + .apply(result.getAssignmentLogValue(resultAssignment), getAssignmentLogValue(assignment))); + if (fastPassByReferenceIterator.hasNext()) fastPassByReferenceIterator.next(); + else break; + } + + return result; } - return result; - } - - /** - * Address a variable by index to get it's size. Basically just a convenience function. - * - * @param variable the name, not index into neighbors, of the variable in question - * @return the size of the factor along this dimension - */ - private int getVariableSize(int variable) { - for (int i = 0; i < neighborIndices.length; i++) { - if (neighborIndices[i] == variable) return getDimensions()[i]; + /** + * Address a variable by index to get it's size. Basically just a convenience function. 
+ * + * @param variable the name, not index into neighbors, of the variable in question + * @return the size of the factor along this dimension + */ + private int getVariableSize(int variable) { + for (int i = 0; i < neighborIndices.length; i++) { + if (neighborIndices[i] == variable) return getDimensions()[i]; + } + return 0; } - return 0; - } - - /** - * Super basic in-place array normalization - * - * @param arr the array to normalize - */ - private void normalizeLogArr(double[] arr) { - // Find the log-scale normalization value - double max = Double.NEGATIVE_INFINITY; - for (double d : arr) { - if (d > max) max = d; + + /** + * Super basic in-place array normalization + * @param arr the array to normalize + */ + private void normalizeLogArr(double[] arr) { + // Find the log-scale normalization value + double max = Double.NEGATIVE_INFINITY; + for (double d : arr) { + if (d > max) max = d; + } + double expSum = 0.0; + for (double d : arr) { + expSum += Math.exp(d-max); + } + double logSumExp = max + Math.log(expSum); + + if (Double.isInfinite(logSumExp)) { + // Just put in uniform probabilities if we are normalizing all 0s + for (int i = 0; i < arr.length; i++) { + arr[i] = 1.0 / arr.length; + } + } + else { + // Normalize in log-scale before exponentiation, to help with stability + for (int i = 0; i < arr.length; i++) { + arr[i] = Math.exp(arr[i] - logSumExp); + } + } } - double expSum = 0.0; - for (double d : arr) { - expSum += Math.exp(d - max); + + /** + * FOR PRIVATE USE AND TESTING ONLY + */ + TableFactor(int[] neighborIndices, int[] dimensions) { + super(dimensions); + this.neighborIndices = neighborIndices; + for (int i = 0; i < values.length; i++) { + values[i] = Double.NEGATIVE_INFINITY; + } } - double logSumExp = max + Math.log(expSum); - - if (Double.isInfinite(logSumExp)) { - // Just put in uniform probabilities if we are normalizing all 0s - for (int i = 0; i < arr.length; i++) { - arr[i] = 1.0 / arr.length; - } - } else { - // Normalize in log-scale before exponentiation, to help with stability - for (int i = 0; i < arr.length; i++) { - arr[i] = Math.exp(arr[i] - logSumExp); - } + + @SuppressWarnings("*") + private boolean assertsEnabled() { + boolean assertsEnabled = false; + assert(assertsEnabled = true); // intentional side effect + return assertsEnabled; } - } - - @SuppressWarnings("*") - private boolean assertsEnabled() { - boolean assertsEnabled = false; - assert (assertsEnabled = true); // intentional side effect - return assertsEnabled; - } } diff --git a/src/edu/stanford/nlp/loglinear/model/ConcatVectorTable.java b/src/edu/stanford/nlp/loglinear/model/ConcatVectorTable.java index fde42ee5d1..35db17144d 100644 --- a/src/edu/stanford/nlp/loglinear/model/ConcatVectorTable.java +++ b/src/edu/stanford/nlp/loglinear/model/ConcatVectorTable.java @@ -15,146 +15,146 @@ * @author keenon */ public class ConcatVectorTable extends NDArray> { - NDArray> originalThunks = null; - - /** - * Constructor takes a list of neighbor variables to use for this factor. This must not change after construction, - * and the number of states of those variables must also not change. - * - * @param dimensions list of neighbor variables assignment range sizes - */ - public ConcatVectorTable(int[] dimensions) { - super(dimensions); - } - - /** - * Convenience function to read a factor (assumed serialized with proto) directly from a stream. 
- * - * @param stream the stream to be read from - * @return a new in-memory feature factor - * @throws IOException - */ - public static ConcatVectorTable readFromStream(InputStream stream) throws IOException { - return readFromProto(ConcatVectorTableProto.ConcatVectorTable.parseFrom(stream)); - } - - /** - * Creates a new in-memory feature factor from a proto serialization, - * - * @param proto the proto object to be turned into an in-memory feature factor - * @return an in-memory feature factor, complete with in-memory concat vectors - */ - public static ConcatVectorTable readFromProto(ConcatVectorTableProto.ConcatVectorTable proto) { - int[] neighborSizes = new int[proto.getDimensionSizeCount()]; - for (int i = 0; i < neighborSizes.length; i++) { - neighborSizes[i] = proto.getDimensionSize(i); + /** + * Constructor takes a list of neighbor variables to use for this factor. This must not change after construction, + * and the number of states of those variables must also not change. + * + * @param dimensions list of neighbor variables assignment range sizes + */ + public ConcatVectorTable(int[] dimensions) { + super(dimensions); } - ConcatVectorTable factor = new ConcatVectorTable(neighborSizes); - int i = 0; - for (int[] assignment : factor) { - final ConcatVector vector = ConcatVector.readFromProto(proto.getFactorTable(i)); - factor.setAssignmentValue(assignment, () -> vector); - i++; + + /** + * Convenience function to write this factor directly to a stream, encoded as proto. Reversible with readFromStream. + * + * @param stream the stream to write to. does not flush automatically + * @throws IOException + */ + public void writeToStream(OutputStream stream) throws IOException { + getProtoBuilder().build().writeTo(stream); + } + + /** + * Convenience function to read a factor (assumed serialized with proto) directly from a stream. + * + * @param stream the stream to be read from + * @return a new in-memory feature factor + * @throws IOException + */ + public static ConcatVectorTable readFromStream(InputStream stream) throws IOException { + return readFromProto(ConcatVectorTableProto.ConcatVectorTable.parseFrom(stream)); } - return factor; - } - - /** - * Convenience function to write this factor directly to a stream, encoded as proto. Reversible with readFromStream. - * - * @param stream the stream to write to. does not flush automatically - * @throws IOException - */ - public void writeToStream(OutputStream stream) throws IOException { - getProtoBuilder().build().writeTo(stream); - } - - /** - * Returns the proto builder object for this feature factor. Recursively constructs protos for all the concat - * vectors in factorTable. - * - * @return proto Builder object - */ - public ConcatVectorTableProto.ConcatVectorTable.Builder getProtoBuilder() { - ConcatVectorTableProto.ConcatVectorTable.Builder b = ConcatVectorTableProto.ConcatVectorTable.newBuilder(); - for (int n : getDimensions()) { - b.addDimensionSize(n); + + /** + * Returns the proto builder object for this feature factor. Recursively constructs protos for all the concat + * vectors in factorTable. 
+ * + * @return proto Builder object + */ + public ConcatVectorTableProto.ConcatVectorTable.Builder getProtoBuilder() { + ConcatVectorTableProto.ConcatVectorTable.Builder b = ConcatVectorTableProto.ConcatVectorTable.newBuilder(); + for (int n : getDimensions()) { + b.addDimensionSize(n); + } + for (int[] assignment : this) { + b.addFactorTable(getAssignmentValue(assignment).get().getProtoBuilder()); + } + return b; } - for (int[] assignment : this) { - b.addFactorTable(getAssignmentValue(assignment).get().getProtoBuilder()); + + /** + * Creates a new in-memory feature factor from a proto serialization, + * + * @param proto the proto object to be turned into an in-memory feature factor + * @return an in-memory feature factor, complete with in-memory concat vectors + */ + public static ConcatVectorTable readFromProto(ConcatVectorTableProto.ConcatVectorTable proto) { + int[] neighborSizes = new int[proto.getDimensionSizeCount()]; + for (int i = 0; i < neighborSizes.length; i++) { + neighborSizes[i] = proto.getDimensionSize(i); + } + ConcatVectorTable factor = new ConcatVectorTable(neighborSizes); + int i = 0; + for (int[] assignment : factor) { + final ConcatVector vector = ConcatVector.readFromProto(proto.getFactorTable(i)); + factor.setAssignmentValue(assignment, () -> vector); + i++; + } + return factor; } - return b; - } - - /** - * Deep comparison for equality of value, plus tolerance, for every concatvector in the table, plus dimensional - * arrangement. This is mostly useful for testing. - * - * @param other the vector table to compare against - * @param tolerance the tolerance to use in value comparisons - * @return whether the two tables are equivalent by value - */ - public boolean valueEquals(ConcatVectorTable other, double tolerance) { - if (!Arrays.equals(other.getDimensions(), getDimensions())) return false; - for (int[] assignment : this) { - if (!getAssignmentValue(assignment).get().valueEquals(other.getAssignmentValue(assignment).get(), tolerance)) { - return false; - } + + /** + * Deep comparison for equality of value, plus tolerance, for every concatvector in the table, plus dimensional + * arrangement. This is mostly useful for testing. + * + * @param other the vector table to compare against + * @param tolerance the tolerance to use in value comparisons + * @return whether the two tables are equivalent by value + */ + public boolean valueEquals(ConcatVectorTable other, double tolerance) { + if (!Arrays.equals(other.getDimensions(), getDimensions())) return false; + for (int[] assignment : this) { + if (!getAssignmentValue(assignment).get().valueEquals(other.getAssignmentValue(assignment).get(), tolerance)) { + return false; + } + } + return true; } - return true; - } - - /** - * This is an optimization that will fault all the ConcatVectors into memory, and future .get() on the Supplier objs - * will result in a very fast return by reference. Basically this works by wrapping the output of the old thunks - * inside new, thinner closures that carry around the answer in memory. This is a no-op if vectors were already - * cached. - */ - public void cacheVectors() { - if (originalThunks != null) return; - - originalThunks = new NDArray<>(getDimensions()); - - // OPTIMIZATION: - // Rather than use the standard iterator, which creates lots of int[] arrays on the heap, which need to be GC'd, - // we use the fast version that just mutates one array. Since this is read once for us here, this is ideal. 
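The stream helpers above (writeToStream / readFromStream) are easiest to see as a round trip. A hedged sketch follows; it assumes ConcatVector's one-int constructor and setDenseComponent(int, double[]) behave as elsewhere in the loglinear package (both are assumptions, not taken from this patch):

import edu.stanford.nlp.loglinear.model.ConcatVector;
import edu.stanford.nlp.loglinear.model.ConcatVectorTable;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;

// Illustrative only: round-trips a tiny table through the stream helpers shown above.
public class ConcatVectorTableRoundTripSketch {
  public static void main(String[] args) throws IOException {
    ConcatVectorTable table = new ConcatVectorTable(new int[]{2, 2});
    for (int[] assignment : table) {
      // Assumption: ConcatVector(int) and setDenseComponent(int, double[]) as used elsewhere in loglinear.
      final ConcatVector vector = new ConcatVector(1);
      vector.setDenseComponent(0, new double[]{assignment[0], assignment[1]});
      table.setAssignmentValue(assignment, () -> vector);
    }

    ByteArrayOutputStream out = new ByteArrayOutputStream();
    table.writeToStream(out);
    ConcatVectorTable copy = ConcatVectorTable.readFromStream(new ByteArrayInputStream(out.toByteArray()));
    System.out.println(table.valueEquals(copy, 1e-9));  // expected: true
  }
}

On the inference side, cacheVectors() (just below) swaps each stored thunk for a thin closure over its already-computed ConcatVector, and releaseCache() later restores the original thunks so the cached vectors can be garbage collected.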
- Iterator fastPassByReferenceIterator = fastPassByReferenceIterator(); - int[] assignment = fastPassByReferenceIterator.next(); - while (true) { - Supplier originalThunk = getAssignmentValue(assignment); - originalThunks.setAssignmentValue(assignment, originalThunk); - - // Construct a new, thinner closure around the cached value - ConcatVector result = originalThunk.get(); - setAssignmentValue(assignment, () -> result); - - // Set the assignment arrays correctly - if (fastPassByReferenceIterator.hasNext()) fastPassByReferenceIterator.next(); - else break; + + NDArray> originalThunks = null; + + /** + * This is an optimization that will fault all the ConcatVectors into memory, and future .get() on the Supplier objs + * will result in a very fast return by reference. Basically this works by wrapping the output of the old thunks + * inside new, thinner closures that carry around the answer in memory. This is a no-op if vectors were already + * cached. + */ + public void cacheVectors() { + if (originalThunks != null) return; + + originalThunks = new NDArray<>(getDimensions()); + + // OPTIMIZATION: + // Rather than use the standard iterator, which creates lots of int[] arrays on the heap, which need to be GC'd, + // we use the fast version that just mutates one array. Since this is read once for us here, this is ideal. + Iterator fastPassByReferenceIterator = fastPassByReferenceIterator(); + int[] assignment = fastPassByReferenceIterator.next(); + while (true) { + Supplier originalThunk = getAssignmentValue(assignment); + originalThunks.setAssignmentValue(assignment, originalThunk); + + // Construct a new, thinner closure around the cached value + ConcatVector result = originalThunk.get(); + setAssignmentValue(assignment, () -> result); + + // Set the assignment arrays correctly + if (fastPassByReferenceIterator.hasNext()) fastPassByReferenceIterator.next(); + else break; + } } - } - - /** - * This will release references to the cached ConcatVectors created by cacheVectors(), so that they can be cleaned - * up by the GC. If no cache was constructed, this is a no-op. - */ - public void releaseCache() { - if (originalThunks != null) { - // OPTIMIZATION: - // Rather than use the standard iterator, which creates lots of int[] arrays on the heap, which need to be GC'd, - // we use the fast version that just mutates one array. Since this is read once for us here, this is ideal. - Iterator fastPassByReferenceIterator = fastPassByReferenceIterator(); - int[] assignment = fastPassByReferenceIterator.next(); - while (true) { - setAssignmentValue(assignment, originalThunks.getAssignmentValue(assignment)); - - // Set the assignment arrays correctly - if (fastPassByReferenceIterator.hasNext()) fastPassByReferenceIterator.next(); - else break; - } - // Release our replicated set of original thunks - originalThunks = null; + + /** + * This will release references to the cached ConcatVectors created by cacheVectors(), so that they can be cleaned + * up by the GC. If no cache was constructed, this is a no-op. + */ + public void releaseCache() { + if (originalThunks != null) { + // OPTIMIZATION: + // Rather than use the standard iterator, which creates lots of int[] arrays on the heap, which need to be GC'd, + // we use the fast version that just mutates one array. Since this is read once for us here, this is ideal. 
+ Iterator fastPassByReferenceIterator = fastPassByReferenceIterator(); + int[] assignment = fastPassByReferenceIterator.next(); + while (true) { + setAssignmentValue(assignment, originalThunks.getAssignmentValue(assignment)); + + // Set the assignment arrays correctly + if (fastPassByReferenceIterator.hasNext()) fastPassByReferenceIterator.next(); + else break; + } + // Release our replicated set of original thunks + originalThunks = null; + } } - } } diff --git a/src/edu/stanford/nlp/loglinear/model/NDArrayDoubles.java b/src/edu/stanford/nlp/loglinear/model/NDArrayDoubles.java index e1517627e3..cb692f445d 100644 --- a/src/edu/stanford/nlp/loglinear/model/NDArrayDoubles.java +++ b/src/edu/stanford/nlp/loglinear/model/NDArrayDoubles.java @@ -4,161 +4,157 @@ /** * Created by keenon on 9/12/15. - *
+ * * Holds and provides access to an N-dimensional array. - *
+ * * Yes, generics will lead to unfortunate boxing and unboxing in the TableFactor case, we'll handle that if it becomes a * problem. */ public class NDArrayDoubles implements Iterable { - // public data - protected int[] dimensions; - - // OPTIMIZATION: - // in normal NDArray this is private, but to allow for optimizations we actually leave it as protected - protected double[] values; - - /** - * Constructor takes a list of neighbor variables to use for this factor. This must not change after construction, - * and the number of states of those variables must also not change. - * - * @param dimensions list of neighbor variables assignment range sizes - */ - public NDArrayDoubles(int[] dimensions) { - for (int size : dimensions) { - assert (size > 0); - } - this.dimensions = dimensions; - values = new double[combinatorialNeighborStatesCount()]; - } - - /** - * This is to enable the partially observed constructor for TableFactor. It's an ugly break of modularity, but seems - * to be necessary if we want to keep the constructor for TableFactor with partial observations relatively simple. - */ - protected NDArrayDoubles() { - } - - /** - * Set a single value in the factor table. - * - * @param assignment a list of variable settings, in the same order as the neighbors array of the factor - * @param value the value to put into the factor table - */ - public void setAssignmentValue(int[] assignment, double value) { - values[getTableAccessOffset(assignment)] = value; - } - - /** - * Retrieve a single value for an assignment. - * - * @param assignment a list of variable settings, in the same order as the neighbors array of the factor - * @return the value for the given assignment. Can be null if not been set yet. - */ - public double getAssignmentValue(int[] assignment) { - return values[getTableAccessOffset(assignment)]; - } - - /** - * @return the size array of the neighbors of the feature factor, passed by value to ensure immutability. - */ - public int[] getDimensions() { - return dimensions.clone(); - } - - /** - * WARNING: This is pass by reference to avoid massive GC overload during heavy iterations, and because the standard - * use case is to use the assignments array as an accessor. Please, clone if you save a copy, otherwise the array - * will mutate underneath you. - * - * @return an iterator over all possible assignments to this factor - */ - @Override - public Iterator iterator() { - return new Iterator() { - Iterator unsafe = fastPassByReferenceIterator(); - - @Override - public boolean hasNext() { - return unsafe.hasNext(); - } - - @Override - public int[] next() { - return unsafe.next().clone(); - } - }; - } - - /** - * This is its own function because people will inevitably attempt this optimization of not cloning the array we - * hand to the iterator, to save on GC, and it should not be default behavior. If you know what you're doing, then - * this may be the iterator for you. 
- * - * @return an iterator that will mutate the value it returns to you, so you must clone if you want to keep a copy - */ - public Iterator fastPassByReferenceIterator() { - final int[] assignments = new int[dimensions.length]; - if (dimensions.length > 0) assignments[0] = -1; - - return new Iterator() { - @Override - public boolean hasNext() { - for (int i = 0; i < assignments.length; i++) { - if (assignments[i] < dimensions[i] - 1) return true; + // public data + protected int[] dimensions; + + // OPTIMIZATION: + // in normal NDArray this is private, but to allow for optimizations we actually leave it as protected + protected double[] values; + + /** + * Constructor takes a list of neighbor variables to use for this factor. This must not change after construction, + * and the number of states of those variables must also not change. + * + * @param dimensions list of neighbor variables assignment range sizes + */ + public NDArrayDoubles(int[] dimensions) { + for (int size : dimensions) { + assert(size > 0); } - return false; - } - - @Override - public int[] next() { - // Add one to the first position - assignments[0]++; - // Carry any resulting overflow all the way to the end. - for (int i = 0; i < assignments.length; i++) { - if (assignments[i] >= dimensions[i]) { - assignments[i] = 0; - if (i < assignments.length - 1) { - assignments[i + 1]++; + this.dimensions = dimensions; + values = new double[combinatorialNeighborStatesCount()]; + } + + /** + * This is to enable the partially observed constructor for TableFactor. It's an ugly break of modularity, but seems + * to be necessary if we want to keep the constructor for TableFactor with partial observations relatively simple. + */ + protected NDArrayDoubles() {} + + /** + * Set a single value in the factor table. + * @param assignment a list of variable settings, in the same order as the neighbors array of the factor + * @param value the value to put into the factor table + */ + public void setAssignmentValue(int[] assignment, double value) { + values[getTableAccessOffset(assignment)] = value; + } + + /** + * Retrieve a single value for an assignment. + * @param assignment a list of variable settings, in the same order as the neighbors array of the factor + * @return the value for the given assignment. Can be null if not been set yet. + */ + public double getAssignmentValue(int[] assignment) { + return values[getTableAccessOffset(assignment)]; + } + + /** + * @return the size array of the neighbors of the feature factor, passed by value to ensure immutability. + */ + public int[] getDimensions() { + return dimensions.clone(); + } + + /** + * WARNING: This is pass by reference to avoid massive GC overload during heavy iterations, and because the standard + * use case is to use the assignments array as an accessor. Please, clone if you save a copy, otherwise the array + * will mutate underneath you. + * + * @return an iterator over all possible assignments to this factor + */ + @Override + public Iterator iterator() { + return new Iterator() { + Iterator unsafe = fastPassByReferenceIterator(); + @Override + public boolean hasNext() { + return unsafe.hasNext(); + } + + @Override + public int[] next() { + return unsafe.next().clone(); + } + }; + } + + /** + * This is its own function because people will inevitably attempt this optimization of not cloning the array we + * hand to the iterator, to save on GC, and it should not be default behavior. If you know what you're doing, then + * this may be the iterator for you. 
+ * + * @return an iterator that will mutate the value it returns to you, so you must clone if you want to keep a copy + */ + public Iterator fastPassByReferenceIterator() { + final int[] assignments = new int[dimensions.length]; + if (dimensions.length > 0) assignments[0] = -1; + + return new Iterator() { + @Override + public boolean hasNext() { + for (int i = 0; i < assignments.length; i++) { + if (assignments[i] < dimensions[i]-1) return true; + } + return false; + } + + @Override + public int[] next() { + // Add one to the first position + assignments[0] ++; + // Carry any resulting overflow all the way to the end. + for (int i = 0; i < assignments.length; i++) { + if (assignments[i] >= dimensions[i]) { + assignments[i] = 0; + if (i < assignments.length-1) { + assignments[i + 1]++; + } + } + else { + break; + } + } + return assignments; } - } else { - break; - } + }; + } + + /** + * @return the total number of states this factor must represent to include all neighbors. + */ + public int combinatorialNeighborStatesCount() { + int c = 1; + for (int n : dimensions) { + c *= n; } - return assignments; - } - }; - } - - /** - * @return the total number of states this factor must represent to include all neighbors. - */ - public int combinatorialNeighborStatesCount() { - int c = 1; - for (int n : dimensions) { - c *= n; + return c; } - return c; - } - - //////////////////////////////////////////////////////////////////////////// - // PRIVATE IMPLEMENTATION - //////////////////////////////////////////////////////////////////////////// - - /** - * Compute the distance into the one dimensional factorTable array that corresponds to a setting of all the - * neighbors of the factor. - * - * @param assignment assignment indices, in same order as neighbors array - * @return the offset index - */ - private int getTableAccessOffset(int[] assignment) { - assert (assignment.length == dimensions.length); - int offset = 0; - for (int i = 0; i < assignment.length; i++) { - assert (assignment[i] < dimensions[i]); - offset = (offset * dimensions[i]) + assignment[i]; + + //////////////////////////////////////////////////////////////////////////// + // PRIVATE IMPLEMENTATION + //////////////////////////////////////////////////////////////////////////// + + /** + * Compute the distance into the one dimensional factorTable array that corresponds to a setting of all the + * neighbors of the factor. + * @param assignment assignment indices, in same order as neighbors array + * @return the offset index + */ + private int getTableAccessOffset(int[] assignment) { + assert(assignment.length == dimensions.length); + int offset = 0; + for (int i = 0; i < assignment.length; i++) { + assert(assignment[i] < dimensions[i]); + offset = (offset*dimensions[i]) + assignment[i]; + } + return offset; } - return offset; - } } diff --git a/src/edu/stanford/nlp/naturalli/ClauseSplitterSearchProblem.java b/src/edu/stanford/nlp/naturalli/ClauseSplitterSearchProblem.java index f71eece7b5..6bc12f44a1 100644 --- a/src/edu/stanford/nlp/naturalli/ClauseSplitterSearchProblem.java +++ b/src/edu/stanford/nlp/naturalli/ClauseSplitterSearchProblem.java @@ -49,7 +49,7 @@ public class ClauseSplitterSearchProblem { /** * A specification for clause splits we _always_ want to do. The format is a map from the edge label we are splitting, to - * the preference for the type of split we should do. The most preferred is at the front of the list, and then it backs off + * the preference for the type of split we should do. 
The most prefered is at the front of the list, and then it backs off * to the less and less preferred split types. */ protected static final Map> HARD_SPLITS = Collections.unmodifiableMap(new HashMap>() {{ @@ -106,9 +106,6 @@ public class ClauseSplitterSearchProblem { * A mapping from a word to the extra edges that come out of it. */ private final Map> extraEdgesByGovernor = new HashMap<>(); - /** - * A mapping from a word to the extra edges that to into it. - */ private final Map> extraEdgesByDependent = new HashMap<>(); /** * The classifier for whether a particular dependency edge defines a clause boundary. @@ -897,7 +894,7 @@ protected void search( * The default featurizer to use during training. */ public static final Featurizer DEFAULT_FEATURIZER = new Featurizer() { - private static final long serialVersionUID = 4145523451314579506l; + private static final long serialVersionUID = 42L; @Override public boolean isSimpleSplit(Counter feats) { for (String key : feats.keySet()) { diff --git a/src/edu/stanford/nlp/naturalli/RelationTripleSegmenter.java b/src/edu/stanford/nlp/naturalli/RelationTripleSegmenter.java index 506218cbec..2bcff1895c 100644 --- a/src/edu/stanford/nlp/naturalli/RelationTripleSegmenter.java +++ b/src/edu/stanford/nlp/naturalli/RelationTripleSegmenter.java @@ -397,7 +397,6 @@ protected Optional> getValidChunk(SemanticGraph parse, IndexedWo Set validArcs, Optional ignoredArc, boolean allowExtraArcs) { PriorityQueue chunk = new FixedPrioritiesPriorityQueue<>(); - BitSet seenIndices = new BitSet(); Queue fringe = new LinkedList<>(); IndexedWord root = originalRoot; fringe.add(root); @@ -417,13 +416,17 @@ protected Optional> getValidChunk(SemanticGraph parse, IndexedWo while (!fringe.isEmpty()) { root = fringe.poll(); chunk.add(root.backingLabel(), -root.index()); - - // Sanity check to prevent infinite loops - if (seenIndices.get(root.index())) { - // TODO(gabor) Indicates a cycle in the tree! 
- return Optional.empty(); + for (SemanticGraphEdge edge : parse.incomingEdgeIterable(root)) { + if (edge.getDependent() != originalRoot) { + String relStr = edge.getRelation().toString(); + if ((relStr.startsWith("nmod:") && + !"nmod:poss".equals(relStr) && + !"nmod:npmod".equals(relStr) + ) || + relStr.startsWith("acl:") || relStr.startsWith("advcl:")) { + } + } } - seenIndices.set(root.index()); // Check outgoing edges boolean hasConj = false; @@ -439,7 +442,7 @@ protected Optional> getValidChunk(SemanticGraph parse, IndexedWo } else if (edge.getDependent() == primaryCase) { // noop: ignore case edge } else if (ignoredArc.isPresent() && - (ignoredArc.get().equals(name) || (ignoredArc.get().startsWith("conj") && name.equals("cc")))) { + (ignoredArc.get().equals(name) || ignoredArc.get().startsWith("conj") && name.equals("cc"))) { // noop; ignore explicitly requested noop arc, or "CC" if the noop arc is a conj:* } else if (!validArcs.contains(edge.getRelation().getShortName()) && !validArcs.contains(edge.getRelation().getShortName().replaceAll(":.*",":*"))) { if (!allowExtraArcs) { diff --git a/src/edu/stanford/nlp/pipeline/ChunkAnnotationUtils.java b/src/edu/stanford/nlp/pipeline/ChunkAnnotationUtils.java index 08cc368a83..133b199597 100644 --- a/src/edu/stanford/nlp/pipeline/ChunkAnnotationUtils.java +++ b/src/edu/stanford/nlp/pipeline/ChunkAnnotationUtils.java @@ -1,7 +1,6 @@ package edu.stanford.nlp.pipeline; import edu.stanford.nlp.ling.AnnotationLookup; -import edu.stanford.nlp.ling.CoreAnnotation; import edu.stanford.nlp.ling.CoreAnnotations; import edu.stanford.nlp.ling.CoreLabel; import edu.stanford.nlp.process.CoreLabelTokenFactory; @@ -187,11 +186,10 @@ public static boolean fixChunkTokenBoundaries(CoreMap docAnnotation, List chunkList, String origText, - int chunkIndexStart, int chunkIndexEnd, CoreLabelTokenFactory tokenFactory) + int chunkIndexStart, int chunkIndexEnd) { CoreMap firstChunk = chunkList.get(chunkIndexStart); CoreMap lastChunk = chunkList.get(chunkIndexEnd-1); @@ -201,12 +199,7 @@ public static CoreMap getMergedChunk(List chunkList, String o int lastTokenIndex = lastChunk.get(CoreAnnotations.TokenEndAnnotation.class); String chunkText = origText.substring(firstCharOffset, lastCharOffset); - CoreMap newChunk; - if (tokenFactory != null) { - newChunk = tokenFactory.makeToken(chunkText, firstCharOffset, lastCharOffset); - } else { - newChunk = new Annotation(chunkText); - } + CoreMap newChunk = new Annotation(chunkText); newChunk.set(CoreAnnotations.CharacterOffsetBeginAnnotation.class, firstCharOffset); newChunk.set(CoreAnnotations.CharacterOffsetEndAnnotation.class, lastCharOffset); @@ -229,20 +222,13 @@ public static CoreMap getMergedChunk(List chunkList, String o * @param chunkIndexStart - Index of first chunk to merge * @param chunkIndexEnd - Index of last chunk to merge (exclusive) * @param aggregators - Aggregators - * @param tokenFactory - factory for creating tokens (if we want to get a merged corelabel instead of something random) * @return new merged chunk */ public static CoreMap getMergedChunk(List chunkList, int chunkIndexStart, int chunkIndexEnd, - Map aggregators, - CoreLabelTokenFactory tokenFactory) + Map aggregators) { - CoreMap newChunk; - if (tokenFactory != null) { - newChunk = tokenFactory.makeToken(); - } else { - newChunk = new Annotation(""); - } + CoreMap newChunk = new Annotation(""); for (Map.Entry entry:aggregators.entrySet()) { if (chunkIndexEnd > chunkList.size()) { assert(false); @@ -250,11 +236,6 @@ public static CoreMap 
getMergedChunk(List chunkList, Object value = entry.getValue().aggregate(entry.getKey(), chunkList.subList(chunkIndexStart, chunkIndexEnd)); newChunk.set(entry.getKey(), value); } - if (newChunk instanceof CoreLabel) { - CoreLabel cl = (CoreLabel) newChunk; - cl.setValue(cl.word()); - cl.setOriginalText(cl.word()); - } return newChunk; } @@ -300,7 +281,7 @@ public static Interval getChunkOffsetsUsingCharOffsets(List chunkList, String origText, int chunkIndexStart, int chunkIndexEnd) { - CoreMap newChunk = getMergedChunk(chunkList, origText, chunkIndexStart, chunkIndexEnd, null); + CoreMap newChunk = getMergedChunk(chunkList, origText, chunkIndexStart, chunkIndexEnd); int nChunksToRemove = chunkIndexEnd - chunkIndexStart - 1; for (int i = 0; i < nChunksToRemove; i++) { chunkList.remove(chunkIndexStart); @@ -526,26 +507,11 @@ public static String getTokenText(List tokens, Class tokenTex public static String getTokenText(List tokens, Class tokenTextKey, String delimiter) { StringBuilder sb = new StringBuilder(); - int prevEndIndex = -1; - for (CoreMap cm:tokens) { - Object obj = cm.get(tokenTextKey); - boolean includeDelimiter = sb.length() > 0; - if (cm.containsKey(CoreAnnotations.CharacterOffsetBeginAnnotation.class) && - cm.containsKey(CoreAnnotations.CharacterOffsetEndAnnotation.class)) { - int beginIndex = cm.get(CoreAnnotations.CharacterOffsetBeginAnnotation.class); - int endIndex = cm.get(CoreAnnotations.CharacterOffsetEndAnnotation.class); - if (prevEndIndex == beginIndex) { - // No spaces - includeDelimiter = false; - } - prevEndIndex = endIndex; - } - if (obj != null) { - if (includeDelimiter) { - sb.append(delimiter); - } - sb.append(obj); + for (CoreMap t: tokens) { + if (sb.length() != 0) { + sb.append(delimiter); } + sb.append(t.get(tokenTextKey)); } return sb.toString(); } diff --git a/src/edu/stanford/nlp/pipeline/CoreMapAggregator.java b/src/edu/stanford/nlp/pipeline/CoreMapAggregator.java index 7ee8e9e288..1e844068fa 100644 --- a/src/edu/stanford/nlp/pipeline/CoreMapAggregator.java +++ b/src/edu/stanford/nlp/pipeline/CoreMapAggregator.java @@ -1,6 +1,5 @@ package edu.stanford.nlp.pipeline; -import edu.stanford.nlp.process.CoreLabelTokenFactory; import edu.stanford.nlp.util.*; import java.util.ArrayList; @@ -15,35 +14,27 @@ */ public class CoreMapAggregator implements Function, CoreMap> { public static final CoreMapAggregator DEFAULT_AGGREGATOR = getAggregator(CoreMapAttributeAggregator.getDefaultAggregators()); - public static final CoreMapAggregator DEFAULT_NUMERIC_TOKENS_AGGREGATOR = getAggregator(CoreMapAttributeAggregator.DEFAULT_NUMERIC_TOKENS_AGGREGATORS); - Map aggregators; Class mergedKey = null; // Keeps chunks that were merged to form this one - CoreLabelTokenFactory tokenFactory = null; // Should we be creating tokens? 
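The aggregator machinery above is easiest to see end to end: getMergedChunk applies one CoreMapAttributeAggregator per key to a span of chunks. A hedged sketch follows (the token values are made up, and it assumes the Map-only getAggregator overload used for the default aggregator just above) that merges two tokens into a single chunk:

import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.pipeline.CoreMapAggregator;
import edu.stanford.nlp.pipeline.CoreMapAttributeAggregator;
import edu.stanford.nlp.util.CoreMap;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

// Illustrative only: merges two tokens into one chunk, aggregating a few common keys.
public class AggregatorSketch {
  public static void main(String[] args) {
    CoreLabel a = new CoreLabel();
    a.set(CoreAnnotations.TextAnnotation.class, "San");
    a.set(CoreAnnotations.CharacterOffsetBeginAnnotation.class, 0);
    a.set(CoreAnnotations.CharacterOffsetEndAnnotation.class, 3);
    CoreLabel b = new CoreLabel();
    b.set(CoreAnnotations.TextAnnotation.class, "Francisco");
    b.set(CoreAnnotations.CharacterOffsetBeginAnnotation.class, 4);
    b.set(CoreAnnotations.CharacterOffsetEndAnnotation.class, 13);
    List<CoreLabel> tokens = Arrays.asList(a, b);

    Map<Class, CoreMapAttributeAggregator> aggregators = new HashMap<>();
    aggregators.put(CoreAnnotations.TextAnnotation.class, CoreMapAttributeAggregator.CONCAT);
    aggregators.put(CoreAnnotations.CharacterOffsetBeginAnnotation.class, CoreMapAttributeAggregator.FIRST);
    aggregators.put(CoreAnnotations.CharacterOffsetEndAnnotation.class, CoreMapAttributeAggregator.LAST);

    CoreMap merged = CoreMapAggregator.getAggregator(aggregators).apply(tokens);
    // With the plain CONCAT aggregator restored in this patch, the merged text should be
    // "San Francisco", spanning character offsets [0, 13).
    System.out.println(merged.get(CoreAnnotations.TextAnnotation.class));
  }
}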
public CoreMapAggregator(Map aggregators) { this.aggregators = aggregators; } - public CoreMapAggregator(Map aggregators, Class mergedKey, CoreLabelTokenFactory tokenFactory) { + public CoreMapAggregator(Map aggregators, Class mergedKey) { this.aggregators = aggregators; this.mergedKey = mergedKey; - this.tokenFactory = tokenFactory; } public CoreMap merge(List in, int start, int end) { - CoreMap merged = ChunkAnnotationUtils.getMergedChunk(in, start, end, aggregators, tokenFactory); + CoreMap merged = ChunkAnnotationUtils.getMergedChunk(in, start, end, aggregators); if (mergedKey != null) { merged.set(mergedKey, new ArrayList(in.subList(start, end))); } return merged; } - public CoreMap merge(List in) { - return merge(in, 0, in.size()); - } - public CoreMap apply(List in) { return merge(in, 0, in.size()); } @@ -60,12 +51,7 @@ public static CoreMapAggregator getAggregator(Map aggregators, Class key) { - return new CoreMapAggregator(aggregators, key, null); - } - - public static CoreMapAggregator getAggregator(Map aggregators, Class key, CoreLabelTokenFactory tokenFactory) - { - return new CoreMapAggregator(aggregators, key, tokenFactory); + return new CoreMapAggregator(aggregators, key); } public List merge(List list, List> matched) diff --git a/src/edu/stanford/nlp/pipeline/CoreMapAttributeAggregator.java b/src/edu/stanford/nlp/pipeline/CoreMapAttributeAggregator.java index 176af149ea..4e4a90d9df 100644 --- a/src/edu/stanford/nlp/pipeline/CoreMapAttributeAggregator.java +++ b/src/edu/stanford/nlp/pipeline/CoreMapAttributeAggregator.java @@ -127,6 +127,7 @@ public Object aggregate(Class key, List in) { public static final ConcatCoreMapListAggregator CONCAT_TOKENS = new ConcatCoreMapListAggregator(true); public static final ConcatCoreMapListAggregator CONCAT_COREMAP = new ConcatCoreMapListAggregator(true); + public static final class ConcatAggregator extends CoreMapAttributeAggregator { String delimiter; public ConcatAggregator(String delimiter) @@ -148,20 +149,7 @@ public Object aggregate(Class key, List in) { return sb.toString(); } } - public static final class ConcatTextAggregator extends CoreMapAttributeAggregator { - String delimiter; - public ConcatTextAggregator(String delimiter) - { - this.delimiter = delimiter; - } - public Object aggregate(Class key, List in) { - if (in == null) return null; - String text = ChunkAnnotationUtils.getTokenText(in, key); - return text; - } - } public static final CoreMapAttributeAggregator CONCAT = new ConcatAggregator(" "); - public static final CoreMapAttributeAggregator CONCAT_TEXT = new ConcatTextAggregator(" "); public static final CoreMapAttributeAggregator COUNT = new CoreMapAttributeAggregator() { public Object aggregate(Class key, List in) { return in.size(); @@ -271,7 +259,6 @@ public Object aggregate(Class key, List in) { AGGREGATOR_LOOKUP.put("COUNT", COUNT); AGGREGATOR_LOOKUP.put("SUM", SUM); AGGREGATOR_LOOKUP.put("CONCAT", CONCAT); - AGGREGATOR_LOOKUP.put("CONCAT_TEXT", CONCAT_TEXT); AGGREGATOR_LOOKUP.put("CONCAT_TOKENS", CONCAT_TOKENS); AGGREGATOR_LOOKUP.put("MOST_FREQ", MOST_FREQ); } @@ -282,14 +269,12 @@ public Object aggregate(Class key, List in) { static { Map defaultAggr = new ArrayMap(); - defaultAggr.put(CoreAnnotations.TextAnnotation.class, CoreMapAttributeAggregator.CONCAT_TEXT); + defaultAggr.put(CoreAnnotations.TextAnnotation.class, CoreMapAttributeAggregator.CONCAT); defaultAggr.put(CoreAnnotations.CharacterOffsetBeginAnnotation.class, CoreMapAttributeAggregator.FIRST); 
defaultAggr.put(CoreAnnotations.CharacterOffsetEndAnnotation.class, CoreMapAttributeAggregator.LAST); defaultAggr.put(CoreAnnotations.TokenBeginAnnotation.class, CoreMapAttributeAggregator.FIRST); defaultAggr.put(CoreAnnotations.TokenEndAnnotation.class, CoreMapAttributeAggregator.LAST); defaultAggr.put(CoreAnnotations.TokensAnnotation.class, CoreMapAttributeAggregator.CONCAT_TOKENS); - defaultAggr.put(CoreAnnotations.BeforeAnnotation.class, CoreMapAttributeAggregator.FIRST); - defaultAggr.put(CoreAnnotations.AfterAnnotation.class, CoreMapAttributeAggregator.LAST); DEFAULT_AGGREGATORS = Collections.unmodifiableMap(defaultAggr); Map defaultNumericAggr = new ArrayMap(DEFAULT_AGGREGATORS); diff --git a/src/edu/stanford/nlp/pipeline/JSONOutputter.java b/src/edu/stanford/nlp/pipeline/JSONOutputter.java index d4c0c1f4f7..79e813189c 100644 --- a/src/edu/stanford/nlp/pipeline/JSONOutputter.java +++ b/src/edu/stanford/nlp/pipeline/JSONOutputter.java @@ -137,8 +137,6 @@ public void print(Annotation doc, OutputStream target, Options options) throws I l3.set("speaker", token.get(CoreAnnotations.SpeakerAnnotation.class)); l3.set("truecase", token.get(CoreAnnotations.TrueCaseAnnotation.class)); l3.set("truecaseText", token.get(CoreAnnotations.TrueCaseTextAnnotation.class)); - l3.set("before", token.get(CoreAnnotations.BeforeAnnotation.class)); - l3.set("after", token.get(CoreAnnotations.AfterAnnotation.class)); // Timex Timex time = token.get(TimeAnnotations.TimexAnnotation.class); if (time != null) { @@ -194,17 +192,17 @@ private static Object buildDependencyTree(SemanticGraph graph) { // Roots graph.getRoots().stream().map( (IndexedWord root) -> (Consumer) dep -> { dep.set("dep", "ROOT"); - dep.set("governor", 0); + dep.set("governor", "0"); dep.set("governorGloss", "ROOT"); - dep.set("dependent", root.index()); + dep.set("dependent", Integer.toString(root.index())); dep.set("dependentGloss", root.word()); }), // Regular edges graph.edgeListSorted().stream().map( (SemanticGraphEdge edge) -> (Consumer) (Writer dep) -> { dep.set("dep", edge.getRelation().toString()); - dep.set("governor", edge.getGovernor().index()); + dep.set("governor", Integer.toString(edge.getGovernor().index())); dep.set("governorGloss", edge.getGovernor().word()); - dep.set("dependent", edge.getDependent().index()); + dep.set("dependent", Integer.toString(edge.getDependent().index())); dep.set("dependentGloss", edge.getDependent().word()); }) ); diff --git a/src/edu/stanford/nlp/pipeline/ProtobufAnnotationSerializer.java b/src/edu/stanford/nlp/pipeline/ProtobufAnnotationSerializer.java index b6b45f56f7..7ffb2a185a 100644 --- a/src/edu/stanford/nlp/pipeline/ProtobufAnnotationSerializer.java +++ b/src/edu/stanford/nlp/pipeline/ProtobufAnnotationSerializer.java @@ -227,7 +227,7 @@ private static E getAndRegister(CoreMap map, Set> keysToRegister, C * @return A protocol buffer message corresponding to this CoreLabel */ public CoreNLPProtos.Token toProto(CoreLabel coreLabel) { - Set> keysToSerialize = new HashSet<>(coreLabel.keySetNotNull()); + Set> keysToSerialize = new HashSet<>(coreLabel.keySet()); CoreNLPProtos.Token.Builder builder = toProtoBuilder(coreLabel, keysToSerialize); // Completeness check if (enforceLosslessSerialization && !keysToSerialize.isEmpty()) { diff --git a/src/edu/stanford/nlp/pipeline/SentenceAnnotator.java b/src/edu/stanford/nlp/pipeline/SentenceAnnotator.java index 0290da6c9e..1ac5ec696d 100644 --- a/src/edu/stanford/nlp/pipeline/SentenceAnnotator.java +++ 
b/src/edu/stanford/nlp/pipeline/SentenceAnnotator.java @@ -103,9 +103,6 @@ public void annotate(Annotation annotation) { protected abstract int nThreads(); - /** - * The maximum time to run this annotator for, in milliseconds. - */ protected abstract long maxTime(); /** annotation is included in case there is global information we care about */ diff --git a/src/edu/stanford/nlp/pipeline/StanfordCoreNLPServer.java b/src/edu/stanford/nlp/pipeline/StanfordCoreNLPServer.java index 8207d5badf..e45e762549 100644 --- a/src/edu/stanford/nlp/pipeline/StanfordCoreNLPServer.java +++ b/src/edu/stanford/nlp/pipeline/StanfordCoreNLPServer.java @@ -5,15 +5,9 @@ import com.sun.net.httpserver.HttpServer; import edu.stanford.nlp.io.IOUtils; import edu.stanford.nlp.ling.CoreAnnotations; -import edu.stanford.nlp.ling.CoreLabel; -import edu.stanford.nlp.ling.IndexedWord; -import edu.stanford.nlp.ling.tokensregex.SequenceMatchResult; -import edu.stanford.nlp.ling.tokensregex.TokenSequenceMatcher; -import edu.stanford.nlp.ling.tokensregex.TokenSequencePattern; -import edu.stanford.nlp.semgraph.SemanticGraphCoreAnnotations; -import edu.stanford.nlp.semgraph.semgrex.SemgrexMatcher; -import edu.stanford.nlp.semgraph.semgrex.SemgrexPattern; -import edu.stanford.nlp.util.*; +import edu.stanford.nlp.util.MetaClass; +import edu.stanford.nlp.util.Pair; +import edu.stanford.nlp.util.StringUtils; import java.io.*; import java.math.BigInteger; @@ -21,9 +15,6 @@ import java.net.URI; import java.net.URLDecoder; import java.util.*; -import java.util.concurrent.*; -import java.util.function.Consumer; -import java.util.stream.Collectors; import static edu.stanford.nlp.util.logging.Redwood.Util.*; @@ -44,20 +35,6 @@ public class StanfordCoreNLPServer implements Runnable { public static int HTTP_ERR = 500; public final Properties defaultProps; - /** - * The thread pool for the HTTP server. - */ - private final ExecutorService serverExecutor = Executors.newFixedThreadPool(Execution.threads); - /** - * To prevent grossly wasteful over-creation of pipeline objects, cache the last - * few we created, until the garbage collector decides we can kill them. - */ - private final WeakHashMap pipelineCache = new WeakHashMap<>(); - /** - * An executor to time out CoreNLP execution with. - */ - private final ExecutorService corenlpExecutor = Executors.newFixedThreadPool(Execution.threads); - public StanfordCoreNLPServer(int port) throws IOException { serverPort = port; @@ -83,28 +60,18 @@ public StanfordCoreNLPServer(int port) throws IOException { this.staticPageHandle = new FileHandler("edu/stanford/nlp/pipeline/demo/corenlp-brat.html"); } - /** - * Parse the URL parameters into a map of (key, value) pairs. - * - * @param uri The URL that was requested. - * - * @return A map of (key, value) pairs corresponding to the request parameters. - * - * @throws UnsupportedEncodingException Thrown if we could not decode the URL with utf8. - */ - private static Map getURLParams(URI uri) throws UnsupportedEncodingException { + private static Map getURLParams(URI uri) { if (uri.getQuery() != null) { Map urlParams = new HashMap<>(); String query = uri.getQuery(); - String[] queryFields = query.replace("\\&", "___AMP___").split("&"); + String[] queryFields = query.split("&"); for (String queryField : queryFields) { - queryField = queryField.replace("___AMP___", "&"); - int firstEq = queryField.indexOf('='); + String[] keyValue = queryField.split("="); // Convention uses "+" for spaces. 
- String key = URLDecoder.decode(queryField.substring(0, firstEq), "utf8"); - String value = URLDecoder.decode(queryField.substring(firstEq + 1), "utf8"); - urlParams.put(key, value); + keyValue[0] = keyValue[0].replace("+", " "); + keyValue[1] = keyValue[1].replace("+", " "); + urlParams.put(keyValue[0], keyValue[1]); } return urlParams; } else { @@ -113,68 +80,7 @@ private static Map getURLParams(URI uri) throws UnsupportedEncod } /** - * Reads the POST contents of the request and parses it into an Annotation object, ready to be annotated. - * This method can also read a serialized document, if the input format is set to be serialized. - * - * @param props The properties we are annotating with. This is where the input format is retrieved from. - * @param httpExchange The exchange we are reading POST data from. - * - * @return An Annotation representing the read document. - * - * @throws IOException Thrown if we cannot read the POST data. - * @throws ClassNotFoundException Thrown if we cannot load the serializer. - */ - private Annotation getDocument(Properties props, HttpExchange httpExchange) throws IOException, ClassNotFoundException { - String inputFormat = props.getProperty("inputFormat", "text"); - switch (inputFormat) { - case "text": - return new Annotation(IOUtils.slurpReader(new InputStreamReader(httpExchange.getRequestBody()))); - case "serialized": - String inputSerializerName = props.getProperty("inputSerializer", ProtobufAnnotationSerializer.class.getName()); - AnnotationSerializer serializer = MetaClass.create(inputSerializerName).createInstance(); - Pair pair = serializer.read(httpExchange.getRequestBody()); - return pair.first; - default: - throw new IOException("Could not parse input format: " + inputFormat); - } - } - - - /** - * Create (or retrieve) a StanfordCoreNLP object corresponding to these properties. - * @param props The properties to create the object with. - * @return A pipeline parameterized by these properties. - */ - private StanfordCoreNLP mkStanfordCoreNLP(Properties props) { - StanfordCoreNLP impl; - synchronized (pipelineCache) { - impl = pipelineCache.get(props); - if (impl == null) { - impl = new StanfordCoreNLP(props); - pipelineCache.put(props, impl); - } - } - return impl; - } - - /** - * A helper function to respond to a request with an error. - * @param response The description of the error to send to the user. * - * @param httpExchange The exchange to send the error over. - * - * @throws IOException Thrown if the HttpExchange cannot communicate the error. - */ - private void respondError(String response, HttpExchange httpExchange) throws IOException { - httpExchange.getResponseHeaders().add("Content-Type", "text/plain"); - httpExchange.sendResponseHeaders(HTTP_ERR, response.length()); - httpExchange.getResponseBody().write(response.getBytes()); - httpExchange.close(); - } - - - /** - * A simple ping test. Responds with pong. */ protected static class PingHandler implements HttpHandler { @Override @@ -188,11 +94,6 @@ public void handle(HttpExchange httpExchange) throws IOException { } } - /** - * Sending the appropriate shutdown key will gracefully shutdown the server. - * This key is, by default, saved into the local file /tmp/corenlp.shutdown on the - * machine the server was run from. 
- */ protected class ShutdownHandler implements HttpHandler { @Override public void handle(HttpExchange httpExchange) throws IOException { @@ -233,29 +134,42 @@ public void handle(HttpExchange httpExchange) throws IOException { /** * The main handler for taking an annotation request, and annotating it. */ - protected class CoreNLPHandler implements HttpHandler { + protected class SimpleAnnotateHandler implements HttpHandler { /** * The default properties to use in the absence of anything sent by the client. */ public final Properties defaultProps; + /** + * To prevent grossly wasteful over-creation of pipeline objects, cache the last + * few we created, until the garbage collector decides we can kill them. + */ + private final WeakHashMap pipelineCache = new WeakHashMap<>(); /** * Create a handler for accepting annotation requests. * @param props The properties file to use as the default if none were sent by the client. */ - public CoreNLPHandler(Properties props) { + public SimpleAnnotateHandler(Properties props) { defaultProps = props; } /** - * Get the response data type to send to the client, based off of the output format requested from - * CoreNLP. - * - * @param props The properties being used by CoreNLP. - * @param of The output format being output by CoreNLP. - * - * @return An identifier for the type of the HTTP response (e.g., 'text/json'). + * Create (or retrieve) a StanfordCoreNLP object corresponding to these properties. + * @param props The properties to create the object with. + * @return A pipeline parameterized by these properties. */ + private StanfordCoreNLP mkStanfordCoreNLP(Properties props) { + StanfordCoreNLP impl; + synchronized (pipelineCache) { + impl = pipelineCache.get(props); + if (impl == null) { + impl = new StanfordCoreNLP(props); + pipelineCache.put(props, impl); + } + } + return impl; + } + public String getContentType(Properties props, StanfordCoreNLP.OutputFormat of) { switch(of) { case JSON: @@ -311,15 +225,11 @@ public void handle(HttpExchange httpExchange) throws IOException { try { // Annotate StanfordCoreNLP pipeline = mkStanfordCoreNLP(props); - Future completedAnnotationFuture = corenlpExecutor.submit(() -> { - pipeline.annotate(ann); - return ann; - }); - Annotation completedAnnotation = completedAnnotationFuture.get(5, TimeUnit.SECONDS); + pipeline.annotate(ann); // Get output ByteArrayOutputStream os = new ByteArrayOutputStream(); - StanfordCoreNLP.createOutputter(props, AnnotationOutputter.getOptions(pipeline)).accept(completedAnnotation, os); + StanfordCoreNLP.createOutputter(props, AnnotationOutputter.getOptions(pipeline)).accept(ann, os); os.close(); byte[] response = os.toByteArray(); @@ -328,24 +238,17 @@ public void handle(HttpExchange httpExchange) throws IOException { httpExchange.sendResponseHeaders(HTTP_OK, response.length); httpExchange.getResponseBody().write(response); httpExchange.close(); - } catch (TimeoutException e) { - respondError("CoreNLP request timed out", httpExchange); } catch (Exception e) { // Return error message. 
- respondError(e.getClass().getName() + ": " + e.getMessage(), httpExchange); + e.printStackTrace(); + String response = e.getMessage(); + httpExchange.getResponseHeaders().add("Content-Type", "text/plain"); + httpExchange.sendResponseHeaders(HTTP_ERR, response.length()); + httpExchange.getResponseBody().write(response.getBytes()); + httpExchange.close(); } } - /** - * Parse the parameters of a connection into a CoreNLP properties file that can be passed into - * {@link StanfordCoreNLP}, and used in the I/O stages. - * - * @param httpExchange The http exchange; effectively, the request information. - * - * @return A {@link Properties} object corresponding to a combination of default and passed properties. - * - * @throws UnsupportedEncodingException Thrown if we could not decode the key/value pairs with UTF-8. - */ private Properties getProperties(HttpExchange httpExchange) throws UnsupportedEncodingException { // Load the default properties Properties props = new Properties(); @@ -367,226 +270,32 @@ private Properties getProperties(HttpExchange httpExchange) throws UnsupportedEn return props; } - } - - - - /** - * A handler for matching TokensRegex patterns against text. - */ - protected class TokensRegexHandler implements HttpHandler { - - @Override - public void handle(HttpExchange httpExchange) throws IOException { - // Set common response headers - httpExchange.getResponseHeaders().add("Access-Control-Allow-Origin", "*"); - - Future json = corenlpExecutor.submit(() -> { - try { - // Get the document - Properties props = new Properties() {{ - setProperty("annotators", "tokenize,ssplit,pos,lemma,ner"); - }}; - Annotation doc = getDocument(props, httpExchange); - if (!doc.containsKey(CoreAnnotations.SentencesAnnotation.class)) { - StanfordCoreNLP pipeline = mkStanfordCoreNLP(props); - pipeline.annotate(doc); - } - - // Construct the matcher - Map params = getURLParams(httpExchange.getRequestURI()); - // (get the pattern) - if (!params.containsKey("pattern")) { - respondError("Missing required parameter 'pattern'", httpExchange); - return ""; - } - String pattern = params.get("pattern"); - // (get whether to filter / find) - String filterStr = params.getOrDefault("filter", "false"); - final boolean filter = filterStr.trim().isEmpty() || "true".equalsIgnoreCase(filterStr.toLowerCase()); - // (create the matcher) - final TokenSequencePattern regex = TokenSequencePattern.compile(pattern); - - // Run TokensRegex - return JSONOutputter.JSONWriter.objectToJSON((docWriter) -> { - if (filter) { - // Case: just filter sentences - docWriter.set("sentences", doc.get(CoreAnnotations.SentencesAnnotation.class).stream().map(sentence -> - regex.matcher(sentence.get(CoreAnnotations.TokensAnnotation.class)).matches() - ).collect(Collectors.toList())); - } else { - // Case: find matches - docWriter.set("sentences", doc.get(CoreAnnotations.SentencesAnnotation.class).stream().map(sentence -> (Consumer) (JSONOutputter.Writer sentWriter) -> { - List tokens = sentence.get(CoreAnnotations.TokensAnnotation.class); - TokenSequenceMatcher matcher = regex.matcher(tokens); - int i = 0; - while (matcher.find()) { - sentWriter.set(Integer.toString(i), (Consumer) (JSONOutputter.Writer matchWriter) -> { - matchWriter.set("text", matcher.group()); - matchWriter.set("begin", matcher.start()); - matchWriter.set("end", matcher.end()); - for (int groupI = 0; groupI < matcher.groupCount(); ++groupI) { - SequenceMatchResult.MatchedGroupInfo info = matcher.groupInfo(groupI + 1); - matchWriter.set(info.varName == null ? 
Integer.toString(groupI + 1) : info.varName, (Consumer) groupWriter -> { - groupWriter.set("text", info.text); - if (info.nodes.size() > 0) { - groupWriter.set("begin", info.nodes.get(0).get(CoreAnnotations.IndexAnnotation.class) - 1); - groupWriter.set("end", info.nodes.get(info.nodes.size() - 1).get(CoreAnnotations.IndexAnnotation.class)); - } - }); - } - }); - i += 1; - } - sentWriter.set("length", i); - })); - } - }); - } catch (Exception e) { - e.printStackTrace(); - try { - respondError(e.getClass().getName() + ": " + e.getMessage(), httpExchange); - } catch (IOException ignored) { - } - } - return ""; - }); - - // Send response - byte[] response = new byte[0]; - try { - response = json.get(5, TimeUnit.SECONDS).getBytes(); - } catch (InterruptedException | ExecutionException | TimeoutException e) { - respondError("Timeout when executing TokensRegex query", httpExchange); - } - if (response.length > 0) { - httpExchange.getResponseHeaders().add("Content-Type", "text/json"); - httpExchange.getResponseHeaders().add("Content-Length", Integer.toString(response.length)); - httpExchange.sendResponseHeaders(HTTP_OK, response.length); - httpExchange.getResponseBody().write(response); - httpExchange.close(); - } - } - } - - - - /** - * A handler for matching semgrex patterns against dependency trees. - */ - protected class SemgrexHandler implements HttpHandler { - - @Override - public void handle(HttpExchange httpExchange) throws IOException { - // Set common response headers - httpExchange.getResponseHeaders().add("Access-Control-Allow-Origin", "*"); - - Future json = corenlpExecutor.submit(() -> { - try { - // Get the document - Properties props = new Properties() {{ - setProperty("annotators", "tokenize,ssplit,pos,lemma,ner,depparse"); - }}; - Annotation doc = getDocument(props, httpExchange); - if (!doc.containsKey(CoreAnnotations.SentencesAnnotation.class)) { - StanfordCoreNLP pipeline = mkStanfordCoreNLP(props); - pipeline.annotate(doc); - } - - // Construct the matcher - Map params = getURLParams(httpExchange.getRequestURI()); - // (get the pattern) - if (!params.containsKey("pattern")) { - respondError("Missing required parameter 'pattern'", httpExchange); - return ""; - } - String pattern = params.get("pattern"); - // (get whether to filter / find) - String filterStr = params.getOrDefault("filter", "false"); - final boolean filter = filterStr.trim().isEmpty() || "true".equalsIgnoreCase(filterStr.toLowerCase()); - // (create the matcher) - final SemgrexPattern regex = SemgrexPattern.compile(pattern); - - // Run TokensRegex - return JSONOutputter.JSONWriter.objectToJSON((docWriter) -> { - if (filter) { - // Case: just filter sentences - docWriter.set("sentences", doc.get(CoreAnnotations.SentencesAnnotation.class).stream().map(sentence -> - regex.matcher(sentence.get(SemanticGraphCoreAnnotations.CollapsedCCProcessedDependenciesAnnotation.class)).matches() - ).collect(Collectors.toList())); - } else { - // Case: find matches - docWriter.set("sentences", doc.get(CoreAnnotations.SentencesAnnotation.class).stream().map(sentence -> (Consumer) (JSONOutputter.Writer sentWriter) -> { - SemgrexMatcher matcher = regex.matcher(sentence.get(SemanticGraphCoreAnnotations.CollapsedCCProcessedDependenciesAnnotation.class)); - int i = 0; - while (matcher.find()) { - sentWriter.set(Integer.toString(i), (Consumer) (JSONOutputter.Writer matchWriter) -> { - IndexedWord match = matcher.getMatch(); - matchWriter.set("text", match.word()); - matchWriter.set("begin", match.index() - 1); - matchWriter.set("end", 
match.index()); - for (String capture : matcher.getNodeNames()) { - matchWriter.set("$" + capture, (Consumer) groupWriter -> { - IndexedWord node = matcher.getNode(capture); - groupWriter.set("text", node.word()); - groupWriter.set("begin", node.index() - 1); - groupWriter.set("end", node.index()); - }); - } - }); - i += 1; - } - sentWriter.set("length", i); - })); - } - }); - } catch (Exception e) { - e.printStackTrace(); - try { - respondError(e.getClass().getName() + ": " + e.getMessage(), httpExchange); - } catch (IOException ignored) { - } - } - return ""; - }); - // Send response - byte[] response = new byte[0]; - try { - response = json.get(5, TimeUnit.SECONDS).getBytes(); - } catch (InterruptedException | ExecutionException | TimeoutException e) { - respondError("Timeout when executing Semgrex query", httpExchange); - } - if (response.length > 0) { - httpExchange.getResponseHeaders().add("Content-Type", "text/json"); - httpExchange.getResponseHeaders().add("Content-Length", Integer.toString(response.length)); - httpExchange.sendResponseHeaders(HTTP_OK, response.length); - httpExchange.getResponseBody().write(response); - httpExchange.close(); + private Annotation getDocument(Properties props, HttpExchange httpExchange) throws IOException, ClassNotFoundException { + String inputFormat = props.getProperty("inputFormat"); + switch (inputFormat) { + case "text": + return new Annotation(IOUtils.slurpReader(new InputStreamReader(httpExchange.getRequestBody()))); + case "serialized": + String inputSerializerName = props.getProperty("inputSerializer"); + AnnotationSerializer serializer = MetaClass.create(inputSerializerName).createInstance(); + Pair pair = serializer.read(httpExchange.getRequestBody()); + return pair.first; + default: + throw new IOException("Could not parse input format: " + inputFormat); } } } - - - - - /** - * Run the server. - * This method registers the handlers, and initializes the HTTP server. - */ @Override public void run() { try { server = HttpServer.create(new InetSocketAddress(serverPort), 0); // 0 is the default 'backlog' - server.createContext("/", new CoreNLPHandler(defaultProps)); - server.createContext("/tokensregex", new TokensRegexHandler()); - server.createContext("/semgrex", new SemgrexHandler()); + server.createContext("/", new SimpleAnnotateHandler(defaultProps)); server.createContext("/corenlp-brat.js", new FileHandler("edu/stanford/nlp/pipeline/demo/corenlp-brat.js")); server.createContext("/corenlp-brat.cs", new FileHandler("edu/stanford/nlp/pipeline/demo/corenlp-brat.css")); server.createContext("/ping", new PingHandler()); server.createContext("/shutdown", new ShutdownHandler()); - server.setExecutor(serverExecutor); server.start(); log("StanfordCoreNLPServer listening at " + server.getAddress()); } catch (IOException e) { @@ -594,14 +303,6 @@ public void run() { } } - /** - * The main method. - * Read the command line arguments and run the server. - * - * @param args The command line arguments - * - * @throws IOException Thrown if we could not start / run the server. 
- */ public static void main(String[] args) throws IOException { int port = DEFAULT_PORT; if(args.length > 0) { diff --git a/src/edu/stanford/nlp/pipeline/TokensRegexNERAnnotator.java b/src/edu/stanford/nlp/pipeline/TokensRegexNERAnnotator.java index fe24e14d66..2a3909b6d8 100644 --- a/src/edu/stanford/nlp/pipeline/TokensRegexNERAnnotator.java +++ b/src/edu/stanford/nlp/pipeline/TokensRegexNERAnnotator.java @@ -28,17 +28,11 @@ * * where each argument is tab-separated, and the last two arguments are optional. Several regexes can be * associated with a single type. In the case where multiple regexes match a phrase, the priority ranking - * (higher priority is favored) is used to choose between the possible types. - * When the priority is the same, then longer matches are favored. - * - * This annotator is designed to be used as part of a full + * is used to choose between the possible types. When the priority is the same, then longer matches are favored. + * This classifier is designed to be used as part of a full * NER system to label entities that don't fall into the usual NER categories. It only records the label * if the token has not already been NER-annotated, or it has been annotated but the NER-type has been * designated overwritable (the third argument). - * - * It is also possible to use this annotator to annotate fields other than the - * NamedEntityTagAnnotation field by - * and providing the header *
 *
@@ -70,7 +64,6 @@
 *
 * Main differences from {@link RegexNERAnnotator}:
 * <ul>
- * <li>Supports annotation of fields other than the NamedEntityTagAnnotation field
 * <li>Supports both TokensRegex patterns and patterns over the text of the tokens
  • When NER annotation can be overwritten based on the original NER labels. The rules for when the new NER labels are used * are given below: @@ -91,13 +84,6 @@ * mappingComma separated list of mapping files to use * edu/stanford/nlp/models/regexner/type_map_clean * - * mapping.header - * Comma separated list of header fields (or true if header is specified in the file) - * pattern,ner,overwrite,priority,group - * mapping.field.<fieldname> - * Class mapping for annotation fields other than ner - * commonWords - * Comma separated list of files for common words to not annotate (in case your mapping isn't very clean) * backgroundSymbolComma separated list of NER labels to always replace * O,MISC * posmatchtype @@ -127,21 +113,11 @@ */ public class TokensRegexNERAnnotator implements Annotator { protected static final Redwood.RedwoodChannels logger = Redwood.channels("TokenRegexNER"); - protected static final String PATTERN_FIELD = "pattern"; - protected static final String OVERWRITE_FIELD = "overwrite"; - protected static final String PRIORITY_FIELD = "priority"; - protected static final String WEIGHT_FIELD = "weight"; - protected static final String GROUP_FIELD = "group"; - - protected static final Set predefinedHeaderFields = CollectionUtils.asSet(new String[]{PATTERN_FIELD, OVERWRITE_FIELD, PRIORITY_FIELD, WEIGHT_FIELD, GROUP_FIELD}); - protected static final String defaultHeader = "pattern,ner,overwrite,priority,group"; private final boolean ignoreCase; - private final Set commonWords; private final List entries; private final Map,Entry> patternToEntry; private final MultiPatternMatcher multiPatternMatcher; - private final List annotationFields; // list of fields to annotate (default to just NamedEntityTag) private final Set myLabels; // set of labels to always overwrite private final Pattern validPosPattern; @@ -163,9 +139,6 @@ enum PosMatchType { public static PropertiesUtils.Property[] SUPPORTED_PROPERTIES = new PropertiesUtils.Property[]{ new PropertiesUtils.Property("mapping", DefaultPaths.DEFAULT_REGEXNER_RULES, "Comma separated list of mapping files to use."), - new PropertiesUtils.Property("mapping.header", defaultHeader, "Comma separated list specifying order of fields in the mapping file"), - new PropertiesUtils.Property("mapping.field.", "", "Class mapping for annotation fields other than ner"), - new PropertiesUtils.Property("commonWords", "", "Comma separated list of files for common words to not annotate (in case your mapping isn't very clean)"), new PropertiesUtils.Property("ignorecase", "false", "Whether to ignore case or not when matching patterns."), new PropertiesUtils.Property("validpospattern", "", "Regular expression pattern for matching POS tags."), new PropertiesUtils.Property("posmatchtype", DEFAULT_POS_MATCH_TYPE.name(), "How should 'validpospattern' be used to match the POS of the tokens."), @@ -203,67 +176,15 @@ private static Properties getProperties(String name, String mapping, boolean ign return props; } - private static Pattern FILE_DELIMITERS_PATTERN = Pattern.compile("\\s*[,;]\\s*"); - private static Pattern COMMA_DELIMITERS_PATTERN = Pattern.compile("\\s*,\\s*"); public TokensRegexNERAnnotator(String name, Properties properties) { String prefix = (name != null && !name.isEmpty())? 
name + ".":""; String backgroundSymbol = properties.getProperty(prefix + "backgroundSymbol", DEFAULT_BACKGROUND_SYMBOL); - String[] backgroundSymbols = COMMA_DELIMITERS_PATTERN.split(backgroundSymbol); + String[] backgroundSymbols = backgroundSymbol.split("\\s*,\\s*"); String mappingFiles = properties.getProperty(prefix + "mapping", DefaultPaths.DEFAULT_REGEXNER_RULES); - String[] mappings = FILE_DELIMITERS_PATTERN.split(mappingFiles); + String[] mappings = mappingFiles.split("\\s*[,;]\\s*"); String validPosRegex = properties.getProperty(prefix + "validpospattern"); this.posMatchType = PosMatchType.valueOf(properties.getProperty(prefix + "posmatchtype", DEFAULT_POS_MATCH_TYPE.name())); - String commonWordsFile = properties.getProperty(prefix + "commonWords"); - commonWords = new HashSet(); - if (commonWordsFile != null) { - try { - BufferedReader reader = IOUtils.getBufferedFileReader(commonWordsFile); - String line; - while ((line = reader.readLine()) != null) { - commonWords.add(line); - } - reader.close(); - } catch (IOException ex) { - throw new RuntimeException("TokensRegexNERAnnotator " + name - + ": Error opening the common words file: " + commonWordsFile, ex); - } - } - - String headerProp = properties.getProperty(prefix + "mapping.header", defaultHeader); - boolean readHeaderFromFile = headerProp.equalsIgnoreCase("true"); - String[] annotationFieldnames = null; - String[] headerFields = null; - if (readHeaderFromFile) { - // Get header as first line from all files... - // TODO: support reading header from file - throw new UnsupportedOperationException("Reading header from file not yet supported!!!"); - } else { - headerFields = COMMA_DELIMITERS_PATTERN.split(headerProp); - // Take header fields and remove known headers to get annotation field names - List fieldNames = new ArrayList(); - List fieldClasses = new ArrayList(); - for (int i = 0; i < headerFields.length; i++) { - String field = headerFields[i]; - if (!predefinedHeaderFields.contains(field)) { - Class fieldClass = EnvLookup.lookupAnnotationKeyWithClassname(null, field); - if (fieldClass == null) { - // check our properties - String classname = properties.getProperty(prefix + "mapping.field." 
+ field); - fieldClass = EnvLookup.lookupAnnotationKeyWithClassname(null, classname); - } - if (fieldClass != null) { - fieldNames.add(field); - fieldClasses.add(fieldClass); - } else { - logger.warn("TokensRegexNERAnnotator " + name + ": Unknown field: " + field + " cannot find suitable annotation class"); - } - } - } - annotationFieldnames = new String[fieldNames.size()]; - fieldNames.toArray(annotationFieldnames); - annotationFields = fieldClasses; - } String noDefaultOverwriteLabelsProp = properties.getProperty(prefix + "noDefaultOverwriteLabels"); this.noDefaultOverwriteLabels = (noDefaultOverwriteLabelsProp != null) @@ -277,7 +198,7 @@ public TokensRegexNERAnnotator(String name, Properties properties) { } else { validPosPattern = null; } - entries = Collections.unmodifiableList(readEntries(name, noDefaultOverwriteLabels, ignoreCase, verbose, headerFields, annotationFieldnames, mappings)); + entries = Collections.unmodifiableList(readEntries(name, noDefaultOverwriteLabels, ignoreCase, verbose, mappings)); IdentityHashMap, Entry> patternToEntry = new IdentityHashMap, Entry>(); multiPatternMatcher = createPatternMatcher(patternToEntry); this.patternToEntry = Collections.unmodifiableMap(patternToEntry); @@ -286,11 +207,7 @@ public TokensRegexNERAnnotator(String name, Properties properties) { Collections.addAll(myLabels, backgroundSymbols); myLabels.add(null); // Always overwrite labels - for (Entry entry: entries) { - for (String type:entry.types) { - myLabels.add(type); - } - } + for (Entry entry: entries) myLabels.add(entry.type); this.myLabels = Collections.unmodifiableSet(myLabels); } @@ -350,7 +267,6 @@ private MultiPatternMatcher createPatternMatcher(Map tokens) { int start = m.start(g); int end = m.end(g); - String str = m.group(g); - if (commonWords.contains(str)) { - if (verbose) { - System.err.println("Not annotating (common word) '" + str + "': " + - StringUtils.joinFields(m.groupNodes(g), CoreAnnotations.NamedEntityTagAnnotation.class) - + " with " + entry.getTypeDescription() + ", sentence is '" + StringUtils.joinWords(tokens, " ") + "'"); - } - continue; - } - boolean overwriteOriginalNer = checkPosTags(tokens, start, end); if (overwriteOriginalNer) { overwriteOriginalNer = checkOrigNerTags(entry, tokens, start, end); } if (overwriteOriginalNer) { for (int i = start; i < end; i++) { - CoreLabel token = tokens.get(i); - for (int j = 0; j < annotationFields.size(); j++) { - token.set(annotationFields.get(j), entry.types[j]); - } - // tokens.get(i).set(CoreAnnotations.NamedEntityTagAnnotation.class, entry.type); + tokens.get(i).set(CoreAnnotations.NamedEntityTagAnnotation.class, entry.type); } } else { if (verbose) { System.err.println("Not annotating '" + m.group(g) + "': " + StringUtils.joinFields(m.groupNodes(g), CoreAnnotations.NamedEntityTagAnnotation.class) - + " with " + entry.getTypeDescription() + ", sentence is '" + StringUtils.joinWords(tokens, " ") + "'"); + + " with " + entry.type + ", sentence is '" + StringUtils.joinWords(tokens, " ") + "'"); } } } @@ -482,7 +384,7 @@ private boolean checkOrigNerTags(Entry entry, List tokens, int start, } else { // if this ner type doesn't belong to the labels for which we don't overwrite the default labels (noDefaultOverwriteLabels) // we check mylabels to see if we can overwrite this entry - if (/*entry.overwritableTypes.isEmpty() || */!hasNoOverwritableType(noDefaultOverwriteLabels, entry.types)) { + if (/*entry.overwritableTypes.isEmpty() || */!noDefaultOverwriteLabels.contains(entry.type)) { overwriteOriginalNer = 
myLabels.contains(startNer); } } @@ -495,33 +397,22 @@ private boolean checkOrigNerTags(Entry entry, List tokens, int start, private static class Entry { public final String tokensRegex; public final String[] regex; // the regex, tokenized by splitting on white space - public final String[] types; // the associated types + public final String type; // the associated type public final Set overwritableTypes; // what types can be overwritten by this entry public final double priority; - public final double weight; public final int annotateGroup; - public Entry(String tokensRegex, String[] regex, String[] types, Set overwritableTypes, double priority, double weight, int annotateGroup) { + public Entry(String tokensRegex, String[] regex, String type, Set overwritableTypes, double priority, int annotateGroup) { this.tokensRegex = tokensRegex; this.regex = regex; - this.types = new String[types.length]; - for (int i = 0; i < types.length; i++) { - // TODO: for some types, it doesn't make sense to be interning... - this.types[i] = types[i].intern(); - } + this.type = type.intern(); this.overwritableTypes = overwritableTypes; this.priority = priority; - this.weight = weight; this.annotateGroup = annotateGroup; } - public String getTypeDescription() { - return "[" + StringUtils.join(types, ",") + "]"; - } - public String toString() { - return "Entry{" + ((tokensRegex != null) ? tokensRegex: StringUtils.join(regex)) + ' ' - + StringUtils.join(types) + ' ' + overwritableTypes + ' ' + priority + '}'; + return "Entry{" + ((tokensRegex != null) ? tokensRegex: StringUtils.join(regex)) + ' ' + type + ' ' + overwritableTypes + ' ' + priority + '}'; } } @@ -534,8 +425,6 @@ public String toString() { private static List readEntries(String annotatorName, Set noDefaultOverwriteLabels, boolean ignoreCase, boolean verbose, - String[] headerFields, - String[] annotationFieldnames, String... mappings) { // Unlike RegexNERClassifier, we don't bother sorting the entries // We leave it to TokensRegex NER to sort out the priorities and matches @@ -548,7 +437,7 @@ private static List readEntries(String annotatorName, BufferedReader rd = null; try { rd = IOUtils.readerFromString(mapping); - readEntries(annotatorName, headerFields, annotationFieldnames, entries, seenRegexes, mapping, rd, noDefaultOverwriteLabels, ignoreCase, verbose); + readEntries(annotatorName, entries, seenRegexes, mapping, rd, noDefaultOverwriteLabels, ignoreCase, verbose); } catch (IOException e) { throw new RuntimeIOException("Couldn't read TokensRegexNER from " + mapping, e); } finally { @@ -563,24 +452,6 @@ private static List readEntries(String annotatorName, return entries; } - private static Map getHeaderIndexMap(String[] headerFields) { - Map map = new HashMap(); - for (int i = 0; i < headerFields.length; i++) { - String field = headerFields[i]; - if (map.containsKey(field)) { - throw new IllegalArgumentException("Duplicate header field: " + field); - } - map.put(field,i); - } - return map; - } - - - private static int getIndex(Map map, String name) { - Integer index = map.get(name); - if (index == null) return -1; - else return index; - } /** * Reads a list of Entries from a mapping file and update the given entries. * Line numbers start from 1. 
@@ -588,8 +459,6 @@ private static int getIndex(Map map, String name) { * @return the updated list of Entries */ private static List readEntries(String annotatorName, - String[] headerFields, - String[] annotationFieldnames, List entries, TrieMap seenRegexes, String mappingFilename, @@ -599,39 +468,13 @@ private static List readEntries(String annotatorName, int origEntriesSize = entries.size(); int isTokensRegex = 0; int lineCount = 0; - Map headerIndexMap = getHeaderIndexMap(headerFields); - int iPattern = getIndex(headerIndexMap, PATTERN_FIELD); - if (iPattern < 0) { - throw new IllegalArgumentException("TokensRegexNERAnnotator " + annotatorName - + " ERROR: Header does not contain 'pattern': " + StringUtils.join(headerFields)); - } - int iOverwrite = getIndex(headerIndexMap, OVERWRITE_FIELD); - int iPriority = getIndex(headerIndexMap, PRIORITY_FIELD); - int iWeight = getIndex(headerIndexMap, WEIGHT_FIELD); - int iGroup = getIndex(headerIndexMap, GROUP_FIELD); - int[] annotationCols = new int[annotationFieldnames.length]; - int iLastAnnotationField = -1; - for (int i = 0; i < annotationFieldnames.length; i++) { - annotationCols[i] = getIndex(headerIndexMap, annotationFieldnames[i]); - if (annotationCols[i] < 0) { - throw new IllegalArgumentException("TokensRegexNERAnnotator " + annotatorName - + " ERROR: Header does not contain annotation field '" + annotationFieldnames[i] + "': " + StringUtils.join(headerFields)); - } - if (annotationCols[i] > iLastAnnotationField) { - iLastAnnotationField = annotationCols[i]; - } - } - - int minFields = Math.min(iPattern, iLastAnnotationField); // Take minimum of "pattern" and last annotation field - int maxFields = headerFields.length; // Take maximum number of headerFields for (String line; (line = mapping.readLine()) != null; ) { lineCount ++; String[] split = line.split("\t"); - if (split.length < minFields || split.length > maxFields) { - throw new IllegalArgumentException("TokensRegexNERAnnotator " + annotatorName - + " ERROR: Provided mapping file is in wrong format. Line " + lineCount + " is bad: " + line); + if (split.length < 2 || split.length > 5) { + throw new IllegalArgumentException("Provided mapping file is in wrong format. 
This line is bad: " + line); } - String regex = split[iPattern].trim(); + String regex = split[0].trim(); String tokensRegex = null; String[] regexes = null; if (regex.startsWith("( ") && regex.endsWith(" )")) { @@ -648,66 +491,46 @@ private static List readEntries(String annotatorName, } key = norm; } - String[] types = new String[annotationCols.length]; - for (int i=0; i < annotationCols.length; i++) { - types[i] = split[annotationCols[i]].trim(); - } + String type = split[1].trim(); Set overwritableTypes = Generics.newHashSet(); double priority = 0.0; - if (iOverwrite >= 0 && split.length > iOverwrite) { - overwritableTypes.addAll(Arrays.asList(split[iOverwrite].trim().split("\\s*,\\s*"))); - } - if (iPriority >= 0 && split.length > iPriority) { - try { - priority = Double.parseDouble(split[iPriority].trim()); - } catch (NumberFormatException e) { - throw new IllegalArgumentException("TokensRegexNERAnnotator " + annotatorName - + " ERROR: Invalid priority in line " + lineCount - + " in regexner file " + mappingFilename + ": \"" + line + "\"!", e); - } + if (split.length >= 3) { + overwritableTypes.addAll(Arrays.asList(split[2].trim().split("\\s*,\\s*"))); } - - double weight = 0.0; - if (iWeight >= 0 && split.length > iWeight) { + if (split.length >= 4) { try { - weight = Double.parseDouble(split[iWeight].trim()); + priority = Double.parseDouble(split[3].trim()); } catch (NumberFormatException e) { - throw new IllegalArgumentException("TokensRegexNERAnnotator " + annotatorName - + " ERROR: Invalid weight in line " + lineCount - + " in regexner file " + mappingFilename + ": \"" + line + "\"!", e); + throw new IllegalArgumentException("ERROR: Invalid priority in line " + lineCount + + " in regexner file " + mappingFilename + ": \"" + line + "\"!", e); } } int annotateGroup = 0; // Get annotate group from input.... - if (iGroup>= 0 && split.length > iGroup) { + if (split.length >= 5) { // Which group to take (allow for context) - String context = split[iGroup].trim(); + String context = split[4].trim(); try { annotateGroup = Integer.parseInt(context); } catch (NumberFormatException e) { - throw new IllegalArgumentException("TokensRegexNERAnnotator " + annotatorName - + " ERROR: Invalid group in line " + lineCount - + " in regexner file " + mappingFilename + ": \"" + line + "\"!", e); + throw new IllegalArgumentException("ERROR: Invalid group in line " + lineCount + + " in regexner file " + mappingFilename + ": \"" + line + "\"!", e); } } - // Print some warnings about the type - for (int i = 0; i < types.length; i++) { - String type = types[i]; - // TODO: Have option to allow commas in types - int commaPos = type.indexOf(','); - if (commaPos > 0) { - // Strip the "," and just take first type - String newType = type.substring(0, commaPos).trim(); - logger.warn("TokensRegexNERAnnotator " + annotatorName + - ": Entry has multiple types for " + annotationFieldnames[i] + ": " + line + ". Taking type to be " + newType); - types[i] = newType; - } + // Print some warning about the type + int commaPos = type.indexOf(','); + if (commaPos > 0) { + // Strip the "," and just take first type + String newType = type.substring(0, commaPos).trim(); + logger.warn("TokensRegexNERAnnotator " + annotatorName + + ": Entry has multiple types: " + line + ". 
Taking type to be " + newType); + type = newType; } - Entry entry = new Entry(tokensRegex, regexes, types, overwritableTypes, priority, weight, annotateGroup); + Entry entry = new Entry(tokensRegex, regexes, type, overwritableTypes, priority, annotateGroup); if (seenRegexes.containsKey(key)) { Entry oldEntry = seenRegexes.get(key); @@ -715,12 +538,10 @@ private static List readEntries(String annotatorName, logger.warn("TokensRegexNERAnnotator " + annotatorName + ": Replace duplicate entry (higher priority): old=" + oldEntry + ", new=" + entry); } else { - String oldTypeDesc = oldEntry.getTypeDescription(); - String newTypeDesc = entry.getTypeDescription(); - if (!oldTypeDesc.equals(newTypeDesc)) { + if (!oldEntry.type.equals(type)) { if (verbose) { logger.warn("TokensRegexNERAnnotator " + annotatorName + - ": Ignoring duplicate entry: " + split[0] + ", old type = " + oldTypeDesc + ", new type = " + newTypeDesc); + ": Ignoring duplicate entry: " + split[0] + ", old type = " + oldEntry.type + ", new type = " + type); } // } else { // if (verbose) { @@ -733,7 +554,7 @@ private static List readEntries(String annotatorName, } // Print some warning if label belongs to noDefaultOverwriteLabels but there is no overwritable types - if (entry.overwritableTypes.isEmpty() && hasNoOverwritableType(noDefaultOverwriteLabels, entry.types)) { + if (entry.overwritableTypes.isEmpty() && noDefaultOverwriteLabels.contains(entry.type)) { logger.warn("TokensRegexNERAnnotator " + annotatorName + ": Entry doesn't have overwriteable types " + entry + ", but entry type is in noDefaultOverwriteLabels"); } @@ -749,12 +570,6 @@ private static List readEntries(String annotatorName, return entries; } - private static boolean hasNoOverwritableType(Set noDefaultOverwriteLabels, String[] types) { - for (String type:types) { - if (noDefaultOverwriteLabels.contains(type)) return true; - } - return false; - } @Override public Set requires() { diff --git a/src/edu/stanford/nlp/pipeline/demo/StanfordCoreNlpDemo.java b/src/edu/stanford/nlp/pipeline/demo/StanfordCoreNlpDemo.java index cf77739b24..977857a9c9 100644 --- a/src/edu/stanford/nlp/pipeline/demo/StanfordCoreNlpDemo.java +++ b/src/edu/stanford/nlp/pipeline/demo/StanfordCoreNlpDemo.java @@ -10,7 +10,6 @@ import edu.stanford.nlp.pipeline.*; import edu.stanford.nlp.semgraph.SemanticGraph; import edu.stanford.nlp.semgraph.SemanticGraphCoreAnnotations; -import edu.stanford.nlp.sentiment.SentimentCoreAnnotations; import edu.stanford.nlp.trees.*; import edu.stanford.nlp.util.*; @@ -31,20 +30,14 @@ public static void main(String[] args) throws IOException { xmlOut = new PrintWriter(args[2]); } - // Create a CoreNLP pipeline. To build the default pipeline, you can just use: - // StanfordCoreNLP pipeline = new StanfordCoreNLP(props); - // Here's a more complex setup example: - // Properties props = new Properties(); - // props.put("annotators", "tokenize, ssplit, pos, lemma, ner, depparse"); - // props.put("ner.model", "edu/stanford/nlp/models/ner/english.all.3class.distsim.crf.ser.gz"); - // props.put("ner.applyNumericClassifiers", "false"); - // StanfordCoreNLP pipeline = new StanfordCoreNLP(props); - - // Add in sentiment - Properties props = new Properties(); - props.put("annotators", "tokenize, ssplit, pos, lemma, ner, parse, dcoref, sentiment"); - - StanfordCoreNLP pipeline = new StanfordCoreNLP(props); + // Create a CoreNLP pipeline. This line just builds the default pipeline. 
+ // In comments we show how you can build a particular pipeline + // Properties props = new Properties(); + // props.put("annotators", "tokenize, ssplit, pos, lemma, ner, depparse"); + // props.put("ner.model", "edu/stanford/nlp/models/ner/english.all.3class.distsim.crf.ser.gz"); + // props.put("ner.applyNumericClassifiers", "false"); + // StanfordCoreNLP pipeline = new StanfordCoreNLP(props); + StanfordCoreNLP pipeline = new StanfordCoreNLP(); // Initialize an Annotation with some text to be annotated. The text is the argument to the constructor. Annotation annotation; @@ -57,7 +50,7 @@ public static void main(String[] args) throws IOException { // run all the selected Annotators on this text pipeline.annotate(annotation); - // this prints out the results of sentence analysis to file(s) in good formats + // print the results to file(s) pipeline.prettyPrint(annotation, out); if (xmlOut != null) { pipeline.xmlPrint(annotation, xmlOut); @@ -69,16 +62,12 @@ public static void main(String[] args) throws IOException { out.println(); out.println("The top level annotation"); out.println(annotation.toShorterString()); - out.println(); - // An Annotation is a Map with Class keys for the linguistic analysis types. - // You can get and use the various analyses individually. + // An Annotation is a Map and you can get and use the various analyses individually. // For instance, this gets the parse tree of the first sentence in the text. List sentences = annotation.get(CoreAnnotations.SentencesAnnotation.class); if (sentences != null && ! sentences.isEmpty()) { CoreMap sentence = sentences.get(0); - out.println("The keys of the first sentence's CoreMap are:"); - out.println(sentence.keySet()); out.println(); out.println("The first sentence is:"); out.println(sentence.toShorterString()); @@ -116,9 +105,6 @@ public static void main(String[] args) throws IOException { ", " + tokens.get(m.endIndex - 2).endPosition() + ")"); } } - out.println(); - - out.println("The first sentence overall sentiment rating is " + sentence.get(SentimentCoreAnnotations.SentimentClass.class)); } IOUtils.closeIgnoringExceptions(out); IOUtils.closeIgnoringExceptions(xmlOut); diff --git a/src/edu/stanford/nlp/pipeline/demo/corenlp-brat.css b/src/edu/stanford/nlp/pipeline/demo/corenlp-brat.css index c133ced222..61e46460bb 100644 --- a/src/edu/stanford/nlp/pipeline/demo/corenlp-brat.css +++ b/src/edu/stanford/nlp/pipeline/demo/corenlp-brat.css @@ -10,14 +10,6 @@ padding: 0 0 6ex; } -.pattern_tab { - margin: 1ex; -} - -.pattern_brat { - margin-top: 1ex; -} - .footer { bottom: 0; width: 100%; diff --git a/src/edu/stanford/nlp/pipeline/demo/corenlp-brat.html b/src/edu/stanford/nlp/pipeline/demo/corenlp-brat.html index 0c22bf5819..846bbd3fd1 100644 --- a/src/edu/stanford/nlp/pipeline/demo/corenlp-brat.html +++ b/src/edu/stanford/nlp/pipeline/demo/corenlp-brat.html @@ -14,16 +14,6 @@ - - - @@ -44,7 +34,7 @@
[markup-only hunks: HTML content not recoverable]
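[Aside, illustrative only -- not part of this patch: a hypothetical client for the plain annotate endpoint that this demo page talks to. The demo JS below POSTs the document text as the request body and passes the annotator list through the "properties" URL parameter; the port, annotator list, and sentence here are example values.]

import java.io.InputStream;
import java.io.OutputStream;
import java.net.HttpURLConnection;
import java.net.URL;
import java.net.URLEncoder;

// Illustrative sketch only.
class AnnotateClientSketch {
  public static void main(String[] args) throws Exception {
    // Select annotators via the "properties" URL parameter, as the demo page does.
    String props = URLEncoder.encode("{\"annotators\": \"tokenize,ssplit,pos\"}", "utf-8");
    URL url = new URL("http://localhost:9000/?properties=" + props);
    HttpURLConnection conn = (HttpURLConnection) url.openConnection();
    conn.setRequestMethod("POST");
    conn.setDoOutput(true);
    try (OutputStream out = conn.getOutputStream()) {
      out.write("The quick brown fox jumped over the lazy dog.".getBytes("utf-8"));
    }
    // Print whatever the server's configured outputter returns.
    try (InputStream in = conn.getInputStream()) {
      byte[] buffer = new byte[4096];
      for (int n; (n = in.read(buffer)) > 0; ) {
        System.out.write(buffer, 0, n);
      }
    }
  }
}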
    diff --git a/src/edu/stanford/nlp/pipeline/demo/corenlp-brat.js b/src/edu/stanford/nlp/pipeline/demo/corenlp-brat.js index 8291a00cac..4503a445af 100644 --- a/src/edu/stanford/nlp/pipeline/demo/corenlp-brat.js +++ b/src/edu/stanford/nlp/pipeline/demo/corenlp-brat.js @@ -2,7 +2,7 @@ // and uses brat to render everything. //var serverAddress = 'http://localhost:9000/' -var serverAddress = '' +var serverAddress = '/' // Load Brat libraries var bratLocation = 'http://nlp.stanford.edu/js/brat'; @@ -23,10 +23,6 @@ head.js( bratLocation + '/client/src/visualizer.js' ); -var currentQuery = 'The quick brown fox jumped over the lazy dog.'; -var currentSentences = ''; -var currentText = ''; - // ---------------------------------------------------------------------------- // HELPERS // ---------------------------------------------------------------------------- @@ -41,10 +37,6 @@ if (typeof String.prototype.startsWith != 'function') { }; } -function isInt(value) { - return !isNaN(value) && (function(x) { return (x | 0) === x; })(parseFloat(value)) -} - /** * A reverse map of PTB tokens to their original gloss */ @@ -193,22 +185,21 @@ function render(data) { // // Construct text of annotation // - currentText = []; // GLOBAL - currentSentences = data.sentences; // GLOBAL + var text = []; data.sentences.forEach(function(sentence) { for (var i = 0; i < sentence.tokens.length; ++i) { var token = sentence.tokens[i]; var word = token.word; - if (i > 0) { currentText.push(' '); } - token.characterOffsetBegin = currentText.length; + if (i > 0) { text.push(' '); } + token.characterOffsetBegin = text.length; for (var j = 0; j < word.length; ++j) { - currentText.push(word[j]); + text.push(word[j]); } - token.characterOffsetEnd = currentText.length; + token.characterOffsetEnd = text.length; } - currentText.push('\n'); + text.push('\n'); }); - currentText = currentText.join(''); + text = text.join(''); // // Shared variables @@ -405,7 +396,7 @@ function render(data) { if ($('#' + container).length > 0) { Util.embed(container, {entity_types: entityTypes, relation_types: relationTypes}, - {text: currentText, entities: entities, relations: relations} + {text: text, entities: entities, relations: relations} ); } } @@ -424,146 +415,6 @@ function render(data) { } // End render function -/** - * Render a TokensRegex response - */ -function renderTokensregex(data) { - /** - * Register an entity type (a tag) for Brat - */ - var entityTypesSet = {}; - var entityTypes = []; - function addEntityType(type, color) { - // Don't add duplicates - if (entityTypesSet[type]) return; - entityTypesSet[type] = true; - // Set the color - if (typeof color == 'undefined') { - color = '#ADF6A2'; - } - // Register the type - entityTypes.push({ - type: type, - labels : [type], - bgColor: color, - borderColor: 'darken' - }); - } - - var entities = []; - for (var sentI = 0; sentI < data.sentences.length; ++sentI) { - var tokens = currentSentences[sentI].tokens; - for (var matchI = 0; matchI < data.sentences[sentI].length; ++matchI) { - var match = data.sentences[sentI][matchI]; - // Add groups - for (groupName in match) { - if (groupName.startsWith("$") || isInt(groupName)) { - addEntityType(groupName, '#FFFDA8'); - var begin = parseInt(tokens[match[groupName].begin].characterOffsetBegin); - var end = parseInt(tokens[match[groupName].end - 1].characterOffsetEnd); - entities.push(['TOK_' + sentI + '_' + matchI + '_' + groupName, - groupName, - [[begin, end]]]); - } - } - // Add match - addEntityType('match', '#ADF6A2'); - var begin = 
parseInt(tokens[match.begin].characterOffsetBegin); - var end = parseInt(tokens[match.end - 1].characterOffsetEnd); - entities.push(['TOK_' + sentI + '_' + matchI + '_match', - 'match', - [[begin, end]]]); - } - } - - Util.embed('tokensregex', - {entity_types: entityTypes, relation_types: []}, - {text: currentText, entities: entities, relations: []} - ); -} // END renderTokensregex() - - -/** - * Render a Semgrex response - */ -function renderSemgrex(data) { - console.log(data); - /** - * Register an entity type (a tag) for Brat - */ - var entityTypesSet = {}; - var entityTypes = []; - function addEntityType(type, color) { - // Don't add duplicates - if (entityTypesSet[type]) return; - entityTypesSet[type] = true; - // Set the color - if (typeof color == 'undefined') { - color = '#ADF6A2'; - } - // Register the type - entityTypes.push({ - type: type, - labels : [type], - bgColor: color, - borderColor: 'darken' - }); - } - - - relationTypes = [{ - type: 'semgrex', - labels: ['-'], - dashArray: '3,3', - arrowHead: 'none', - }]; - - var entities = []; - var relations = [] - - for (var sentI = 0; sentI < data.sentences.length; ++sentI) { - var tokens = currentSentences[sentI].tokens; - for (var matchI = 0; matchI < data.sentences[sentI].length; ++matchI) { - var match = data.sentences[sentI][matchI]; - // Add match - addEntityType('match', '#ADF6A2'); - var begin = parseInt(tokens[match.begin].characterOffsetBegin); - var end = parseInt(tokens[match.end - 1].characterOffsetEnd); - entities.push(['SEM_' + sentI + '_' + matchI + '_match', - 'match', - [[begin, end]]]); - - // Add groups - for (groupName in match) { - if (groupName.startsWith("$") || isInt(groupName)) { - // (add node) - group = match[groupName]; - groupName = groupName.substring(1); - addEntityType(groupName, '#FFFDA8'); - var begin = parseInt(tokens[group.begin].characterOffsetBegin); - var end = parseInt(tokens[group.end - 1].characterOffsetEnd); - entities.push(['SEM_' + sentI + '_' + matchI + '_' + groupName, - groupName, - [[begin, end]]]); - - // (add relation) - relations.push(['SEMGREX_' + sentI + '_' + matchI + '_' + groupName, - 'semgrex', - [['governor', 'SEM_' + sentI + '_' + matchI + '_match'], - ['dependent', 'SEM_' + sentI + '_' + matchI + '_' + groupName] ] ]); - } - } - } - } - - console.log(relations); - Util.embed('semgrex', - {entity_types: entityTypes, relation_types: relationTypes}, - {text: currentText, entities: entities, relations: relations} - ); -} // END renderSemgrex - - // ---------------------------------------------------------------------------- // MAIN // ---------------------------------------------------------------------------- @@ -580,22 +431,21 @@ $(document).ready(function() { $('#submit').click(function() { // Get the text to annotate - currentQuery = $('#text').val(); - if (currentQuery == '') { - currentQuery = 'The quick brown fox jumped over the lazy dog.'; - $('#text').val(currentQuery); + text = $('#text').val(); + if (text == '') { + text = 'My dog also likes eating sausage.'; + $('#text').val(text); } // Update the UI $('#submit').prop('disabled', true); $('#annotations').hide(); - $('#patterns_row').hide(); $('#loading').show(); // Run query $.ajax({ type: 'POST', url: serverAddress + '?properties=' + encodeURIComponent('{"annotators": "' + annotators() + '"}'), - data: currentQuery, + data: text, success: function(data) { $('#submit').prop('disabled', false); if (typeof data == undefined || data.sentences == undefined) { @@ -636,88 +486,10 @@ $(document).ready(function() { 
createAnnotationDiv('coref', 'dcoref', 'corefs', 'Coreference' ); // Update UI $('#loading').hide(); - $('.corenlp_error').remove(); // Clear error messages $('#annotations').show(); // Render render(data); - // Render patterns - $('#annotations').append('
CoreNLP Tools:
    '); // TODO(gabor) a strange place to add this header to - $('#patterns_row').show(); } - }, - error: function(data) { - var alertDiv = $('
    ').addClass('alert').addClass('alert-danger').addClass('alert-dismissible').addClass('corenlp_error').attr('role', 'alert') - var button = $(''); - var message = $('').text(data.responseText); - button.appendTo(alertDiv); - message.appendTo(alertDiv); - $('#loading').hide(); - alertDiv.appendTo($('#errors')); - $('#submit').prop('disabled', false); - } - }); - }); - - - $('#form_tokensregex').submit( function (e) { - // Don't actually submit the form - e.preventDefault(); - // Get text - if ($('#tokensregex_search').val().trim() == '') { - $('#tokensregex_search').val('(?$foxtype [{pos:JJ}]* ) fox'); - } - var pattern = $('#tokensregex_search').val(); - // Remove existing annotation - $('#tokensregex').remove(); - // Make ajax call - $.ajax({ - type: 'POST', - url: serverAddress + 'tokensregex?pattern=' + encodeURIComponent(pattern.replace("&", "\\&")), - data: currentQuery, - success: function(data) { - $('.tokensregex_error').remove(); // Clear error messages - $('
    ').appendTo($('#div_tokensregex')); - renderTokensregex(data); - }, - error: function(data) { - var alertDiv = $('
    ').addClass('alert').addClass('alert-danger').addClass('alert-dismissible').addClass('tokensregex_error').attr('role', 'alert') - var button = $(''); - var message = $('').text(data.responseText); - button.appendTo(alertDiv); - message.appendTo(alertDiv); - alertDiv.appendTo($('#div_tokensregex')); - } - }); - }); - - - $('#form_semgrex').submit( function (e) { - // Don't actually submit the form - e.preventDefault(); - // Get text - if ($('#semgrex_search').val().trim() == '') { - $('#semgrex_search').val('{pos:/VB.*/} >nsubj {}=subject >/nmod:.*/ {}=prep_phrase'); - } - var pattern = $('#semgrex_search').val(); - // Remove existing annotation - $('#semgrex').remove(); - // Make ajax call - $.ajax({ - type: 'POST', - url: serverAddress + 'semgrex?pattern=' + encodeURIComponent(pattern.replace("&", "\\&")), - data: currentQuery, - success: function(data) { - $('.semgrex_error').remove(); // Clear error messages - $('
    ').appendTo($('#div_semgrex')); - renderSemgrex(data); - }, - error: function(data) { - var alertDiv = $('
    ').addClass('alert').addClass('alert-danger').addClass('alert-dismissible').addClass('semgrex_error').attr('role', 'alert') - var button = $(''); - var message = $('').text(data.responseText); - button.appendTo(alertDiv); - message.appendTo(alertDiv); - alertDiv.appendTo($('#div_semgrex')); } }); }); diff --git a/src/edu/stanford/nlp/semgraph/SemanticGraph.java b/src/edu/stanford/nlp/semgraph/SemanticGraph.java index 09846d0737..05e7d911be 100644 --- a/src/edu/stanford/nlp/semgraph/SemanticGraph.java +++ b/src/edu/stanford/nlp/semgraph/SemanticGraph.java @@ -1124,30 +1124,6 @@ public int size() { return this.vertexSet().size(); } - - /** - * Returns all nodes reachable from root. - * - * @param root the root node of the subgraph - * @return all nodes in subgraph - */ - public Set getSubgraphVertices(IndexedWord root) { - Set result = wordMapFactory.newSet(); - result.add(root); - List queue = Generics.newLinkedList(); - queue.add(root); - while (! queue.isEmpty()) { - IndexedWord current = queue.remove(0); - for (IndexedWord child : this.getChildren(current)) { - if ( ! result.contains(child)) { - result.add(child); - queue.add(child); - } - } - } - return result; - } - /** * @return true if the graph contains no cycles. */ @@ -1164,26 +1140,6 @@ public boolean isDag() { return true; } - /** - * - * @param root root node of the subgraph. - * @return true if the subgraph rooted at root contains no cycles. - */ - - public boolean isDag(IndexedWord root) { - Set unused = wordMapFactory.newSet(); - unused.addAll(this.getSubgraphVertices(root)); - while (!unused.isEmpty()) { - IndexedWord arbitrary = unused.iterator().next(); - boolean result = isDagHelper(arbitrary, unused, wordMapFactory.newSet()); - if (result) { - return false; - } - } - return true; - } - - private boolean isDagHelper(IndexedWord current, Set unused, Set trail) { if (trail.contains(current)) { return true; @@ -1467,12 +1423,12 @@ public String toList() { StringBuilder buf = new StringBuilder(); for (IndexedWord root : getRoots()) { buf.append("root(ROOT-0, "); - buf.append(root.toString(CoreLabel.OutputFormat.VALUE_INDEX)).append(")\n"); + buf.append(toDepStyle(root)).append(")\n"); } for (SemanticGraphEdge edge : this.edgeListSorted()) { buf.append(edge.getRelation().toString()).append("("); - buf.append(edge.getSource().toString(CoreLabel.OutputFormat.VALUE_INDEX)).append(", "); - buf.append(edge.getTarget().toString(CoreLabel.OutputFormat.VALUE_INDEX)).append(")\n"); + buf.append(toDepStyle(edge.getSource())).append(", "); + buf.append(toDepStyle(edge.getTarget())).append(")\n"); } return buf.toString(); } @@ -1484,24 +1440,32 @@ public String toPOSList() { StringBuilder buf = new StringBuilder(); for (SemanticGraphEdge edge : this.edgeListSorted()) { buf.append(edge.getRelation().toString()).append("("); - buf.append(edge.getSource().toString()).append(","); - buf.append(edge.getTarget()).append(")\n"); + buf.append(toPOSStyle(edge.getSource())).append(","); + buf.append(toPOSStyle(edge.getTarget())).append(")\n"); } return buf.toString(); } + // todo [cdm 2013]: These next two methods should really be toString options on indexed word but are different from all the current ones.... 
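The two private helpers added just below wrap CoreLabel.OutputFormat.VALUE_INDEX and VALUE_TAG_INDEX. As a quick illustration of what those two formats print, here is a minimal, self-contained sketch; the token values ("dog", tag "NN", index 2), the outputs shown in the comments, and the class name are illustrative assumptions, not part of the patch.

    import edu.stanford.nlp.ling.CoreLabel;

    public class OutputFormatSketch {
      public static void main(String[] args) {
        // Build a single token by hand with made-up values.
        CoreLabel token = new CoreLabel();
        token.setValue("dog");
        token.setWord("dog");
        token.setTag("NN");
        token.setIndex(2);

        // VALUE_INDEX is the dependency-style rendering used by toDepStyle(), e.g. "dog-2".
        System.out.println(token.toString(CoreLabel.OutputFormat.VALUE_INDEX));

        // VALUE_TAG_INDEX additionally includes the POS tag, as used by toPOSStyle(), e.g. "dog/NN-2".
        System.out.println(token.toString(CoreLabel.OutputFormat.VALUE_TAG_INDEX));
      }
    }

The helpers in this hunk call the same formats on IndexedWord, which exposes the same toString(OutputFormat) method.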
+ + private static String toDepStyle(IndexedWord fl) { + return fl.toString(CoreLabel.OutputFormat.VALUE_INDEX); + } + + private static String toPOSStyle(IndexedWord fl) { + return fl.toString(CoreLabel.OutputFormat.VALUE_TAG_INDEX); + } + private String toReadableString() { StringBuilder buf = new StringBuilder(); buf.append(String.format("%-20s%-20s%-20s%n", "dep", "reln", "gov")); buf.append(String.format("%-20s%-20s%-20s%n", "---", "----", "---")); for (IndexedWord root : getRoots()) { - buf.append(String.format("%-20s%-20s%-20s%n", root.toString(CoreLabel.OutputFormat.VALUE_TAG_INDEX), "root", "root")); + buf.append(String.format("%-20s%-20s%-20s%n", toDepStyle(root), "root", "root")); } for (SemanticGraphEdge edge : this.edgeListSorted()) { - buf.append(String.format("%-20s%-20s%-20s%n", - edge.getTarget().toString(CoreLabel.OutputFormat.VALUE_TAG_INDEX), - edge.getRelation().toString(), - edge.getSource().toString(CoreLabel.OutputFormat.VALUE_TAG_INDEX))); + buf.append(String.format("%-20s%-20s%-20s%n", toDepStyle(edge.getTarget()), edge.getRelation().toString(), + toDepStyle(edge.getSource()))); } return buf.toString(); } diff --git a/src/edu/stanford/nlp/semgraph/semgrex/GraphRelation.java b/src/edu/stanford/nlp/semgraph/semgrex/GraphRelation.java index ad2565327d..8cb71b340e 100644 --- a/src/edu/stanford/nlp/semgraph/semgrex/GraphRelation.java +++ b/src/edu/stanford/nlp/semgraph/semgrex/GraphRelation.java @@ -27,23 +27,23 @@ * careful to make the appropriate adjustments to * getRelation(). Finally, if you are using the SemgrexParser, you * need to add the new relation symbol to the list of tokens.
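To make the extension steps described in that comment concrete, here is a condensed sketch of what a new relation could look like, modeled on the adjacency and sibling relations removed elsewhere in this patch. The relation class RIGHT_OF and its ".." symbol are invented purely for illustration, and the class is written as it would sit nested inside GraphRelation.java so that it can reach the package-private SearchNodeIterator and the private constructor.

    // Hypothetical relation "A .. B": A appears anywhere to the left of B in token order.
    static private class RIGHT_OF extends GraphRelation {

      private static final long serialVersionUID = 1L;

      RIGHT_OF(String reln, String name) {
        super("..", reln, name);
      }

      @Override
      boolean satisfies(IndexedWord l1, IndexedWord l2, SemanticGraph sg) {
        // True iff l1 precedes l2.
        return l1.index() < l2.index();
      }

      @Override
      Iterator<IndexedWord> searchNodeIterator(final IndexedWord node, final SemanticGraph sg) {
        return new SearchNodeIterator() {
          Iterator<IndexedWord> iterator;

          @Override
          public void advance() {
            if (node.equals(IndexedWord.NO_WORD)) { next = null; return; }
            if (iterator == null) {
              iterator = sg.vertexSet().iterator();
            }
            while (iterator.hasNext()) {
              IndexedWord word = iterator.next();
              if (node.index() < word.index()) { next = word; return; }
            }
            next = null;
          }
        };
      }
    }

As the comment notes, the new symbol would also have to be handled in isKnownRelation() and getRelation(), and added to the RELATION token in SemgrexParser.jj, before the parser could use it.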

    - * + * * @author Chloe Kiddon */ abstract class GraphRelation implements Serializable { final String symbol; final Predicate type; final String rawType; - + final String name; - + //"<" | ">" | ">>" | "<<" | "<#" | ">#" | ":" | "@"> /** * Returns true iff this GraphRelation holds between * the given pair of nodes in the given semantic graph. - */ + */ abstract boolean satisfies(IndexedWord n1, IndexedWord n2, SemanticGraph sg); /** @@ -58,20 +58,20 @@ private GraphRelation(String symbol, String type, String name) { this.rawType = type; this.name = name; } - + private GraphRelation(String symbol, String type) { this(symbol, type, null); } - + private GraphRelation(String symbol) { this(symbol, null); } - + @Override public String toString() { return symbol + ((rawType != null) ? rawType : "") + ((name != null) ? "=" + name : ""); } - + public Predicate getPattern(String relnType) { if ((relnType == null) || (relnType.equals(""))) { @@ -82,12 +82,12 @@ public Predicate getPattern(String relnType) return new ArrayStringFilter(ArrayStringFilter.Mode.EXACT, relnType); } } - + public String getName() { if (name == null || name == "") return null; return name; } - + // ALIGNMENT graph relation: "@" ============================================== @@ -100,14 +100,14 @@ static class ALIGNMENT extends GraphRelation { super("@", ""); hypToText = true; } - + void setAlignment(Alignment alignment, boolean hypToText, SearchNodeIterator itr) { this.alignment = alignment; this.hypToText = hypToText; //System.err.println("setting alignment"); itr.advance(); } - + @Override boolean satisfies(IndexedWord l1, IndexedWord l2, SemanticGraph sg) { if (alignment == null) return false; @@ -121,14 +121,14 @@ boolean satisfies(IndexedWord l1, IndexedWord l2, SemanticGraph sg) { @Override Iterator searchNodeIterator(final IndexedWord node, final SemanticGraph sg) { return new SearchNodeIterator() { - + boolean foundOnce = false; int nextNum; - + // not really initialized until alignment is set @Override public void initialize() { - + } @Override @@ -148,7 +148,7 @@ public void advance() { //System.err.println("next: null"); } } else { - + int num = 0; for (Map.Entry pair : alignment.getMap().entrySet()) { if (pair.getValue().equals(node)) { @@ -165,23 +165,23 @@ public void advance() { next = null; } } - + }; } - - // Generated automatically by Eclipse + + // Generated automatically by Eclipse private static final long serialVersionUID = -2936526066368043778L; }; - + // ROOT graph relation: "Root" ================================================ - + static final GraphRelation ROOT = new GraphRelation("", "") { @Override boolean satisfies(IndexedWord l1, IndexedWord l2, SemanticGraph sg) { return l1 == l2; } - + @Override Iterator searchNodeIterator(final IndexedWord node, final SemanticGraph sg) { return new SearchNodeIterator() { @@ -201,17 +201,17 @@ boolean satisfies(IndexedWord l1, IndexedWord l2, SemanticGraph sg) { return true; } - Iterator searchNodeIterator(final IndexedWord node, + Iterator searchNodeIterator(final IndexedWord node, final SemanticGraph sg) { return sg.vertexSet().iterator(); } // automatically generated by Eclipse private static final long serialVersionUID = 5259713498453659251L; }; - + // ALIGNED_ROOT graph relation: "AlignRoot" =================================== - + static final GraphRelation ALIGNED_ROOT = new GraphRelation("AlignRoot", "") { @Override boolean satisfies(IndexedWord l1, IndexedWord l2, SemanticGraph sg) { @@ -227,26 +227,26 @@ void initialize() { } }; } - + // 
automatically generated by Eclipse - private static final long serialVersionUID = -3088857488269777611L; + private static final long serialVersionUID = -3088857488269777611L; }; // GOVERNOR graph relation: ">" =============================================== - + static private class GOVERNER extends GraphRelation { GOVERNER(String reln, String name) { super(">", reln, name); } - + @Override boolean satisfies(IndexedWord l1, IndexedWord l2, SemanticGraph sg) { List> deps = sg.childPairs(l1); for (Pair dep : deps) { if (this.type.test(dep.first().toString()) && dep.second().equals(l2)) { - return true; + return true; } } return false; @@ -279,14 +279,14 @@ public void advance() { } }; } - + // automatically generated by Eclipse private static final long serialVersionUID = -7003148918274183951L; }; - + // DEPENDENT graph relation: "<" =============================================== - + static private class DEPENDENT extends GraphRelation { DEPENDENT(String reln, String name) { super("<", reln, name); @@ -294,12 +294,12 @@ static private class DEPENDENT extends GraphRelation { @Override boolean satisfies(IndexedWord l1, IndexedWord l2, SemanticGraph sg) { - if (l1.equals(IndexedWord.NO_WORD) || l2.equals(IndexedWord.NO_WORD) ) + if (l1.equals(IndexedWord.NO_WORD) || l2.equals(IndexedWord.NO_WORD) ) return false; List> govs = sg.parentPairs(l1); for (Pair gov : govs) { if (this.type.test(gov.first().toString()) && - gov.second().equals(l2)) return true; + gov.second().equals(l2)) return true; } return false; } @@ -339,21 +339,21 @@ public void advance() { // automatically generated by Eclipse private static final long serialVersionUID = -5115389883698108694L; }; - + static private class LIMITED_GRANDPARENT extends GraphRelation { final int startDepth, endDepth; - - LIMITED_GRANDPARENT(String reln, String name, + + LIMITED_GRANDPARENT(String reln, String name, int startDepth, int endDepth) { super(startDepth + "," + endDepth + ">>", reln, name); this.startDepth = startDepth; this.endDepth = endDepth; } - + @Override boolean satisfies(IndexedWord l1, IndexedWord l2, SemanticGraph sg) { - if (l1.equals(IndexedWord.NO_WORD) || l2.equals(IndexedWord.NO_WORD) ) + if (l1.equals(IndexedWord.NO_WORD) || l2.equals(IndexedWord.NO_WORD) ) return false; List> usedNodes = new ArrayList>(); for (int i = 0; i <= endDepth; ++i) { @@ -361,7 +361,7 @@ boolean satisfies(IndexedWord l1, IndexedWord l2, SemanticGraph sg) { } return l1 != l2 && satisfyHelper(l1, l2, sg, 0, usedNodes); } - + private boolean satisfyHelper(IndexedWord parent, IndexedWord l2, SemanticGraph sg, @@ -374,15 +374,15 @@ private boolean satisfyHelper(IndexedWord parent, if (depth + 1 >= startDepth) { for (Pair dep : deps) { if (this.type.test(dep.first().toString()) && - dep.second().equals(l2)) return true; + dep.second().equals(l2)) return true; } } - + usedNodes.get(depth).add(parent); - + for (Pair dep : deps) { - if ((usedNodes.size() < depth + 1 || - !usedNodes.get(depth + 1).contains(dep.second())) && + if ((usedNodes.size() < depth + 1 || + !usedNodes.get(depth + 1).contains(dep.second())) && satisfyHelper(dep.second(), l2, sg, depth + 1, usedNodes)) return true; } @@ -447,12 +447,12 @@ void advance() { if (thisSeen.contains(nextPair.second())) { continue; } - + thisSeen.add(nextPair.second()); - List> children = + List> children = sg.childPairs(nextPair.second()); for (int i = children.size() - 1; i >= 0; i--) { - if (nextSeen != null && + if (nextSeen != null && !nextSeen.contains(children.get(i).second())) 
nextStack.push(children.get(i)); } @@ -473,7 +473,7 @@ void advance() { } }; } - + // automatically generated by Eclipse private static final long serialVersionUID = 1L; }; @@ -484,7 +484,7 @@ void advance() { *
    * In general, the only differences are which ways to go on edges, * so that is gotten through abstract methods - */ + */ static private abstract class GRANDSOMETHING extends GraphRelation { GRANDSOMETHING(String symbol, String reln, String name) { super(symbol, reln, name); @@ -495,22 +495,22 @@ static private abstract class GRANDSOMETHING extends GraphRelation { abstract Iterator neighborIterator(SemanticGraph sg, IndexedWord search); abstract IndexedWord followEdge(SemanticGraphEdge edge); - + @Override boolean satisfies(IndexedWord l1, IndexedWord l2, SemanticGraph sg) { return l1 != l2 && satisfyHelper(l1, l2, sg, Generics.newIdentityHashSet()); } - + private boolean satisfyHelper(IndexedWord node, IndexedWord l2, SemanticGraph sg, Set usedNodes) { List> govs = getNeighborPairs(sg, node); for (Pair gov : govs) { if (this.type.test(gov.first().toString()) && - gov.second().equals(l2)) return true; + gov.second().equals(l2)) return true; } - + usedNodes.add(node); - + for (Pair gov : govs) { if (!usedNodes.contains(gov.second()) && satisfyHelper(gov.second(), l2, sg, usedNodes)) return true; @@ -526,7 +526,7 @@ Iterator searchNodeIterator(final IndexedWord node, final SemanticG Set matchedNodes; Iterator neighborIterator; - + @Override public void initialize() { if (node.equals(IndexedWord.NO_WORD)) { @@ -574,13 +574,13 @@ void advance() { } }; } - - // automatically generated by Eclipse + + // automatically generated by Eclipse private static final long serialVersionUID = 1L; }; - + // GRANDPARENT graph relation: ">>" =========================================== - + static private class GRANDPARENT extends GRANDSOMETHING { GRANDPARENT(String reln, String name) { super(">>", reln, name); @@ -600,13 +600,13 @@ Iterator neighborIterator(SemanticGraph sg, IndexedWord searc IndexedWord followEdge(SemanticGraphEdge edge) { return edge.getTarget(); } - + // automatically generated by Eclipse private static final long serialVersionUID = 1L; } - + // GRANDKID graph relation: "<<" ============================================== - + static private class GRANDKID extends GRANDSOMETHING { GRANDKID(String reln, String name) { super("<<", reln, name); @@ -626,25 +626,25 @@ Iterator neighborIterator(SemanticGraph sg, IndexedWord searc IndexedWord followEdge(SemanticGraphEdge edge) { return edge.getSource(); } - + // automatically generated by copying some other serialVersionUID private static final long serialVersionUID = 1L; } - + static private class LIMITED_GRANDKID extends GraphRelation { final int startDepth, endDepth; - - LIMITED_GRANDKID(String reln, String name, + + LIMITED_GRANDKID(String reln, String name, int startDepth, int endDepth) { super(startDepth + "," + endDepth + "<<", reln, name); this.startDepth = startDepth; this.endDepth = endDepth; } - + @Override boolean satisfies(IndexedWord l1, IndexedWord l2, SemanticGraph sg) { - if (l1.equals(IndexedWord.NO_WORD) || l2.equals(IndexedWord.NO_WORD) ) + if (l1.equals(IndexedWord.NO_WORD) || l2.equals(IndexedWord.NO_WORD) ) return false; List> usedNodes = new ArrayList>(); for (int i = 0; i <= endDepth; ++i) { @@ -652,7 +652,7 @@ boolean satisfies(IndexedWord l1, IndexedWord l2, SemanticGraph sg) { } return l1 != l2 && satisfyHelper(l1, l2, sg, 0, usedNodes); } - + private boolean satisfyHelper(IndexedWord child, IndexedWord l2, SemanticGraph sg, @@ -665,15 +665,15 @@ private boolean satisfyHelper(IndexedWord child, if (depth + 1 >= startDepth) { for (Pair dep : deps) { if (this.type.test(dep.first().toString()) && - 
dep.second().equals(l2)) return true; + dep.second().equals(l2)) return true; } } - + usedNodes.get(depth).add(child); - + for (Pair dep : deps) { - if ((usedNodes.size() < depth + 1 || - !usedNodes.get(depth + 1).contains(dep.second())) && + if ((usedNodes.size() < depth + 1 || + !usedNodes.get(depth + 1).contains(dep.second())) && satisfyHelper(dep.second(), l2, sg, depth + 1, usedNodes)) return true; } @@ -738,12 +738,12 @@ void advance() { if (thisSeen.contains(nextPair.second())) { continue; } - + thisSeen.add(nextPair.second()); - List> parents = + List> parents = sg.parentPairs(nextPair.second()); for (int i = parents.size() - 1; i >= 0; i--) { - if (nextSeen != null && + if (nextSeen != null && !nextSeen.contains(parents.get(i).second())) nextStack.push(parents.get(i)); } @@ -764,7 +764,7 @@ void advance() { } }; } - + // automatically generated by Eclipse private static final long serialVersionUID = 1L; }; @@ -803,177 +803,14 @@ public void advance() { } }; } - } - - static private abstract class SIBLING_RELATION extends GraphRelation { - - private static final long serialVersionUID = 1L; - - SIBLING_RELATION(String symbol, String reln, String name) { - super(symbol, reln, name); - } - - abstract boolean satisfiesOrder(IndexedWord l1, IndexedWord l2); - - @Override - boolean satisfies(IndexedWord l1, IndexedWord l2, SemanticGraph sg) { - IndexedWord parent = sg.getCommonAncestor(l1, l2); - Set l1Parents = sg.getParents(l1); - if (parent != null - && l1Parents.contains(parent) - && satisfiesOrder(l1, l2)) { - return true; - } - return false; - } - - @Override - Iterator searchNodeIterator(final IndexedWord node, final SemanticGraph sg) { - return new SearchNodeIterator() { - Iterator iterator; - - @Override - public void advance() { - if (node.equals(IndexedWord.NO_WORD)) { - next = null; - return; - } - - if (iterator == null) { - Set parents = sg.getParents(node); - Set neighbors = Generics.newIdentityHashSet(); - for (IndexedWord parent : parents) { - neighbors.addAll(sg.getChildren(parent)); - } - iterator = neighbors.iterator(); - } - - while (iterator.hasNext()) { - IndexedWord word = iterator.next(); - if ( ! 
satisfiesOrder(node, word)) { - continue; - } - this.next = word; - return; - } - this.next = null; - } - }; - } - - } - - - static private class RIGHT_IMMEDIATE_SIBLING extends SIBLING_RELATION { - - RIGHT_IMMEDIATE_SIBLING(String reln, String name) { - super("$+", reln, name); - } - - private static final long serialVersionUID = 1L; - - boolean satisfiesOrder(IndexedWord l1, IndexedWord l2) { - return (l1.index() == (l2.index() - 1)); - } - } - - static private class LEFT_IMMEDIATE_SIBLING extends SIBLING_RELATION { - - LEFT_IMMEDIATE_SIBLING(String reln, String name) { - super("$-", reln, name); - } - - private static final long serialVersionUID = 1L; - - boolean satisfiesOrder(IndexedWord l1, IndexedWord l2) { - return (l1.index() == (l2.index() + 1)); - } - } - - static private class RIGHT_SIBLING extends SIBLING_RELATION { - - RIGHT_SIBLING(String reln, String name) { - super("$++", reln, name); - } - - private static final long serialVersionUID = 1L; - - boolean satisfiesOrder(IndexedWord l1, IndexedWord l2) { - return (l1.index() < l2.index()); - } - } - - static private class LEFT_SIBLING extends SIBLING_RELATION { - - LEFT_SIBLING(String reln, String name) { - super("$--", reln, name); - } - - private static final long serialVersionUID = 1L; - - boolean satisfiesOrder(IndexedWord l1, IndexedWord l2) { - return (l1.index() > l2.index()); - } - } - - static private class ADJACENT_NODE extends GraphRelation { - - private static final long serialVersionUID = 1L; - - ADJACENT_NODE(String reln, String name) { - super(".", reln, name); - } - - - @Override - boolean satisfies(IndexedWord l1, IndexedWord l2, SemanticGraph sg) { - if (l1.index() == (l2.index() - 1)) { - return true; - } - return false; - } - - @Override - Iterator searchNodeIterator(final IndexedWord node, final SemanticGraph sg) { - return new SearchNodeIterator() { - Iterator iterator; - - @Override - public void advance() { - if (node.equals(IndexedWord.NO_WORD)) { - next = null; - return; - } - - if (iterator == null) { - iterator = sg.vertexSet().iterator(); - } - - while (iterator.hasNext()) { - IndexedWord word = iterator.next(); - if (node.index() != (word.index() - 1)) { - continue; - } - this.next = word; - return; - } - this.next = null; - } - }; - } - - } - + } // ============================================================================ - + public static boolean isKnownRelation(String reln) { - return (reln.equals(">") || reln.equals("<") || + return (reln.equals(">") || reln.equals("<") || reln.equals(">>") || reln.equals("<<") || - reln.equals("@") || reln.equals("==") || - reln.equals("$+") || reln.equals("$++") || - reln.equals("$-") || reln.equals("$--") || - reln.equals(".")); + reln.equals("@") || reln.equals("==")); } public static GraphRelation getRelation(String reln, @@ -995,16 +832,6 @@ public static GraphRelation getRelation(String reln, return new GRANDKID(type, name); case "==": return new EQUALS(type, name); - case "$+": - return new RIGHT_IMMEDIATE_SIBLING(type, name); - case "$-": - return new LEFT_IMMEDIATE_SIBLING(type, name); - case "$++": - return new RIGHT_SIBLING(type, name); - case "$--": - return new LEFT_SIBLING(type, name); - case ".": - return new ADJACENT_NODE(type, name); case "@": return new ALIGNMENT(); default: @@ -1013,7 +840,7 @@ public static GraphRelation getRelation(String reln, " not handled by getRelation"); } } - + public static GraphRelation getRelation(String reln, String type, int num, @@ -1025,13 +852,13 @@ public static GraphRelation getRelation(String reln, 
else if (reln.equals("<<")) return new LIMITED_GRANDKID(type, name, num, num); else if (isKnownRelation(reln)) - throw new ParseException("Relation " + reln + + throw new ParseException("Relation " + reln + " does not use numeric arguments"); else //error throw new ParseException("Unrecognized compound relation " + reln + " " + type); } - + public static GraphRelation getRelation(String reln, String type, int num, int num2, @@ -1043,18 +870,18 @@ public static GraphRelation getRelation(String reln, else if (reln.equals("<<")) return new LIMITED_GRANDKID(type, name, num, num2); else if (isKnownRelation(reln)) - throw new ParseException("Relation " + reln + + throw new ParseException("Relation " + reln + " does not use numeric arguments"); else //error throw new ParseException("Unrecognized compound relation " + reln + " " + type); } - + @Override public int hashCode() { return symbol.hashCode(); } - + @Override public boolean equals(Object o) { if (this == o) { @@ -1065,7 +892,7 @@ public boolean equals(Object o) { } final GraphRelation relation = (GraphRelation) o; - + if (!symbol.equals(relation.symbol) || !type.equals(relation.type)) { return false; @@ -1073,7 +900,7 @@ public boolean equals(Object o) { return true; } - + /** * This abstract Iterator implements a NULL iterator, but by subclassing and * overriding advance and/or initialize, it is an efficient implementation. @@ -1093,7 +920,7 @@ public SearchNodeIterator() { * Current relation string for next; */ String relation = null; - + /** * This method must insure that next points to first item, or null if there * are no items. @@ -1122,14 +949,14 @@ public IndexedWord next() { advance(); return ret; } - + String getReln() {return relation;} public void remove() { throw new UnsupportedOperationException("SearchNodeIterator does not support remove()."); } } - + // Automatically generated by Eclipse private static final long serialVersionUID = -9128973950911993056L; } diff --git a/src/edu/stanford/nlp/semgraph/semgrex/ParseException.java b/src/edu/stanford/nlp/semgraph/semgrex/ParseException.java index 64dfb464c1..dfa333d2e9 100644 --- a/src/edu/stanford/nlp/semgraph/semgrex/ParseException.java +++ b/src/edu/stanford/nlp/semgraph/semgrex/ParseException.java @@ -1,4 +1,4 @@ -/* Generated By:JavaCC: Do not edit this line. ParseException.java Version 5.0 */ +/* Generated By:JavaCC: Do not edit this line. ParseException.java Version 6.0 */ /* JavaCCOptions:KEEP_LINE_COL=null */ package edu.stanford.nlp.semgraph.semgrex; @@ -184,4 +184,4 @@ static String add_escapes(String str) { } } -/* JavaCC - OriginalChecksum=019e10052e79215d42e06dc8aadc24a7 (do not edit this line) */ +/* JavaCC - OriginalChecksum=f884420d3e828b4d12c552a8746b830e (do not edit this line) */ diff --git a/src/edu/stanford/nlp/semgraph/semgrex/SemgrexBatchParser.java b/src/edu/stanford/nlp/semgraph/semgrex/SemgrexBatchParser.java index 6297d301ca..cb112631f4 100644 --- a/src/edu/stanford/nlp/semgraph/semgrex/SemgrexBatchParser.java +++ b/src/edu/stanford/nlp/semgraph/semgrex/SemgrexBatchParser.java @@ -10,12 +10,14 @@ import java.util.regex.Matcher; import java.util.regex.Pattern; +import edu.stanford.nlp.stats.ClassicCounter; +import edu.stanford.nlp.stats.Counter; import edu.stanford.nlp.util.Generics; import edu.stanford.nlp.util.Pair; /** - * Parses a batch of SemgrexPatterns from a stream. - * Each SemgrexPattern must be defined in a single line. 
+ * Parses a batch of SemgrexPatterns from a stream + * Each SemgrexPattern must be defined in a single line * This includes a preprocessor that supports macros, defined as: "macro NAME = VALUE" and used as ${NAME} * For example: * # lines starting with the pound sign are skipped @@ -23,64 +25,61 @@ * {}=entity >appos ({lemma:/${JOB}/} >nn {ner:ORGANIZATION}=slot) */ public class SemgrexBatchParser { - /** Maximum stream size in characters */ private static final int MAX_STREAM_SIZE = 1024 * 1024; - private SemgrexBatchParser() { } // static methods class - - public static List compileStream(InputStream is) throws IOException { + public List compileStream(InputStream is) throws IOException { return compileStream(is, null); } - public static List compileStream(InputStream is, Env env) throws IOException { - BufferedReader reader = new BufferedReader(new InputStreamReader(is)); - reader.mark(MAX_STREAM_SIZE); - Map macros = preprocess(reader); - reader.reset(); - return parse(reader, macros, env); - } - - private static List parse(BufferedReader reader, Map macros, Env env) throws IOException { - List patterns = new ArrayList<>(); - for(String line; (line = reader.readLine()) != null; ) { + public List compileStream(InputStream is, Env env) throws IOException { + BufferedReader reader = new BufferedReader(new InputStreamReader(is)); + reader.mark(MAX_STREAM_SIZE); + Map macros = preprocess(reader); + reader.reset(); + return parse(reader, macros, env); + } + + private List parse(BufferedReader reader, Map macros, Env env) throws IOException { + List patterns = new ArrayList(); + for(String line; (line = reader.readLine()) != null; ) { line = line.trim(); - if(line.isEmpty() || line.startsWith("#")) continue; + if(line.length() == 0 || line.startsWith("#")) continue; if(line.startsWith("macro ")) continue; line = replaceMacros(line, macros); SemgrexPattern pattern = SemgrexPattern.compile(line, env); patterns.add(pattern); - } - return patterns; - } - - private static final Pattern MACRO_NAME_PATTERN = Pattern.compile("\\$\\{[a-z0-9]+\\}", Pattern.CASE_INSENSITIVE); - - private static String replaceMacros(String line, Map macros) { - StringBuilder out = new StringBuilder(); - Matcher matcher = MACRO_NAME_PATTERN.matcher(line); - int offset = 0; - while(matcher.find(offset)) { - int start = matcher.start(); - int end = matcher.end(); - String name = line.substring(start + 2, end - 1); - String value = macros.get(name); - if(value == null){ - throw new RuntimeException("ERROR: Unknown macro \"" + name + "\"!"); - } - if(start > offset) { - out.append(line.substring(offset, start)); - } - out.append(value); - offset = end; - } - if(offset < line.length()) out.append(line.substring(offset)); - String postProcessed = out.toString(); - if(! 
postProcessed.equals(line)) System.err.println("Line \"" + line + "\" changed to \"" + postProcessed + '"'); - return postProcessed; - } - - private static Map preprocess(BufferedReader reader) throws IOException { + } + return patterns; + } + + private static final Pattern MACRO_NAME_PATTERN = Pattern.compile("\\$\\{[a-z0-9]+\\}", Pattern.CASE_INSENSITIVE); + + private String replaceMacros(String line, Map macros) { + StringBuffer out = new StringBuffer(); + Matcher matcher = MACRO_NAME_PATTERN.matcher(line); + int offset = 0; + while(matcher.find(offset)) { + int start = matcher.start(); + int end = matcher.end(); + String name = line.substring(start + 2, end - 1); + String value = macros.get(name); + if(value == null){ + throw new RuntimeException("ERROR: Unknown macro \"" + name + "\"!"); + } + if(start > offset) { + out.append(line.substring(offset, start)); + } + out.append(value); + offset = end; + } + if(offset < line.length()) out.append(line.substring(offset)); + String postProcessed = out.toString(); + if(! postProcessed.equals(line)) System.err.println("Line \"" + line + "\" changed to \"" + postProcessed + "\""); + return postProcessed; + } + + private Map preprocess(BufferedReader reader) throws IOException { Map macros = Generics.newHashMap(); for(String line; (line = reader.readLine()) != null; ) { line = line.trim(); @@ -91,22 +90,21 @@ private static Map preprocess(BufferedReader reader) throws IOEx } return macros; } - - private static Pair extractMacro(String line) { + + private Pair extractMacro(String line) { assert(line.startsWith("macro")); int equalPosition = line.indexOf('='); if(equalPosition < 0) { throw new RuntimeException("ERROR: Invalid syntax in macro line: \"" + line + "\"!"); } String name = line.substring(5, equalPosition).trim(); - if(name.isEmpty()) { + if(name.length() == 0) { throw new RuntimeException("ERROR: Invalid syntax in macro line: \"" + line + "\"!"); } String value = line.substring(equalPosition + 1).trim(); - if(value.isEmpty()) { + if(value.length() == 0) { throw new RuntimeException("ERROR: Invalid syntax in macro line: \"" + line + "\"!"); } - return new Pair<>(name, value); + return new Pair(name, value); } - } diff --git a/src/edu/stanford/nlp/semgraph/semgrex/SemgrexParser.java b/src/edu/stanford/nlp/semgraph/semgrex/SemgrexParser.java index 9b3f86a9ac..f0fb0a90e1 100644 --- a/src/edu/stanford/nlp/semgraph/semgrex/SemgrexParser.java +++ b/src/edu/stanford/nlp/semgraph/semgrex/SemgrexParser.java @@ -1,3 +1,4 @@ +/* SemgrexParser.java */ /* Generated By:JavaCC: Do not edit this line. 
SemgrexParser.java */ package edu.stanford.nlp.semgraph.semgrex; // all generated classes are in this package @@ -17,98 +18,105 @@ class SemgrexParser implements SemgrexParserConstants { // lets us make sure we don't name new nodes under a negation private Set knownVariables = Generics.newHashSet(); - final public SemgrexPattern Root() throws ParseException { - SemgrexPattern node; + final public SemgrexPattern Root() throws ParseException {SemgrexPattern node; Token reverse = null; List children = new ArrayList(); - switch ((jj_ntk==-1)?jj_ntk():jj_ntk) { - case ALIGNRELN: + // a local variable + + switch ((jj_ntk==-1)?jj_ntk_f():jj_ntk) { + case ALIGNRELN:{ reverse = jj_consume_token(ALIGNRELN); node = SubNode(GraphRelation.ALIGNED_ROOT); jj_consume_token(11); break; + } case 13: case 17: case 19: - case 23: + case 23:{ node = SubNode(GraphRelation.ROOT); - children.add(node); +children.add(node); label_1: while (true) { - switch ((jj_ntk==-1)?jj_ntk():jj_ntk) { - case 12: + switch ((jj_ntk==-1)?jj_ntk_f():jj_ntk) { + case 12:{ ; break; + } default: jj_la1[0] = jj_gen; break label_1; } jj_consume_token(12); node = SubNode(GraphRelation.ITERATOR); - children.add(node); +children.add(node); } jj_consume_token(11); break; + } default: jj_la1[1] = jj_gen; jj_consume_token(-1); throw new ParseException(); } - if (children.size() <= 1) - {if (true) return node;} - {if (true) return new CoordinationPattern(true, children, true);} +if (children.size() <= 1) + {if ("" != null) return node;} + {if ("" != null) return new CoordinationPattern(true, children, true);} throw new Error("Missing return statement in function"); } - final public SemgrexPattern SubNode(GraphRelation r) throws ParseException { - SemgrexPattern result = null; + final public SemgrexPattern SubNode(GraphRelation r) throws ParseException {SemgrexPattern result = null; SemgrexPattern child = null; - switch ((jj_ntk==-1)?jj_ntk():jj_ntk) { - case 13: + switch ((jj_ntk==-1)?jj_ntk_f():jj_ntk) { + case 13:{ jj_consume_token(13); result = SubNode(r); jj_consume_token(14); - switch ((jj_ntk==-1)?jj_ntk():jj_ntk) { + switch ((jj_ntk==-1)?jj_ntk_f():jj_ntk) { case RELATION: case ALIGNRELN: case IDENTIFIER: case 17: case 18: - case 19: + case 19:{ child = RelationDisj(); break; + } default: jj_la1[2] = jj_gen; ; } - if (child != null) { +if (child != null) { List newChildren = new ArrayList(); newChildren.addAll(result.getChildren()); newChildren.add(child); result.setChild(new CoordinationPattern(false, newChildren, true)); } - {if (true) return result;} + {if ("" != null) return result;} break; + } case 17: case 19: - case 23: + case 23:{ result = ModNode(r); - switch ((jj_ntk==-1)?jj_ntk():jj_ntk) { + switch ((jj_ntk==-1)?jj_ntk_f():jj_ntk) { case RELATION: case ALIGNRELN: case IDENTIFIER: case 17: case 18: - case 19: + case 19:{ child = RelationDisj(); break; + } default: jj_la1[3] = jj_gen; ; } - if (child != null) result.setChild(child); - {if (true) return result;} +if (child != null) result.setChild(child); + {if ("" != null) return result;} break; + } default: jj_la1[4] = jj_gen; jj_consume_token(-1); @@ -117,129 +125,130 @@ final public SemgrexPattern SubNode(GraphRelation r) throws ParseException { throw new Error("Missing return statement in function"); } - final public SemgrexPattern RelationDisj() throws ParseException { - SemgrexPattern child; + final public SemgrexPattern RelationDisj() throws ParseException {SemgrexPattern child; List children = new ArrayList(); child = RelationConj(); - children.add(child); 
+children.add(child); label_2: while (true) { - switch ((jj_ntk==-1)?jj_ntk():jj_ntk) { - case 15: + switch ((jj_ntk==-1)?jj_ntk_f():jj_ntk) { + case 15:{ ; break; + } default: jj_la1[5] = jj_gen; break label_2; } jj_consume_token(15); child = RelationConj(); - children.add(child); +children.add(child); } - if (children.size() == 1) { - {if (true) return child;} - } else { - {if (true) return new CoordinationPattern(false, children, false);} - } +if (children.size() == 1) + {if ("" != null) return child;} + else + {if ("" != null) return new CoordinationPattern(false, children, false);} throw new Error("Missing return statement in function"); } - final public SemgrexPattern RelationConj() throws ParseException { - SemgrexPattern child; + final public SemgrexPattern RelationConj() throws ParseException {SemgrexPattern child; List children = new ArrayList(); child = ModRelation(); - children.add(child); +children.add(child); label_3: while (true) { - switch ((jj_ntk==-1)?jj_ntk():jj_ntk) { + switch ((jj_ntk==-1)?jj_ntk_f():jj_ntk) { case RELATION: case ALIGNRELN: case IDENTIFIER: case 16: case 17: case 18: - case 19: + case 19:{ ; break; + } default: jj_la1[6] = jj_gen; break label_3; } - switch ((jj_ntk==-1)?jj_ntk():jj_ntk) { - case 16: + switch ((jj_ntk==-1)?jj_ntk_f():jj_ntk) { + case 16:{ jj_consume_token(16); break; + } default: jj_la1[7] = jj_gen; ; } child = ModRelation(); - children.add(child); +children.add(child); } - if (children.size() == 1) { - {if (true) return child;} - } else { - {if (true) return new CoordinationPattern(false, children, true);} - } +if (children.size() == 1) + {if ("" != null) return child;} + else + {if ("" != null) return new CoordinationPattern(false, children, true);} throw new Error("Missing return statement in function"); } - final public SemgrexPattern ModRelation() throws ParseException { - SemgrexPattern child; + final public SemgrexPattern ModRelation() throws ParseException {SemgrexPattern child; boolean startUnderNeg; - switch ((jj_ntk==-1)?jj_ntk():jj_ntk) { + switch ((jj_ntk==-1)?jj_ntk_f():jj_ntk) { case RELATION: case ALIGNRELN: case IDENTIFIER: - case 19: + case 19:{ child = RelChild(); break; - case 17: + } + case 17:{ jj_consume_token(17); - startUnderNeg = underNegation; +startUnderNeg = underNegation; underNegation = true; child = RelChild(); - underNegation = startUnderNeg; - child.negate(); +underNegation = startUnderNeg; +child.negate(); break; - case 18: + } + case 18:{ jj_consume_token(18); child = RelChild(); - child.makeOptional(); +child.makeOptional(); break; + } default: jj_la1[8] = jj_gen; jj_consume_token(-1); throw new ParseException(); } - {if (true) return child;} +{if ("" != null) return child;} throw new Error("Missing return statement in function"); } - final public SemgrexPattern RelChild() throws ParseException { - SemgrexPattern child; - switch ((jj_ntk==-1)?jj_ntk():jj_ntk) { - case 19: + final public SemgrexPattern RelChild() throws ParseException {SemgrexPattern child; + switch ((jj_ntk==-1)?jj_ntk_f():jj_ntk) { + case 19:{ jj_consume_token(19); child = RelationDisj(); jj_consume_token(20); break; + } case RELATION: case ALIGNRELN: - case IDENTIFIER: + case IDENTIFIER:{ child = Relation(); break; + } default: jj_la1[9] = jj_gen; jj_consume_token(-1); throw new ParseException(); } - {if (true) return child;} +{if ("" != null) return child;} throw new Error("Missing return statement in function"); } - final public SemgrexPattern Relation() throws ParseException { - GraphRelation reln; + final public SemgrexPattern 
Relation() throws ParseException {GraphRelation reln; Token rel = null; Token relnType = null; Token numArg = null; @@ -247,66 +256,74 @@ final public SemgrexPattern Relation() throws ParseException { Token name = null; SemgrexPattern node; boolean pC = false; - switch ((jj_ntk==-1)?jj_ntk():jj_ntk) { + switch ((jj_ntk==-1)?jj_ntk_f():jj_ntk) { case RELATION: - case IDENTIFIER: - switch ((jj_ntk==-1)?jj_ntk():jj_ntk) { - case IDENTIFIER: + case IDENTIFIER:{ + switch ((jj_ntk==-1)?jj_ntk_f():jj_ntk) { + case IDENTIFIER:{ numArg = jj_consume_token(IDENTIFIER); - switch ((jj_ntk==-1)?jj_ntk():jj_ntk) { - case 21: + switch ((jj_ntk==-1)?jj_ntk_f():jj_ntk) { + case 21:{ jj_consume_token(21); numArg2 = jj_consume_token(IDENTIFIER); break; + } default: jj_la1[10] = jj_gen; ; } break; + } default: jj_la1[11] = jj_gen; ; } rel = jj_consume_token(RELATION); - switch ((jj_ntk==-1)?jj_ntk():jj_ntk) { + switch ((jj_ntk==-1)?jj_ntk_f():jj_ntk) { case IDENTIFIER: - case REGEX: - switch ((jj_ntk==-1)?jj_ntk():jj_ntk) { - case IDENTIFIER: + case REGEX:{ + switch ((jj_ntk==-1)?jj_ntk_f():jj_ntk) { + case IDENTIFIER:{ relnType = jj_consume_token(IDENTIFIER); break; - case REGEX: + } + case REGEX:{ relnType = jj_consume_token(REGEX); break; + } default: jj_la1[12] = jj_gen; jj_consume_token(-1); throw new ParseException(); } break; + } default: jj_la1[13] = jj_gen; ; } - switch ((jj_ntk==-1)?jj_ntk():jj_ntk) { - case 22: + switch ((jj_ntk==-1)?jj_ntk_f():jj_ntk) { + case 22:{ jj_consume_token(22); name = jj_consume_token(IDENTIFIER); break; + } default: jj_la1[14] = jj_gen; ; } break; - case ALIGNRELN: + } + case ALIGNRELN:{ rel = jj_consume_token(ALIGNRELN); break; + } default: jj_la1[15] = jj_gen; jj_consume_token(-1); throw new ParseException(); } - if (numArg == null && numArg2 == null) { +if (numArg == null && numArg2 == null) { reln = GraphRelation.getRelation(rel != null ? rel.image : null, relnType != null ? relnType.image : null, name != null ? name.image : null); @@ -322,134 +339,138 @@ final public SemgrexPattern Relation() throws ParseException { Integer.parseInt(numArg2.image), name != null ? 
name.image : null); } - switch ((jj_ntk==-1)?jj_ntk():jj_ntk) { + switch ((jj_ntk==-1)?jj_ntk_f():jj_ntk) { case 17: case 19: - case 23: + case 23:{ node = ModNode(reln); break; - case 13: + } + case 13:{ jj_consume_token(13); node = SubNode(reln); jj_consume_token(14); break; + } default: jj_la1[16] = jj_gen; jj_consume_token(-1); throw new ParseException(); } - {if (true) return node;} +{if ("" != null) return node;} throw new Error("Missing return statement in function"); } - final public SemgrexPattern NodeDisj(GraphRelation r) throws ParseException { - SemgrexPattern child; + final public SemgrexPattern NodeDisj(GraphRelation r) throws ParseException {SemgrexPattern child; List children = new ArrayList(); jj_consume_token(19); child = NodeConj(r); - children.add(child); +children.add(child); label_4: while (true) { - switch ((jj_ntk==-1)?jj_ntk():jj_ntk) { - case 15: + switch ((jj_ntk==-1)?jj_ntk_f():jj_ntk) { + case 15:{ ; break; + } default: jj_la1[17] = jj_gen; break label_4; } jj_consume_token(15); child = NodeConj(r); - children.add(child); +children.add(child); } jj_consume_token(20); - if (children.size() == 1) - {if (true) return child;} +if (children.size() == 1) + {if ("" != null) return child;} else - {if (true) return new CoordinationPattern(true, children, false);} + {if ("" != null) return new CoordinationPattern(true, children, false);} throw new Error("Missing return statement in function"); } - final public SemgrexPattern NodeConj(GraphRelation r) throws ParseException { - SemgrexPattern child; + final public SemgrexPattern NodeConj(GraphRelation r) throws ParseException {SemgrexPattern child; List children = new ArrayList(); child = ModNode(r); - children.add(child); +children.add(child); label_5: while (true) { - switch ((jj_ntk==-1)?jj_ntk():jj_ntk) { + switch ((jj_ntk==-1)?jj_ntk_f():jj_ntk) { case 16: case 17: case 19: - case 23: + case 23:{ ; break; + } default: jj_la1[18] = jj_gen; break label_5; } - switch ((jj_ntk==-1)?jj_ntk():jj_ntk) { - case 16: + switch ((jj_ntk==-1)?jj_ntk_f():jj_ntk) { + case 16:{ jj_consume_token(16); break; + } default: jj_la1[19] = jj_gen; ; } child = ModNode(r); - children.add(child); +children.add(child); } - if (children.size() == 1) - {if (true) return child;} +if (children.size() == 1) + {if ("" != null) return child;} else - {if (true) return new CoordinationPattern(true, children, true);} + {if ("" != null) return new CoordinationPattern(true, children, true);} throw new Error("Missing return statement in function"); } - final public SemgrexPattern ModNode(GraphRelation r) throws ParseException { - SemgrexPattern child; + final public SemgrexPattern ModNode(GraphRelation r) throws ParseException {SemgrexPattern child; boolean startUnderNeg; - switch ((jj_ntk==-1)?jj_ntk():jj_ntk) { + switch ((jj_ntk==-1)?jj_ntk_f():jj_ntk) { case 19: - case 23: + case 23:{ child = Child(r); break; - case 17: + } + case 17:{ jj_consume_token(17); - startUnderNeg = underNodeNegation; +startUnderNeg = underNodeNegation; underNodeNegation = true; child = Child(r); - underNodeNegation = startUnderNeg; +underNodeNegation = startUnderNeg; break; + } default: jj_la1[20] = jj_gen; jj_consume_token(-1); throw new ParseException(); } - {if (true) return child;} +{if ("" != null) return child;} throw new Error("Missing return statement in function"); } - final public SemgrexPattern Child(GraphRelation r) throws ParseException { - SemgrexPattern child; - switch ((jj_ntk==-1)?jj_ntk():jj_ntk) { - case 19: + final public SemgrexPattern Child(GraphRelation r) 
throws ParseException {SemgrexPattern child; + switch ((jj_ntk==-1)?jj_ntk_f():jj_ntk) { + case 19:{ child = NodeDisj(r); break; - case 23: + } + case 23:{ child = Description(r); break; + } default: jj_la1[21] = jj_gen; jj_consume_token(-1); throw new ParseException(); } - {if (true) return child;} +{if ("" != null) return child;} throw new Error("Missing return statement in function"); } - final public NodePattern Description(GraphRelation r) throws ParseException { - Token name = null; + final public NodePattern Description(GraphRelation r) throws ParseException {Token name = null; boolean link = false; boolean isRoot = false; boolean isEmpty = false; @@ -458,29 +479,32 @@ final public NodePattern Description(GraphRelation r) throws ParseException { Map attributes = Generics.newHashMap(); NodePattern pat; jj_consume_token(23); - switch ((jj_ntk==-1)?jj_ntk():jj_ntk) { - case IDENTIFIER: + switch ((jj_ntk==-1)?jj_ntk_f():jj_ntk) { + case IDENTIFIER:{ attr = jj_consume_token(IDENTIFIER); jj_consume_token(12); - switch ((jj_ntk==-1)?jj_ntk():jj_ntk) { - case IDENTIFIER: + switch ((jj_ntk==-1)?jj_ntk_f():jj_ntk) { + case IDENTIFIER:{ value = jj_consume_token(IDENTIFIER); break; - case REGEX: + } + case REGEX:{ value = jj_consume_token(REGEX); break; + } default: jj_la1[22] = jj_gen; jj_consume_token(-1); throw new ParseException(); } - if (attr != null && value != null) attributes.put(attr.image, value.image); +if (attr != null && value != null) attributes.put(attr.image, value.image); label_6: while (true) { - switch ((jj_ntk==-1)?jj_ntk():jj_ntk) { - case 24: + switch ((jj_ntk==-1)?jj_ntk_f():jj_ntk) { + case 24:{ ; break; + } default: jj_la1[23] = jj_gen; break label_6; @@ -488,46 +512,52 @@ final public NodePattern Description(GraphRelation r) throws ParseException { jj_consume_token(24); attr = jj_consume_token(IDENTIFIER); jj_consume_token(12); - switch ((jj_ntk==-1)?jj_ntk():jj_ntk) { - case IDENTIFIER: + switch ((jj_ntk==-1)?jj_ntk_f():jj_ntk) { + case IDENTIFIER:{ value = jj_consume_token(IDENTIFIER); break; - case REGEX: + } + case REGEX:{ value = jj_consume_token(REGEX); break; + } default: jj_la1[24] = jj_gen; jj_consume_token(-1); throw new ParseException(); } - if (attr != null && value != null) attributes.put(attr.image, value.image); +if (attr != null && value != null) attributes.put(attr.image, value.image); } jj_consume_token(25); break; - case ROOT: + } + case ROOT:{ attr = jj_consume_token(ROOT); jj_consume_token(25); - isRoot = true; +isRoot = true; break; - case EMPTY: + } + case EMPTY:{ attr = jj_consume_token(EMPTY); jj_consume_token(25); - isEmpty = true; +isEmpty = true; break; - case 25: + } + case 25:{ jj_consume_token(25); break; + } default: jj_la1[25] = jj_gen; jj_consume_token(-1); throw new ParseException(); } - switch ((jj_ntk==-1)?jj_ntk():jj_ntk) { - case 22: + switch ((jj_ntk==-1)?jj_ntk_f():jj_ntk) { + case 22:{ jj_consume_token(22); - link = true; +link = true; name = jj_consume_token(IDENTIFIER); - String nodeName = name.image; +String nodeName = name.image; if (underNegation) { if (!knownVariables.contains(nodeName)) { {if (true) throw new ParseException("Cannot add new variable names under negation. Node '" + nodeName + "' not seen before");} @@ -536,13 +566,14 @@ final public NodePattern Description(GraphRelation r) throws ParseException { knownVariables.add(nodeName); } break; + } default: jj_la1[26] = jj_gen; ; } - pat = new NodePattern(r, underNodeNegation, attributes, isRoot, isEmpty, name != null ? 
name.image : null); +pat = new NodePattern(r, underNodeNegation, attributes, isRoot, isEmpty, name != null ? name.image : null); if (link) pat.makeLink(); - {if (true) return pat;} + {if ("" != null) return pat;} throw new Error("Missing return statement in function"); } @@ -664,7 +695,7 @@ final public Token getToken(int index) { return t; } - private int jj_ntk() { + private int jj_ntk_f() { if ((jj_nt=token.next) == null) return (jj_ntk = (token.next=token_source.getNextToken()).kind); else diff --git a/src/edu/stanford/nlp/semgraph/semgrex/SemgrexParser.jj b/src/edu/stanford/nlp/semgraph/semgrex/SemgrexParser.jj index 63a0359d46..e02185102f 100644 --- a/src/edu/stanford/nlp/semgraph/semgrex/SemgrexParser.jj +++ b/src/edu/stanford/nlp/semgraph/semgrex/SemgrexParser.jj @@ -37,9 +37,9 @@ SKIP: TOKEN: { - < RELATION: "<" | ">" | ">>" | "<<" | "==" | "$+" | "$-" | "$++" | "$--" | "." > + < RELATION: "<" | ">" | ">>" | "<<" | "==" > | < ALIGNRELN: "@" > -| < IDENTIFIER: (~[" ", "\n", "\r", "(", "/", "|", "@", "!", "#", "%", "&", ")", "=", "?", "[", "]", ">", "<", "~", ".", ",", "$", ":", ";", "{", "}", "+", "-"])+ > +| < IDENTIFIER: (~[" ", "\n", "\r", "(", "/", "|", "@", "!", "#", "%", "&", ")", "=", "?", "[", "]", ">", "<", "~", ".", ",", "$", ":", ";", "{", "}"])+ > | < NUMBER: ( ["0"-"9"] )+ > | < EMPTY: "#" > | < ROOT: "$" > diff --git a/src/edu/stanford/nlp/semgraph/semgrex/SemgrexParserTokenManager.java b/src/edu/stanford/nlp/semgraph/semgrex/SemgrexParserTokenManager.java index 82c31ff583..70b4d4c2b4 100644 --- a/src/edu/stanford/nlp/semgraph/semgrex/SemgrexParserTokenManager.java +++ b/src/edu/stanford/nlp/semgraph/semgrex/SemgrexParserTokenManager.java @@ -1,29 +1,28 @@ +/* SemgrexParserTokenManager.java */ /* Generated By:JavaCC: Do not edit this line. SemgrexParserTokenManager.java */ package edu.stanford.nlp.semgraph.semgrex; // all generated classes are in this package //imports +import java.io.StringReader; import java.util.*; import edu.stanford.nlp.util.Generics; /** Token Manager. */ -class SemgrexParserTokenManager implements SemgrexParserConstants -{ +@SuppressWarnings("unused")class SemgrexParserTokenManager implements SemgrexParserConstants { /** Debug output. */ public java.io.PrintStream debugStream = System.out; /** Set debug output. 
*/ public void setDebugStream(java.io.PrintStream ds) { debugStream = ds; } -private final int jjStopStringLiteralDfa_0(int pos, long active0) -{ +private final int jjStopStringLiteralDfa_0(int pos, long active0){ switch (pos) { default : return -1; } } -private final int jjStartNfa_0(int pos, long active0) -{ +private final int jjStartNfa_0(int pos, long active0){ return jjMoveNfa_0(jjStopStringLiteralDfa_0(pos, active0), pos + 1); } private int jjStopAtPos(int pos, int kind) @@ -32,8 +31,7 @@ private int jjStopAtPos(int pos, int kind) jjmatchedPos = pos; return pos + 1; } -private int jjMoveStringLiteralDfa0_0() -{ +private int jjMoveStringLiteralDfa0_0(){ switch(curChar) { case 9: @@ -45,7 +43,7 @@ private int jjMoveStringLiteralDfa0_0() case 35: return jjStopAtPos(0, 8); case 36: - return jjStartNfaWithStates_0(0, 9, 17); + return jjStopAtPos(0, 9); case 38: return jjStopAtPos(0, 16); case 40: @@ -95,7 +93,7 @@ private int jjStartNfaWithStates_0(int pos, int kind, int state) private int jjMoveNfa_0(int startState, int curPos) { int startsAt = 0; - jjnewStateCnt = 19; + jjnewStateCnt = 14; int i = 1; jjstateSet[0] = startState; int kind = 0x7fffffff; @@ -111,50 +109,32 @@ private int jjMoveNfa_0(int startState, int curPos) switch(jjstateSet[--i]) { case 0: - if ((0x3ff0484ffffdbffL & l) != 0L) + if ((0x3ff2c84ffffdbffL & l) != 0L) { if (kind > 6) kind = 6; - jjCheckNAdd(7); + { jjCheckNAdd(7); } } - else if ((0x5000400000000000L & l) != 0L) + else if ((0x5000000000000000L & l) != 0L) { if (kind > 4) kind = 4; } - else if (curChar == 36) - jjCheckNAddStates(0, 3); else if (curChar == 47) - jjCheckNAddStates(4, 6); + { jjCheckNAddStates(0, 2); } else if (curChar == 61) jjstateSet[jjnewStateCnt++] = 5; if ((0x3ff000000000000L & l) != 0L) { if (kind > 7) kind = 7; - jjCheckNAdd(8); + { jjCheckNAdd(8); } } else if (curChar == 60) jjstateSet[jjnewStateCnt++] = 3; else if (curChar == 62) jjstateSet[jjnewStateCnt++] = 1; break; - case 17: - if (curChar == 45) - jjCheckNAdd(16); - else if (curChar == 43) - jjCheckNAdd(15); - if (curChar == 45) - { - if (kind > 4) - kind = 4; - } - else if (curChar == 43) - { - if (kind > 4) - kind = 4; - } - break; case 1: if (curChar == 62 && kind > 4) kind = 4; @@ -180,48 +160,32 @@ else if (curChar == 43) jjstateSet[jjnewStateCnt++] = 5; break; case 7: - if ((0x3ff0484ffffdbffL & l) == 0L) + if ((0x3ff2c84ffffdbffL & l) == 0L) break; if (kind > 6) kind = 6; - jjCheckNAdd(7); + { jjCheckNAdd(7); } break; case 8: if ((0x3ff000000000000L & l) == 0L) break; if (kind > 7) kind = 7; - jjCheckNAdd(8); + { jjCheckNAdd(8); } break; case 9: case 10: if (curChar == 47) - jjCheckNAddStates(4, 6); + { jjCheckNAddStates(0, 2); } break; case 12: if ((0xffff7fffffffdbffL & l) != 0L) - jjCheckNAddStates(4, 6); + { jjCheckNAddStates(0, 2); } break; case 13: if (curChar == 47 && kind > 10) kind = 10; break; - case 14: - if (curChar == 36) - jjCheckNAddStates(0, 3); - break; - case 15: - if (curChar == 43 && kind > 4) - kind = 4; - break; - case 16: - if (curChar == 45 && kind > 4) - kind = 4; - break; - case 18: - if (curChar == 45) - jjCheckNAdd(16); - break; default : break; } } while(i != startsAt); @@ -239,14 +203,14 @@ else if (curChar < 128) break; if (kind > 6) kind = 6; - jjCheckNAdd(7); + { jjCheckNAdd(7); } break; case 11: if (curChar == 92) jjstateSet[jjnewStateCnt++] = 10; break; case 12: - jjAddStates(4, 6); + { jjAddStates(0, 2); } break; default : break; } @@ -254,7 +218,7 @@ else if (curChar < 128) } else { - int hiByte = (int)(curChar >> 8); + int hiByte = 
(curChar >> 8); int i1 = hiByte >> 6; long l1 = 1L << (hiByte & 077); int i2 = (curChar & 0xff) >> 6; @@ -269,13 +233,13 @@ else if (curChar < 128) break; if (kind > 6) kind = 6; - jjCheckNAdd(7); + { jjCheckNAdd(7); } break; case 12: if (jjCanMove_0(hiByte, i1, i2, l1, l2)) - jjAddStates(4, 6); + { jjAddStates(0, 2); } break; - default : break; + default : if (i1 == 0 || l1 == 0 || i2 == 0 || l2 == 0) break; else break; } } while(i != startsAt); } @@ -286,14 +250,14 @@ else if (curChar < 128) kind = 0x7fffffff; } ++curPos; - if ((i = jjnewStateCnt) == (startsAt = 19 - (jjnewStateCnt = startsAt))) + if ((i = jjnewStateCnt) == (startsAt = 14 - (jjnewStateCnt = startsAt))) return curPos; try { curChar = input_stream.readChar(); } catch(java.io.IOException e) { return curPos; } } } static final int[] jjnextStates = { - 15, 16, 17, 18, 11, 12, 13, + 11, 12, 13, }; private static final boolean jjCanMove_0(int hiByte, int i1, int i2, long l1, long l2) { @@ -313,66 +277,6 @@ private static final boolean jjCanMove_0(int hiByte, int i1, int i2, long l1, lo "", null, null, null, null, "\100", null, null, "\43", "\44", null, "\12", "\72", "\50", "\51", "\174", "\46", "\41", "\77", "\133", "\135", "\54", "\75", "\173", "\73", "\175", }; - -/** Lexer state names. */ -public static final String[] lexStateNames = { - "DEFAULT", -}; -static final long[] jjtoToken = { - 0x3fffff1L, -}; -static final long[] jjtoSkip = { - 0xeL, -}; -protected SimpleCharStream input_stream; -private final int[] jjrounds = new int[19]; -private final int[] jjstateSet = new int[38]; -protected char curChar; -/** Constructor. */ -public SemgrexParserTokenManager(SimpleCharStream stream){ - if (SimpleCharStream.staticFlag) - throw new Error("ERROR: Cannot use a static CharStream class with a non-static lexical analyzer."); - input_stream = stream; -} - -/** Constructor. */ -public SemgrexParserTokenManager(SimpleCharStream stream, int lexState){ - this(stream); - SwitchTo(lexState); -} - -/** Reinitialise parser. */ -public void ReInit(SimpleCharStream stream) -{ - jjmatchedPos = jjnewStateCnt = 0; - curLexState = defaultLexState; - input_stream = stream; - ReInitRounds(); -} -private void ReInitRounds() -{ - int i; - jjround = 0x80000001; - for (i = 19; i-- > 0;) - jjrounds[i] = 0x80000000; -} - -/** Reinitialise parser. */ -public void ReInit(SimpleCharStream stream, int lexState) -{ - ReInit(stream); - SwitchTo(lexState); -} - -/** Switch to specified lex state. */ -public void SwitchTo(int lexState) -{ - if (lexState >= 1 || lexState < 0) - throw new TokenMgrError("Error: Ignoring invalid lexical state : " + lexState + ". State unchanged.", TokenMgrError.INVALID_LEXICAL_STATE); - else - curLexState = lexState; -} - protected Token jjFillToken() { final Token t; @@ -420,6 +324,7 @@ public Token getNextToken() catch(java.io.IOException e) { jjmatchedKind = 0; + jjmatchedPos = -1; matchedToken = jjFillToken(); return matchedToken; } @@ -496,4 +401,69 @@ private void jjCheckNAddStates(int start, int end) } while (start++ != end); } + /** Constructor. */ + public SemgrexParserTokenManager(SimpleCharStream stream){ + + if (SimpleCharStream.staticFlag) + throw new Error("ERROR: Cannot use a static CharStream class with a non-static lexical analyzer."); + + input_stream = stream; + } + + /** Constructor. */ + public SemgrexParserTokenManager (SimpleCharStream stream, int lexState){ + ReInit(stream); + SwitchTo(lexState); + } + + /** Reinitialise parser. 
*/ + public void ReInit(SimpleCharStream stream) + { + jjmatchedPos = jjnewStateCnt = 0; + curLexState = defaultLexState; + input_stream = stream; + ReInitRounds(); + } + + private void ReInitRounds() + { + int i; + jjround = 0x80000001; + for (i = 14; i-- > 0;) + jjrounds[i] = 0x80000000; + } + + /** Reinitialise parser. */ + public void ReInit(SimpleCharStream stream, int lexState) + { + ReInit(stream); + SwitchTo(lexState); + } + + /** Switch to specified lex state. */ + public void SwitchTo(int lexState) + { + if (lexState >= 1 || lexState < 0) + throw new TokenMgrError("Error: Ignoring invalid lexical state : " + lexState + ". State unchanged.", TokenMgrError.INVALID_LEXICAL_STATE); + else + curLexState = lexState; + } + +/** Lexer state names. */ +public static final String[] lexStateNames = { + "DEFAULT", +}; +static final long[] jjtoToken = { + 0x3fffff1L, +}; +static final long[] jjtoSkip = { + 0xeL, +}; + protected SimpleCharStream input_stream; + + private final int[] jjrounds = new int[14]; + private final int[] jjstateSet = new int[2 * 14]; + + + protected char curChar; } diff --git a/src/edu/stanford/nlp/semgraph/semgrex/SemgrexPattern.java b/src/edu/stanford/nlp/semgraph/semgrex/SemgrexPattern.java index a02298f9e3..23e9e6c357 100644 --- a/src/edu/stanford/nlp/semgraph/semgrex/SemgrexPattern.java +++ b/src/edu/stanford/nlp/semgraph/semgrex/SemgrexPattern.java @@ -50,17 +50,12 @@ * *
    Symbol  Meaning *
    A <reln B A is the dependent of a relation reln with B - *
    A >reln B A is the governor of a relation reln with B + *
    A >reln B A is the governer of a relation reln with B *
    A <<reln B A is the dependent of a relation reln in a chain to B following dep->gov paths - *
    A >>reln B A is the governor of a relation reln in a chain to B following gov->dep paths + *
    A >>reln B A is the governer of a relation reln in a chain to B following gov->dep paths *
    A x,y<<reln B A is the dependent of a relation reln in a chain to B following dep->gov paths between distances of x and y - *
    A x,y>>reln B A is the governor of a relation reln in a chain to B following gov->dep paths between distances of x and y + *
    A x,y>>reln B A is the governer of a relation reln in a chain to B following gov->dep paths between distances of x and y *
    A == B A and B are the same nodes in the same graph - *
    A . B A is immediately precedes B, i.e. A.index() == B.index() - 1 - *
    A $+ B B is a right immediate sibling of A, i.e. A and B have the same parent and A.index() == B.index() - 1 - *
    A $- B B is a right immediate sibling of A, i.e. A and B have the same parent and A.index() == B.index() + 1 - *
    A $++ B B is a right sibling of A, i.e. A and B have the same parent and A.index() < B.index() - *
    A $-- B B is a left sibling of A, i.e. A and B have the same parent and A.index() > B.index() *
    A @ B A is aligned to B *
    *

    @@ -69,8 +64,8 @@ * node in the chain. For example, "{} >nsubj {} >dobj * {}" means "any node that is the governor of both a nsubj and * a dobj relation". If instead what you want is a node that is the - * governor of a nsubj relation with a node that is itself the - * governor of dobj relation, you should write: "{} >nsubj + * governer of a nsubj relation with a node that is itself the + * governer of dobj relation, you should write: "{} >nsubj * ({} >dobj {})".
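The chained-versus-nested distinction described here is easiest to see by running both forms against a small graph. A minimal sketch, assuming an illustrative sentence graph and class name; the compile/matcher/getNode calls follow the usage in SemgrexPatternTest later in this patch:

import edu.stanford.nlp.semgraph.SemanticGraph;
import edu.stanford.nlp.semgraph.semgrex.SemgrexMatcher;
import edu.stanford.nlp.semgraph.semgrex.SemgrexPattern;

public class SemgrexRelationSketch {
  public static void main(String[] args) {
    // "ate" is the governor of both an nsubj and a dobj relation.
    SemanticGraph sg = SemanticGraph.valueOf("[ate/VBD nsubj>John/NNP dobj>pizza/NN]");

    // Chained form: one node that governs both relations.
    SemgrexPattern flat = SemgrexPattern.compile("{}=v >nsubj {} >dobj {}");
    SemgrexMatcher m = flat.matcher(sg);
    while (m.find()) {
      System.out.println("flat form matches at: " + m.getNode("v").word());  // ate
    }

    // Nested form: the nsubj dependent must itself govern a dobj,
    // so this pattern finds nothing in the graph above.
    SemgrexPattern nested = SemgrexPattern.compile("{} >nsubj ({}=n >dobj {})");
    System.out.println("nested form matches: " + nested.matcher(sg).find());  // false
  }
}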

    * * If a relation type is specified for the << relation, the diff --git a/src/edu/stanford/nlp/semgraph/semgrex/SimpleCharStream.java b/src/edu/stanford/nlp/semgraph/semgrex/SimpleCharStream.java index a4edcf187c..f0a0c3eaf2 100644 --- a/src/edu/stanford/nlp/semgraph/semgrex/SimpleCharStream.java +++ b/src/edu/stanford/nlp/semgraph/semgrex/SimpleCharStream.java @@ -1,4 +1,4 @@ -/* Generated By:JavaCC: Do not edit this line. SimpleCharStream.java Version 5.0 */ +/* Generated By:JavaCC: Do not edit this line. SimpleCharStream.java Version 6.0 */ /* JavaCCOptions:STATIC=false,SUPPORT_CLASS_VISIBILITY_PUBLIC=false */ package edu.stanford.nlp.semgraph.semgrex; @@ -31,9 +31,10 @@ class SimpleCharStream protected int maxNextCharInd = 0; protected int inBuf = 0; protected int tabSize = 8; + protected boolean trackLineColumn = false; - protected void setTabSize(int i) { tabSize = i; } - protected int getTabSize(int i) { return tabSize; } + public void setTabSize(int i) { tabSize = i; } + public int getTabSize() { return tabSize; } protected void ExpandBuff(boolean wrapAround) @@ -467,5 +468,7 @@ public void adjustBeginLineColumn(int newLine, int newCol) column = bufcolumn[j]; } + boolean getTrackLineColumn() { return trackLineColumn; } + void setTrackLineColumn(boolean trackLineColumn) { this.trackLineColumn = trackLineColumn; } } -/* JavaCC - OriginalChecksum=3b59eb6b560cd6ccb364c9477f327b87 (do not edit this line) */ +/* JavaCC - OriginalChecksum=4090ca6173a52cbf54b493627a37dc22 (do not edit this line) */ diff --git a/src/edu/stanford/nlp/semgraph/semgrex/Token.java b/src/edu/stanford/nlp/semgraph/semgrex/Token.java index 07118bbf7d..221fc57837 100644 --- a/src/edu/stanford/nlp/semgraph/semgrex/Token.java +++ b/src/edu/stanford/nlp/semgraph/semgrex/Token.java @@ -1,4 +1,4 @@ -/* Generated By:JavaCC: Do not edit this line. Token.java Version 5.0 */ +/* Generated By:JavaCC: Do not edit this line. Token.java Version 6.0 */ /* JavaCCOptions:TOKEN_EXTENDS=,KEEP_LINE_COL=null,SUPPORT_CLASS_VISIBILITY_PUBLIC=false */ package edu.stanford.nlp.semgraph.semgrex; @@ -128,4 +128,4 @@ public static Token newToken(int ofKind) } } -/* JavaCC - OriginalChecksum=5aa055b7d29a7cb18008be851f72d5d0 (do not edit this line) */ +/* JavaCC - OriginalChecksum=46183f8d5bbdfa33f15887575ea3e7b5 (do not edit this line) */ diff --git a/src/edu/stanford/nlp/semgraph/semgrex/TokenMgrError.java b/src/edu/stanford/nlp/semgraph/semgrex/TokenMgrError.java index c832e773e6..74d878cffd 100644 --- a/src/edu/stanford/nlp/semgraph/semgrex/TokenMgrError.java +++ b/src/edu/stanford/nlp/semgraph/semgrex/TokenMgrError.java @@ -1,4 +1,4 @@ -/* Generated By:JavaCC: Do not edit this line. TokenMgrError.java Version 5.0 */ +/* Generated By:JavaCC: Do not edit this line. 
TokenMgrError.java Version 6.0 */ /* JavaCCOptions: */ package edu.stanford.nlp.semgraph.semgrex; @@ -144,4 +144,4 @@ public TokenMgrError(boolean EOFSeen, int lexState, int errorLine, int errorColu this(LexicalError(EOFSeen, lexState, errorLine, errorColumn, errorAfter, curChar), reason); } } -/* JavaCC - OriginalChecksum=dae3bfd0114ba8f347e2201a60a73374 (do not edit this line) */ +/* JavaCC - OriginalChecksum=550e3452e7d6c690ac97c1a3584842b3 (do not edit this line) */ diff --git a/src/edu/stanford/nlp/semgraph/semgrex/ssurgeon/CollapseSubtree.java b/src/edu/stanford/nlp/semgraph/semgrex/ssurgeon/CollapseSubtree.java deleted file mode 100644 index 1ceee2fcc4..0000000000 --- a/src/edu/stanford/nlp/semgraph/semgrex/ssurgeon/CollapseSubtree.java +++ /dev/null @@ -1,96 +0,0 @@ -package edu.stanford.nlp.semgraph.semgrex.ssurgeon; - -import java.io.StringWriter; -import java.util.Collections; -import java.util.List; -import java.util.Set; - -import edu.stanford.nlp.ling.IndexedWord; -import edu.stanford.nlp.semgraph.SemanticGraph; -import edu.stanford.nlp.semgraph.SemanticGraphEdge; -import edu.stanford.nlp.semgraph.semgrex.SemgrexMatcher; -import edu.stanford.nlp.util.Generics; -import edu.stanford.nlp.util.StringUtils; - -/** - * - * Collapses a subtree into a single node. - * The new node has the POS tag and index of the root node - * and the value and the lemma of the concatenation of the subnodes. - * - * One intended use is to collapse multi-word expressions into one node - * to facilitate relation extraction and related tasks. - * - * @author Sebastian Schuster - * - */ - -public class CollapseSubtree extends SsurgeonEdit { - - public static final String LABEL="collapseSubtree"; - protected String rootName; // Name of the root node in match - - - public CollapseSubtree(String rootNodeName) { - this.rootName = rootNodeName; - } - - - @SuppressWarnings({ "unchecked", "rawtypes" }) - @Override - public void evaluate(SemanticGraph sg, SemgrexMatcher sm) { - - IndexedWord rootNode = this.getNamedNode(rootName, sm); - Set subgraphNodeSet = sg.getSubgraphVertices(rootNode); - - - if ( ! sg.isDag(rootNode)) { - /* Check if there is a cycle going back to the root. */ - for (IndexedWord child : sg.getChildren(rootNode)) { - Set reachableSet = sg.getSubgraphVertices(child); - if (reachableSet.contains(rootNode)) { - throw new IllegalArgumentException("Subtree cannot contain cycle leading back to root node!"); - } - } - } - - List sortedSubgraphNodes = Generics.newArrayList(subgraphNodeSet); - Collections.sort(sortedSubgraphNodes); - - IndexedWord newNode = new IndexedWord(rootNode.docID(), rootNode.sentIndex(), rootNode.index()); - /* Copy all attributes from rootNode. */ - for (Class key : newNode.backingLabel().keySet()) { - newNode.set(key, rootNode.get(key)); - } - - newNode.setValue(StringUtils.join(sortedSubgraphNodes.stream().map(IndexedWord::value), " ")); - newNode.setWord(StringUtils.join(sortedSubgraphNodes.stream().map(IndexedWord::word), " ")); - newNode.setLemma(StringUtils.join(sortedSubgraphNodes.stream().map(x -> x.lemma() == null ? 
x.word() : x.lemma()), " ")); - - if (sg.getRoots().contains(rootNode)) { - sg.getRoots().remove(rootNode); - sg.addRoot(rootNode); - } - - for (SemanticGraphEdge edge : sg.incomingEdgeIterable(rootNode)) { - sg.addEdge(edge.getGovernor(), newNode, edge.getRelation(), edge.getWeight(), edge.isExtra()); - } - - for (IndexedWord node : sortedSubgraphNodes) { - sg.removeVertex(node); - } - - } - - @Override - public String toEditString() { - StringWriter buf = new StringWriter(); - buf.write(LABEL); buf.write("\t"); - buf.write(Ssurgeon.NODENAME_ARG);buf.write(" "); - buf.write(rootName); - return buf.toString(); - } - - - -} diff --git a/src/edu/stanford/nlp/semgraph/semgrex/ssurgeon/Ssurgeon.java b/src/edu/stanford/nlp/semgraph/semgrex/ssurgeon/Ssurgeon.java index 900ee4bab6..200b838ac6 100644 --- a/src/edu/stanford/nlp/semgraph/semgrex/ssurgeon/Ssurgeon.java +++ b/src/edu/stanford/nlp/semgraph/semgrex/ssurgeon/Ssurgeon.java @@ -62,15 +62,15 @@ public void initLog(File logFilePath) throws IOException { log.addHandler(fh); log.setLevel(Level.FINE); fh.setFormatter(new NewlineLogFormatter()); - + System.out.println("Starting Ssurgeon log, at "+logFilePath.getAbsolutePath()+" date=" + DateFormat.getDateInstance(DateFormat.FULL).format(new Date())); log.info("Starting Ssurgeon log, date=" + DateFormat.getDateInstance(DateFormat.FULL).format(new Date())); } - + public void setLogPrefix(String logPrefix) { this.logPrefix = logPrefix; } - + /** @@ -89,7 +89,7 @@ public List expandFromPatterns(List patternList System.out.println("\ncompact = "+orderedGraph.toCompactString()); System.out.println("regular="+orderedGraph.toString()); } - + if (generated.size() > 0) { if (log != null) { log.info("* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *"); @@ -112,7 +112,7 @@ public List expandFromPatterns(List patternList } return retList; } - + /** * Similar to the expandFromPatterns, but performs an exhaustive * search, performing simplifications on the graphs until exhausted. @@ -142,7 +142,7 @@ private List exhaustFromPatterns(List patternLi //modGraph.edgeList(true); retList.add(modGraph); } - + if (log != null && generated.size() > 0) { log.info("* * * * * * * * * ** * * * * * * * * *"); log.info("Exhaust from patterns, depth="+depth); @@ -157,7 +157,7 @@ private List exhaustFromPatterns(List patternLi } } } - + if (retList.size() > 0) { List referenceList = new ArrayList(); referenceList.addAll(retList); @@ -168,8 +168,8 @@ private List exhaustFromPatterns(List patternLi } return retList; } - - + + /** * Given a path to a file, converts it into a SsurgeonPattern * TODO: finish implementing this stub. @@ -177,34 +177,34 @@ private List exhaustFromPatterns(List patternLi public static SsurgeonPattern getOperationFromFile(String path) { return null; } - + // // Resource management // private Map wordListResources = Generics.newHashMap(); - + /** - * Places the given word list resource under the given ID. + * Places the given word list resource under the given ID. * Note: can overwrite existing one in place. * */ protected void addResource(SsurgeonWordlist resource) { wordListResources.put(resource.getID(), resource); } - + /** - * Returns the given resource with the id. + * Returns the given resource with the id. * If does not exist, will throw exception. 
*/ public SsurgeonWordlist getResource(String id) { return wordListResources.get(id); } - + public Collection getResources() { return wordListResources.values(); } - - + + public static final String GOV_NODENAME_ARG = "-gov"; public static final String DEP_NODENAME_ARG = "-dep"; public static final String EDGE_NAME_ARG = "-edge"; @@ -213,16 +213,16 @@ public Collection getResources() { public static final String NODE_PROTO_ARG = "-nodearg"; public static final String WEIGHT_ARG = "-weight"; public static final String NAME_ARG = "-name"; - - + + // We use the RA arg extractor to parse args for Ssurgeon edits, allowing us to not // worry about arg order (and to make things appear less confusing) protected static class SsurgeonArgs { // Below are values keyed by Semgrex name public String govNodeName = null; - + public String dep = null; - + public String edge = null; public String reln = null; @@ -260,7 +260,7 @@ public static String[] parseArgs(String argsString) { } return retList.toArray(new String[0]); } - + /** * Given a string entry, converts it into a SsurgeonEdit object. */ @@ -313,8 +313,8 @@ public static SsurgeonEdit parseEditLine(String editLine) { throw new IllegalArgumentException("Parsing Ssurgeon args: unknown flag " + argsArray[argIndex]); } } - - + + // Parse the arguments based upon the type of command to execute. // TODO: this logic really should be moved into the individual classes. The string-->class // mappings should also be stored in more appropriate data structure. @@ -343,13 +343,13 @@ public static SsurgeonEdit parseEditLine(String editLine) { } return retEdit; } - + //public static SsurgeonPattern fromXML(String xmlString) throws Exception { //SAXBuilder builder = new SAXBuilder(); //Document jdomDoc = builder.build(xmlString); //jdomDoc.getRootElement().getChildren(SsurgeonPattern.SSURGEON_ELEM_TAG); //} - + /** * Given a target filepath and a list of Ssurgeon patterns, writes them out as XML forms. 
*/ @@ -367,7 +367,7 @@ public static void writeToFile(File tgtFile, List patterns) { log.throwing(Ssurgeon.class.getName(), "writeToFile", e); } } - + public static String writeToString(SsurgeonPattern pattern) { try { List patterns = new LinkedList(); @@ -389,8 +389,8 @@ public static String writeToString(SsurgeonPattern pattern) { } return ""; } - - + + private static Document createPatternXMLDoc(List patterns) { try { DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance(); @@ -411,7 +411,7 @@ private static Document createPatternXMLDoc(List patterns) { Element notesElem = domDoc.createElement(SsurgeonPattern.NOTES_ELEM_TAG); notesElem.appendChild(domDoc.createTextNode(pattern.getNotes())); patElt.appendChild(notesElem); - + SemanticGraph semgrexGraph = pattern.getSemgrexGraph(); if (semgrexGraph != null) { Element patNode = domDoc.createElement(SsurgeonPattern.SEMGREX_GRAPH_ELEM_TAG); @@ -436,7 +436,7 @@ private static Document createPatternXMLDoc(List patterns) { return null; } } - + /** * Given a path to a file containing a list of SsurgeonPatterns, returns @@ -448,10 +448,10 @@ private static Document createPatternXMLDoc(List patterns) { public List readFromFile(File file) throws Exception { List retList = new ArrayList(); Document doc = DocumentBuilderFactory.newInstance().newDocumentBuilder().parse(file); - + if (VERBOSE) System.out.println("Reading ssurgeon file="+file.getAbsolutePath()); - + NodeList patternNodes = doc.getElementsByTagName(SsurgeonPattern.SSURGEON_ELEM_TAG); for (int i=0; i readFromFile(File file) throws Exception { retList.add(pattern); } } - + NodeList resourceNodes = doc.getElementsByTagName(SsurgeonPattern.RESOURCE_TAG); for (int i=0; i < resourceNodes.getLength(); i++) { Node node = patternNodes.item(i); @@ -471,10 +471,10 @@ public List readFromFile(File file) throws Exception { addResource(wlRsrc); } } - + return retList; } - + /** * Reads all Ssurgeon patterns from file. * @throws Exception @@ -494,7 +494,7 @@ public List readFromDirectory(File dir) throws Exception { } return patterns; } - + /** * Given the root Element for a SemgrexPattern (SSURGEON_ELEM_TAG), converts * it into its corresponding SemgrexPattern object. @@ -565,32 +565,32 @@ public static SsurgPred assemblePredFromXML(Element elt) throws Exception { } return new WordlistTest(id, resourceID, typeStr, matchName); } - + // Not a valid node, error out! throw new Exception("Invalid node encountered during Ssurgeon predicate processing, node name="+eltName); } - - - + + + /** * Reads in the test file and prints readable to string (for debugging). * Input file consists of semantic graphs, in compact form. */ public void testRead(File tgtDirPath) throws Exception { List patterns = readFromDirectory(tgtDirPath); - + System.out.println("Patterns, num="+patterns.size()); int num = 1; for (SsurgeonPattern pattern : patterns) { System.out.println("\n# "+(num++)); System.out.println(pattern); } - + System.out.println("\n\nRESOURCES "); for (SsurgeonWordlist rsrc : inst().getResources()) { System.out.println(rsrc+"* * * * *"); } - + BufferedReader in = new BufferedReader(new InputStreamReader(System.in)); String line; boolean runFlag = true; @@ -618,12 +618,12 @@ public void testRead(File tgtDirPath) throws Exception { } } } - - + + /* * XML convenience routines */ - + /** * For the given element, returns the text for the first child Element with * the given tag. 
@@ -640,7 +640,7 @@ public static String getTagText(Element element, String tag) { } return ""; } - + /** * For a given Element, treats the first child as a text element * and returns its value. @@ -655,7 +655,7 @@ public static String getEltText(Element element) { } return ""; } - + /** * For the given element, finds the first child Element with the given tag. */ @@ -673,7 +673,7 @@ public static Element getFirstTag(Element element, String tag) { } return null; } - + /** * Returns the first child whose node type is Element under the given Element. */ @@ -690,8 +690,8 @@ public static Element getFirstChildElement(Element element) { } return null; } - - + + /** * Returns all of the Element typed children from the given element. Note: disregards * other node types. @@ -711,33 +711,33 @@ public static List getChildElements(Element element) { } return childElements; } - + /* * Main class evocation stuff */ - - + + public static enum RUNTYPE { interactive, // interactively test contents of pattern directory against entered sentences testinfo // test against a given infofile (RTE), generating rewrites for hypotheses } - - + + public static class ArgsBox { public RUNTYPE type = RUNTYPE.interactive; - + public String patternDirStr = null; public File patternDir = null; - + public String info = null; public File infoPath = null; - + public void init() { patternDir = new File(patternDirStr); if (type == RUNTYPE.testinfo) infoPath = new File(info); } - + @Override public String toString() { StringWriter buf = new StringWriter(); @@ -751,9 +751,9 @@ public String toString() { return buf.toString(); } } - + protected static ArgsBox argsBox = new ArgsBox(); - + /** * Performs a simple test and print of a given file */ diff --git a/src/edu/stanford/nlp/semgraph/semgrex/ssurgeon/SsurgeonUtils.java b/src/edu/stanford/nlp/semgraph/semgrex/ssurgeon/SsurgeonUtils.java deleted file mode 100644 index 49c581762f..0000000000 --- a/src/edu/stanford/nlp/semgraph/semgrex/ssurgeon/SsurgeonUtils.java +++ /dev/null @@ -1,9 +0,0 @@ -package edu.stanford.nlp.semgraph.semgrex.ssurgeon; - -import edu.stanford.nlp.ling.IndexedWord; -import edu.stanford.nlp.semgraph.SemanticGraph; - -public class SsurgeonUtils { - - -} diff --git a/src/edu/stanford/nlp/sequences/DocumentReaderAndWriter.java b/src/edu/stanford/nlp/sequences/DocumentReaderAndWriter.java index 53b7330e30..ee09526982 100644 --- a/src/edu/stanford/nlp/sequences/DocumentReaderAndWriter.java +++ b/src/edu/stanford/nlp/sequences/DocumentReaderAndWriter.java @@ -13,7 +13,7 @@ * If you subclass this interface, all of the other * mechanisms necessary for getting your data into a * sequence classifier will be taken care of - * for you. Subclasses MUST have an empty constructor so + * for you. Subclasses MUST have an empty constructor as * they can be instantiated by reflection, and * there is a promise that the init method will * be called immediately after construction. 
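The subclassing contract spelled out above (empty constructor, init called right after construction) is small enough to sketch. A hypothetical reader/writer follows; it assumes the interface also requires getIterator(Reader) via IteratorFromReaderFactory, which this hunk does not show, and the class name and one-document-per-line format are invented for illustration:

import java.io.BufferedReader;
import java.io.IOException;
import java.io.PrintWriter;
import java.io.Reader;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;

import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.sequences.DocumentReaderAndWriter;
import edu.stanford.nlp.sequences.SeqClassifierFlags;

public class LinePerDocReaderAndWriter implements DocumentReaderAndWriter<CoreLabel> {

  private static final long serialVersionUID = 1L;
  private SeqClassifierFlags flags;

  /** Empty constructor, since instances are created by reflection. */
  public LinePerDocReaderAndWriter() { }

  @Override
  public void init(SeqClassifierFlags flags) {
    this.flags = flags;  // promised to be called immediately after construction
  }

  /** Reads each non-empty input line as one document of whitespace-separated tokens. */
  @Override
  public Iterator<List<CoreLabel>> getIterator(Reader r) {
    List<List<CoreLabel>> docs = new ArrayList<>();
    try (BufferedReader br = new BufferedReader(r)) {
      for (String line; (line = br.readLine()) != null; ) {
        List<CoreLabel> doc = new ArrayList<>();
        for (String tok : line.trim().split("\\s+")) {
          if (tok.isEmpty()) continue;
          CoreLabel cl = new CoreLabel();
          cl.setWord(tok);
          doc.add(cl);
        }
        if (!doc.isEmpty()) docs.add(doc);
      }
    } catch (IOException e) {
      throw new RuntimeException(e);
    }
    return docs.iterator();
  }

  /** Writes one "word TAB answer" line per token, then a blank line per document. */
  @Override
  public void printAnswers(List<CoreLabel> doc, PrintWriter out) {
    for (CoreLabel word : doc) {
      out.println(word.word() + "\t" + word.get(CoreAnnotations.AnswerAnnotation.class));
    }
    out.println();
  }
}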
@@ -31,7 +31,7 @@ public interface DocumentReaderAndWriter * * @param flags Flags specifying behavior */ - void init(SeqClassifierFlags flags); + public void init(SeqClassifierFlags flags); /** * This method prints the output of the classifier to a @@ -40,6 +40,6 @@ public interface DocumentReaderAndWriter * @param doc The document which has answers (it has been classified) * @param out Where to send the output */ - void printAnswers(List doc, PrintWriter out); + public void printAnswers(List doc, PrintWriter out); } diff --git a/src/edu/stanford/nlp/time/TimeExpression.java b/src/edu/stanford/nlp/time/TimeExpression.java index 04564d73c6..bc7052dcd8 100644 --- a/src/edu/stanford/nlp/time/TimeExpression.java +++ b/src/edu/stanford/nlp/time/TimeExpression.java @@ -6,7 +6,6 @@ import edu.stanford.nlp.ling.tokensregex.MatchedExpression; import edu.stanford.nlp.ling.tokensregex.types.Expressions; import edu.stanford.nlp.ling.tokensregex.types.Value; -import edu.stanford.nlp.pipeline.CoreMapAggregator; import edu.stanford.nlp.pipeline.CoreMapAttributeAggregator; import edu.stanford.nlp.util.CoreMap; import edu.stanford.nlp.util.ErasureUtils; @@ -90,7 +89,7 @@ public Value apply(CoreMap in) { extractFunc.resultAnnotationField = Collections.singletonList((Class) TimeExpression.Annotation.class); extractFunc.resultNestedAnnotationField = TimeExpression.ChildrenAnnotation.class; extractFunc.resultAnnotationExtractor = TimeExpressionConverter; - extractFunc.tokensAggregator = CoreMapAggregator.DEFAULT_NUMERIC_TOKENS_AGGREGATOR; + extractFunc.tokensAggregators = CoreMapAttributeAggregator.DEFAULT_NUMERIC_TOKENS_AGGREGATORS; return extractFunc; } diff --git a/src/edu/stanford/nlp/time/TimeFormatter.java b/src/edu/stanford/nlp/time/TimeFormatter.java index 3d92827689..01327ad144 100644 --- a/src/edu/stanford/nlp/time/TimeFormatter.java +++ b/src/edu/stanford/nlp/time/TimeFormatter.java @@ -103,16 +103,16 @@ protected void updateExtractRule(SequenceMatchRules.AnnotationExtractRule r, Pattern pattern, Function extractor) { - MatchedExpression.SingleAnnotationExtractor annotationExtractor = SequenceMatchRules.createAnnotationExtractor(env,r); - annotationExtractor.valueExtractor = + MatchedExpression.SingleAnnotationExtractor valueExtractor = SequenceMatchRules.createAnnotationExtractor(env,r); + valueExtractor.valueExtractor = new SequenceMatchRules.CoreMapFunctionApplier< String, Value>( env, r.annotationField, extractor); r.extractRule = new SequenceMatchRules.CoreMapExtractRule< String, MatchedExpression >( env, r.annotationField, new SequenceMatchRules.StringPatternExtractRule(pattern, - new SequenceMatchRules.StringMatchedExpressionExtractor( annotationExtractor, r.matchedExpressionGroup))); - r.filterRule = new SequenceMatchRules.AnnotationMatchedFilter(annotationExtractor); + new SequenceMatchRules.StringMatchedExpressionExtractor( valueExtractor, r.matchedExpressionGroup))); + r.filterRule = new SequenceMatchRules.AnnotationMatchedFilter(valueExtractor); r.pattern = pattern; } @@ -120,12 +120,12 @@ protected void updateExtractRule(SequenceMatchRules.AnnotationExtractRule r, Env env, Function extractor) { - MatchedExpression.SingleAnnotationExtractor annotationExtractor = SequenceMatchRules.createAnnotationExtractor(env,r); - annotationExtractor.valueExtractor = extractor; + MatchedExpression.SingleAnnotationExtractor valueExtractor = SequenceMatchRules.createAnnotationExtractor(env,r); + valueExtractor.valueExtractor = extractor; r.extractRule = new SequenceMatchRules.CoreMapExtractRule, 
MatchedExpression >( env, r.annotationField, - new SequenceMatchRules.BasicSequenceExtractRule(annotationExtractor)); - r.filterRule = new SequenceMatchRules.AnnotationMatchedFilter(annotationExtractor); + new SequenceMatchRules.BasicSequenceExtractRule(valueExtractor)); + r.filterRule = new SequenceMatchRules.AnnotationMatchedFilter(valueExtractor); } @Override diff --git a/src/edu/stanford/nlp/trees/UniversalEnglishGrammaticalRelations.java b/src/edu/stanford/nlp/trees/UniversalEnglishGrammaticalRelations.java index 2cdc5af5c1..47c8f0a645 100644 --- a/src/edu/stanford/nlp/trees/UniversalEnglishGrammaticalRelations.java +++ b/src/edu/stanford/nlp/trees/UniversalEnglishGrammaticalRelations.java @@ -1549,7 +1549,7 @@ public static Lock valuesLock() { * access. * @param relation the relation to be added to the values list */ - public static void threadSafeAddRelation(GrammaticalRelation relation) { + private static void threadSafeAddRelation(GrammaticalRelation relation) { valuesLock.writeLock().lock(); try { // try-finally structure taken from Javadoc code sample for ReentrantReadWriteLock synchronizedValues.add(relation); diff --git a/src/edu/stanford/nlp/trees/international/pennchinese/SunJurafskyChineseHeadFinder.java b/src/edu/stanford/nlp/trees/international/pennchinese/SunJurafskyChineseHeadFinder.java index d8118a1dc8..fb9819db82 100644 --- a/src/edu/stanford/nlp/trees/international/pennchinese/SunJurafskyChineseHeadFinder.java +++ b/src/edu/stanford/nlp/trees/international/pennchinese/SunJurafskyChineseHeadFinder.java @@ -5,13 +5,16 @@ import edu.stanford.nlp.util.Generics; /** - * A HeadFinder for Chinese based on rules described in Sun/Jurafsky NAACL 2004. + * A headfinder for Chinese based on rules described in Sun/Jurafsky NAACL '04. * * @author Galen Andrew * @version Jul 12, 2004 */ public class SunJurafskyChineseHeadFinder extends AbstractCollinsHeadFinder { + /** + * + */ private static final long serialVersionUID = -7942375587642755210L; public SunJurafskyChineseHeadFinder() { @@ -66,32 +69,4 @@ public SunJurafskyChineseHeadFinder(TreebankLanguagePack tlp) { nonTerminalInfo.put("VE", new String[][]{{"left"}}); } - /* Yue Zhang and Stephen Clark 2008 based their rules on Sun/Jurafsky but changed a few things. 
- Constituent Rules - ADJP r ADJP JJ AD; r - ADVP r ADVP AD CS JJ NP PP P VA VV; r - CLP r CLP M NN NP; r - CP r CP IP VP; r - DNP r DEG DNP DEC QP; r - DP r M; l DP DT OD; l - DVP r DEV AD VP; r - FRAG r VV NR NN NT; r - IP r VP IP NP; r - LCP r LCP LC; r - LST r CD NP QP; r - NP r NP NN IP NR NT; r - NN r NP NN IP NR NT; r - PP l P PP; l - PRN l PU; l - QP r QP CLP CD; r - UCP l IP NP VP; l - VCD l VV VA VE; l - VP l VE VC VV VNV VPT VRD VSB - VCD VP; l - VPT l VA VV; l - VRD l VVI VA; l - VSB r VV VE; r - default r - */ - } diff --git a/src/edu/stanford/nlp/util/HasInterval.java b/src/edu/stanford/nlp/util/HasInterval.java index c2e63d5fe0..722aba5fe4 100644 --- a/src/edu/stanford/nlp/util/HasInterval.java +++ b/src/edu/stanford/nlp/util/HasInterval.java @@ -14,7 +14,7 @@ public interface HasInterval> { */ public Interval getInterval(); - public final static Comparator> LENGTH_GT_COMPARATOR = + public final static Comparator> LENGTH_COMPARATOR = (e1, e2) -> { int len1 = e1.getInterval().getEnd() - e1.getInterval().getBegin(); int len2 = e2.getInterval().getEnd() - e2.getInterval().getBegin(); @@ -25,17 +25,6 @@ public interface HasInterval> { } }; - public final static Comparator> LENGTH_LT_COMPARATOR = - (e1, e2) -> { - int len1 = e1.getInterval().getEnd() - e1.getInterval().getBegin(); - int len2 = e2.getInterval().getEnd() - e2.getInterval().getBegin(); - if (len1 == len2) { - return 0; - } else { - return (len1 < len2)? -1:1; - } - }; - public final static Comparator ENDPOINTS_COMPARATOR = (e1, e2) -> (e1.getInterval().compareTo(e2.getInterval())); @@ -64,6 +53,6 @@ public interface HasInterval> { }; public final static Comparator> LENGTH_ENDPOINTS_COMPARATOR = - Comparators.chain(HasInterval.LENGTH_GT_COMPARATOR, HasInterval.ENDPOINTS_COMPARATOR); + Comparators.chain(HasInterval.LENGTH_COMPARATOR, HasInterval.ENDPOINTS_COMPARATOR); } diff --git a/test/src/edu/stanford/nlp/ling/tokensregex/SequencePatternTriggerTest.java b/test/src/edu/stanford/nlp/ling/tokensregex/SequencePatternTriggerTest.java deleted file mode 100644 index 7c2f78d199..0000000000 --- a/test/src/edu/stanford/nlp/ling/tokensregex/SequencePatternTriggerTest.java +++ /dev/null @@ -1,180 +0,0 @@ -package edu.stanford.nlp.ling.tokensregex; - -import edu.stanford.nlp.ling.Sentence; -import edu.stanford.nlp.util.CoreMap; -import junit.framework.TestCase; - -import java.util.ArrayList; -import java.util.Collection; -import java.util.List; - -/** - * Tests triggering of sequence patterns - * - * @author Angel Chang - */ -public class SequencePatternTriggerTest extends TestCase { - - public void testSimpleTrigger() throws Exception { - List patterns = new ArrayList(); - patterns.add(TokenSequencePattern.compile("which word should be matched")); - - MultiPatternMatcher.SequencePatternTrigger trigger = - new MultiPatternMatcher.BasicSequencePatternTrigger( - new CoreMapNodePatternTrigger(patterns)); - - Collection> triggered = trigger.apply(Sentence.toCoreLabelList("one", "two", "three")); - assertEquals(0, triggered.size()); - - triggered = trigger.apply(Sentence.toCoreLabelList("which")); - assertEquals(0, triggered.size()); - - triggered = trigger.apply(Sentence.toCoreLabelList("which", "word", "should", "be", "matched")); - assertEquals(1, triggered.size()); - } - - public void testOptionalTrigger() throws Exception { - List patterns = new ArrayList(); - patterns.add(TokenSequencePattern.compile("which word should? 
be matched")); - - MultiPatternMatcher.SequencePatternTrigger trigger = - new MultiPatternMatcher.BasicSequencePatternTrigger( - new CoreMapNodePatternTrigger(patterns)); - - Collection> triggered = trigger.apply(Sentence.toCoreLabelList("one", "two", "three")); - assertEquals(0, triggered.size()); - - triggered = trigger.apply(Sentence.toCoreLabelList("which")); - assertEquals(0, triggered.size()); - - triggered = trigger.apply(Sentence.toCoreLabelList("matched")); - assertEquals(1, triggered.size()); - - triggered = trigger.apply(Sentence.toCoreLabelList("should")); - assertEquals(0, triggered.size()); - - triggered = trigger.apply(Sentence.toCoreLabelList("which", "word", "be")); - assertEquals(0, triggered.size()); - - triggered = trigger.apply(Sentence.toCoreLabelList("which", "word", "be", "matched")); - assertEquals(1, triggered.size()); - - triggered = trigger.apply(Sentence.toCoreLabelList("which", "word", "should", "be", "matched")); - assertEquals(1, triggered.size()); - } - - public void testOptionalTrigger2() throws Exception { - List patterns = new ArrayList(); - patterns.add(TokenSequencePattern.compile("which word should? be matched?")); - - MultiPatternMatcher.SequencePatternTrigger trigger = - new MultiPatternMatcher.BasicSequencePatternTrigger( - new CoreMapNodePatternTrigger(patterns)); - - Collection> triggered = trigger.apply(Sentence.toCoreLabelList("one", "two", "three")); - assertEquals(0, triggered.size()); - - triggered = trigger.apply(Sentence.toCoreLabelList("which")); - assertEquals(1, triggered.size()); - - triggered = trigger.apply(Sentence.toCoreLabelList("matched")); - assertEquals(0, triggered.size()); - - triggered = trigger.apply(Sentence.toCoreLabelList("should")); - assertEquals(0, triggered.size()); - - triggered = trigger.apply(Sentence.toCoreLabelList("which", "word", "be")); - assertEquals(1, triggered.size()); - - triggered = trigger.apply(Sentence.toCoreLabelList("which", "word", "should", "be", "matched")); - assertEquals(1, triggered.size()); - } - - public void testOptionalTrigger3() throws Exception { - List patterns = new ArrayList(); - patterns.add(TokenSequencePattern.compile("which word ( should | would ) be matched?")); - - MultiPatternMatcher.SequencePatternTrigger trigger = - new MultiPatternMatcher.BasicSequencePatternTrigger( - new CoreMapNodePatternTrigger(patterns)); - - Collection> triggered = trigger.apply(Sentence.toCoreLabelList("one", "two", "three")); - assertEquals(0, triggered.size()); - - triggered = trigger.apply(Sentence.toCoreLabelList("which")); - assertEquals(1, triggered.size()); - - triggered = trigger.apply(Sentence.toCoreLabelList("matched")); - assertEquals(0, triggered.size()); - - triggered = trigger.apply(Sentence.toCoreLabelList("should")); - assertEquals(0, triggered.size()); - - triggered = trigger.apply(Sentence.toCoreLabelList("which", "word", "be")); - assertEquals(1, triggered.size()); - - triggered = trigger.apply(Sentence.toCoreLabelList("which", "word", "should", "be", "matched")); - assertEquals(1, triggered.size()); - } - - public void testOptionalTrigger4() throws Exception { - List patterns = new ArrayList(); - patterns.add(TokenSequencePattern.compile("which word should? 
be matched{1,2}")); - - MultiPatternMatcher.SequencePatternTrigger trigger = - new MultiPatternMatcher.BasicSequencePatternTrigger( - new CoreMapNodePatternTrigger(patterns)); - - Collection> triggered = trigger.apply(Sentence.toCoreLabelList("one", "two", "three")); - assertEquals(0, triggered.size()); - - triggered = trigger.apply(Sentence.toCoreLabelList("which")); - assertEquals(0, triggered.size()); - - triggered = trigger.apply(Sentence.toCoreLabelList("matched")); - assertEquals(1, triggered.size()); - - triggered = trigger.apply(Sentence.toCoreLabelList("should")); - assertEquals(0, triggered.size()); - - triggered = trigger.apply(Sentence.toCoreLabelList("which", "word", "be")); - assertEquals(0, triggered.size()); - - triggered = trigger.apply(Sentence.toCoreLabelList("which", "word", "be", "matched")); - assertEquals(1, triggered.size()); - - triggered = trigger.apply(Sentence.toCoreLabelList("which", "word", "should", "be", "matched")); - assertEquals(1, triggered.size()); - } - - public void testOptionalTrigger5() throws Exception { - List patterns = new ArrayList(); - patterns.add(TokenSequencePattern.compile("which word should? be matched{1,8}")); - - MultiPatternMatcher.SequencePatternTrigger trigger = - new MultiPatternMatcher.BasicSequencePatternTrigger( - new CoreMapNodePatternTrigger(patterns)); - - Collection> triggered = trigger.apply(Sentence.toCoreLabelList("one", "two", "three")); - assertEquals(0, triggered.size()); - - triggered = trigger.apply(Sentence.toCoreLabelList("which")); - assertEquals(1, triggered.size()); - - triggered = trigger.apply(Sentence.toCoreLabelList("matched")); - assertEquals(0, triggered.size()); - - triggered = trigger.apply(Sentence.toCoreLabelList("should")); - assertEquals(0, triggered.size()); - - triggered = trigger.apply(Sentence.toCoreLabelList("which", "word", "be")); - assertEquals(1, triggered.size()); - - triggered = trigger.apply(Sentence.toCoreLabelList("which", "word", "be", "matched")); - assertEquals(1, triggered.size()); - - triggered = trigger.apply(Sentence.toCoreLabelList("which", "word", "should", "be", "matched")); - assertEquals(1, triggered.size()); - } - -} diff --git a/itest/src/edu/stanford/nlp/loglinear/CoNLLBenchmark.java b/test/src/edu/stanford/nlp/loglinear/learning/CoNLLBenchmark.java similarity index 98% rename from itest/src/edu/stanford/nlp/loglinear/CoNLLBenchmark.java rename to test/src/edu/stanford/nlp/loglinear/learning/CoNLLBenchmark.java index 8e80d7ec81..aab497183e 100644 --- a/itest/src/edu/stanford/nlp/loglinear/CoNLLBenchmark.java +++ b/test/src/edu/stanford/nlp/loglinear/learning/CoNLLBenchmark.java @@ -1,9 +1,6 @@ -package edu.stanford.nlp.loglinear; +package edu.stanford.nlp.loglinear.learning; import edu.stanford.nlp.loglinear.inference.CliqueTree; -import edu.stanford.nlp.loglinear.learning.AbstractBatchOptimizer; -import edu.stanford.nlp.loglinear.learning.BacktrackingAdaGradOptimizer; -import edu.stanford.nlp.loglinear.learning.LogLikelihoodFunction; import edu.stanford.nlp.loglinear.model.ConcatVector; import edu.stanford.nlp.loglinear.model.GraphicalModel; import edu.stanford.nlp.util.HashIndex; diff --git a/test/src/edu/stanford/nlp/pipeline/JSONOutputterTest.java b/test/src/edu/stanford/nlp/pipeline/JSONOutputterTest.java index 5ddb239e2c..15e699d895 100644 --- a/test/src/edu/stanford/nlp/pipeline/JSONOutputterTest.java +++ b/test/src/edu/stanford/nlp/pipeline/JSONOutputterTest.java @@ -98,33 +98,25 @@ public void testSimpleDocument() throws IOException { "\t\t\t\t\t\"index\": 1,\n" + 
"\t\t\t\t\t\"word\": \"JSON\",\n" + "\t\t\t\t\t\"characterOffsetBegin\": 0,\n" + - "\t\t\t\t\t\"characterOffsetEnd\": 4,\n" + - "\t\t\t\t\t\"before\": \"\",\n" + - "\t\t\t\t\t\"after\": \" \"\n" + + "\t\t\t\t\t\"characterOffsetEnd\": 4\n" + "\t\t\t\t},\n" + "\t\t\t\t{\n" + "\t\t\t\t\t\"index\": 2,\n" + "\t\t\t\t\t\"word\": \"is\",\n" + "\t\t\t\t\t\"characterOffsetBegin\": 5,\n" + - "\t\t\t\t\t\"characterOffsetEnd\": 7,\n" + - "\t\t\t\t\t\"before\": \" \",\n" + - "\t\t\t\t\t\"after\": \" \"\n" + + "\t\t\t\t\t\"characterOffsetEnd\": 7\n" + "\t\t\t\t},\n" + "\t\t\t\t{\n" + "\t\t\t\t\t\"index\": 3,\n" + "\t\t\t\t\t\"word\": \"neat\",\n" + "\t\t\t\t\t\"characterOffsetBegin\": 8,\n" + - "\t\t\t\t\t\"characterOffsetEnd\": 12,\n" + - "\t\t\t\t\t\"before\": \" \",\n" + - "\t\t\t\t\t\"after\": \"\"\n" + + "\t\t\t\t\t\"characterOffsetEnd\": 12\n" + "\t\t\t\t},\n" + "\t\t\t\t{\n" + "\t\t\t\t\t\"index\": 4,\n" + "\t\t\t\t\t\"word\": \".\",\n" + "\t\t\t\t\t\"characterOffsetBegin\": 12,\n" + - "\t\t\t\t\t\"characterOffsetEnd\": 13,\n" + - "\t\t\t\t\t\"before\": \"\",\n" + - "\t\t\t\t\t\"after\": \" \"\n" + + "\t\t\t\t\t\"characterOffsetEnd\": 13\n" + "\t\t\t\t}\n" + "\t\t\t]\n" + "\t\t},\n" + @@ -136,33 +128,25 @@ public void testSimpleDocument() throws IOException { "\t\t\t\t\t\"index\": 1,\n" + "\t\t\t\t\t\"word\": \"Better\",\n" + "\t\t\t\t\t\"characterOffsetBegin\": 14,\n" + - "\t\t\t\t\t\"characterOffsetEnd\": 20,\n" + - "\t\t\t\t\t\"before\": \" \",\n" + - "\t\t\t\t\t\"after\": \" \"\n" + + "\t\t\t\t\t\"characterOffsetEnd\": 20\n" + "\t\t\t\t},\n" + "\t\t\t\t{\n" + "\t\t\t\t\t\"index\": 2,\n" + "\t\t\t\t\t\"word\": \"than\",\n" + "\t\t\t\t\t\"characterOffsetBegin\": 21,\n" + - "\t\t\t\t\t\"characterOffsetEnd\": 25,\n" + - "\t\t\t\t\t\"before\": \" \",\n" + - "\t\t\t\t\t\"after\": \" \"\n" + + "\t\t\t\t\t\"characterOffsetEnd\": 25\n" + "\t\t\t\t},\n" + "\t\t\t\t{\n" + "\t\t\t\t\t\"index\": 3,\n" + "\t\t\t\t\t\"word\": \"XML\",\n" + "\t\t\t\t\t\"characterOffsetBegin\": 26,\n" + - "\t\t\t\t\t\"characterOffsetEnd\": 29,\n" + - "\t\t\t\t\t\"before\": \" \",\n" + - "\t\t\t\t\t\"after\": \"\"\n" + + "\t\t\t\t\t\"characterOffsetEnd\": 29\n" + "\t\t\t\t},\n" + "\t\t\t\t{\n" + "\t\t\t\t\t\"index\": 4,\n" + "\t\t\t\t\t\"word\": \".\",\n" + "\t\t\t\t\t\"characterOffsetBegin\": 29,\n" + - "\t\t\t\t\t\"characterOffsetEnd\": 30,\n" + - "\t\t\t\t\t\"before\": \"\",\n" + - "\t\t\t\t\t\"after\": \"\"\n" + + "\t\t\t\t\t\"characterOffsetEnd\": 30\n" + "\t\t\t\t}\n" + "\t\t\t]\n" + "\t\t}\n" + diff --git a/test/src/edu/stanford/nlp/semgraph/semgrex/SemgrexPatternTest.java b/test/src/edu/stanford/nlp/semgraph/semgrex/SemgrexPatternTest.java index 5e802f5eac..f874b001d6 100644 --- a/test/src/edu/stanford/nlp/semgraph/semgrex/SemgrexPatternTest.java +++ b/test/src/edu/stanford/nlp/semgraph/semgrex/SemgrexPatternTest.java @@ -16,10 +16,7 @@ import edu.stanford.nlp.semgraph.SemanticGraph; import edu.stanford.nlp.semgraph.SemanticGraphFactory; -/** - * @author Chloe Kiddon - * @author Sonal Gupta - */ + public class SemgrexPatternTest extends TestCase { /* @@ -43,9 +40,9 @@ public void testPrettyPrint() { } public void testFind() throws Exception { - SemanticGraph h = SemanticGraph.valueOf("[married/VBN nsubjpass>Hughes/NNP auxpass>was/VBD prep_to>Gracia/NNP]"); + SemanticGraph h = SemanticGraph.valueOf("[married/VBN nsubjpass:Hughes/NNP auxpass:was/VBD prep_to:Gracia/NNP]"); SemanticGraph t = SemanticGraph - .valueOf("[loved/VBD\nnsubj>Hughes/NNP\ndobj>[wife/NN poss>his/PRP$ 
appos>Gracia/NNP]\nconj_and>[obsessed/JJ\ncop>was/VBD\nadvmod>absolutely/RB\nprep_with>[Elicia/NN poss>his/PRP$ amod>little/JJ compound>daughter/NN]]]"); + .valueOf("[loved/VBD\nnsubj:Hughes/NNP\ndobj:[wife/NN poss:his/PRP$ appos:Gracia/NNP]\nconj_and:[obsessed/JJ\ncop:was/VBD\nadvmod:absolutely/RB\nprep_with:[Elicia/NN poss:his/PRP$ amod:little/JJ nn:daughter/NN]]]"); String s = "(ROOT\n(S\n(NP (DT The) (NN chimney) (NNS sweeps))\n(VP (VBP do) (RB not)\n(VP (VB like)\n(S\n(VP (VBG working)\n(PP (IN on)\n(NP (DT an) (JJ empty) (NN stomach)))))))\n(. .)))"; Tree tree = Tree.valueOf(s); @@ -123,8 +120,9 @@ public void testFind() throws Exception { public void testMacro() throws IOException { SemanticGraph h = SemanticGraph.valueOf("[married/VBN nsubjpass>Hughes/NNP auxpass>was/VBD nmod:to>Gracia/NNP]"); String macro = "macro WORD = married"; + SemgrexBatchParser parser = new SemgrexBatchParser(); String pattern = "({word:${WORD}}=parent >>nsubjpass {}=node)"; - List pats = SemgrexBatchParser.compileStream(new ByteArrayInputStream((macro + "\n" + pattern).getBytes(StandardCharsets.UTF_8))); + List pats = parser.compileStream(new ByteArrayInputStream((macro + "\n" + pattern).getBytes(StandardCharsets.UTF_8))); SemgrexPattern pat3 = pats.get(0); boolean ignoreCase = true; SemgrexMatcher mat3 = pat3.matcher(h, ignoreCase); @@ -144,10 +142,11 @@ public void testEnv() throws IOException { //SemanticGraph t = SemanticGraph // .valueOf("[loved/VBD\nnsubj:Hughes/NNP\ndobj:[wife/NN poss:his/PRP$ appos:Gracia/NNP]\nconj_and:[obsessed/JJ\ncop:was/VBD\nadvmod:absolutely/RB\nprep_with:[Elicia/NN poss:his/PRP$ amod:little/JJ nn:daughter/NN]]]"); String macro = "macro WORD = married"; + SemgrexBatchParser parser = new SemgrexBatchParser(); Env env = new Env(); env.bind("pattern1",PatternsAnnotations.PatternLabel1.class); String pattern = "({pattern1:YES}=parent >>nsubjpass {}=node)"; - List pats = SemgrexBatchParser.compileStream(new ByteArrayInputStream((macro + "\n" + pattern).getBytes(StandardCharsets.UTF_8)), env); + List pats = parser.compileStream(new ByteArrayInputStream((macro + "\n" + pattern).getBytes(StandardCharsets.UTF_8)), env); SemgrexPattern pat3 = pats.get(0); boolean ignoreCase = true; SemgrexMatcher mat3 = pat3.matcher(h, ignoreCase); @@ -171,96 +170,4 @@ public void testSerialization() throws IOException, ClassNotFoundException { Assert.assertEquals(pat3, pat4); } - public void testSiblingPatterns() { - SemanticGraph sg = SemanticGraph.valueOf("[loved/VBD-2\nnsubj>Hughes/NNP-1\ndobj>[wife/NN-4 nmod:poss>his/PRP$-3 appos>Gracia/NNP-5]\nconj:and>[obsessed/JJ-9\ncop>was/VBD-7\nadvmod>absolutely/RB-8\nnmod:with>[Elicia/NN-14 nmod:poss>his/PRP$-11 amod>little/JJ-12 compound>daughter/NN-13]]]"); - - /* Test "." */ - - SemgrexPattern pat1 = SemgrexPattern.compile("{tag:NNP}=w1 . 
{tag:VBD}=w2"); - SemgrexMatcher matcher = pat1.matcher(sg); - if (matcher.find()) { - String w1 = matcher.getNode("w1").word(); - String w2 = matcher.getNode("w2").word(); - Assert.assertEquals("Hughes", w1); - Assert.assertEquals("loved", w2); - } else { - throw new RuntimeException("failed!"); - } - - /* Test "$+" */ - - SemgrexPattern pat2 = SemgrexPattern.compile("{word:was}=w1 $+ {}=w2"); - matcher = pat2.matcher(sg); - if (matcher.find()) { - String w1 = matcher.getNode("w1").word(); - String w2 = matcher.getNode("w2").word(); - Assert.assertEquals("was", w1); - Assert.assertEquals("absolutely", w2); - } else { - throw new RuntimeException("failed!"); - } - - /* Test "$-" */ - SemgrexPattern pat3 = SemgrexPattern.compile("{word:absolutely}=w1 $- {}=w2"); - matcher = pat3.matcher(sg); - if (matcher.find()) { - String w1 = matcher.getNode("w1").word(); - String w2 = matcher.getNode("w2").word(); - Assert.assertEquals("absolutely", w1); - Assert.assertEquals("was", w2); - } else { - throw new RuntimeException("failed!"); - } - - /* Test "$++" */ - SemgrexPattern pat4 = SemgrexPattern.compile("{word:his}=w1 $++ {tag:NN}=w2"); - matcher = pat4.matcher(sg); - if (matcher.find()) { - String w1 = matcher.getNode("w1").word(); - String w2 = matcher.getNode("w2").word(); - Assert.assertEquals("his", w1); - Assert.assertEquals("daughter", w2); - } else { - throw new RuntimeException("failed!"); - } - - - - /* Test "$--" */ - SemgrexPattern pat6 = SemgrexPattern.compile("{word:daughter}=w1 $-- {tag:/PRP./}=w2"); - matcher = pat6.matcher(sg); - if (matcher.find()) { - String w1 = matcher.getNode("w1").word(); - String w2 = matcher.getNode("w2").word(); - Assert.assertEquals("daughter", w1); - Assert.assertEquals("his", w2); - } else { - throw new RuntimeException("failed!"); - } - - /* Test for not matching. */ - SemgrexPattern pat5 = SemgrexPattern.compile("{word:his}=w1 $-- {}=w2"); - matcher = pat5.matcher(sg); - if (matcher.find()) { - throw new RuntimeException("failed!"); - } - - /* Test for negation. */ - SemgrexPattern pat7 = SemgrexPattern.compile("{word:his}=w1 !$-- {}"); - matcher = pat7.matcher(sg); - if (matcher.find()) { - String w1 = matcher.getNode("w1").word(); - Assert.assertEquals("his", w1); - } else { - throw new RuntimeException("failed!"); - } - - SemgrexPattern pat8 = SemgrexPattern.compile("{word:his}=w1 !$++ {}"); - matcher = pat8.matcher(sg); - if (matcher.find()) { - throw new RuntimeException("failed!"); - } - - } - }
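For a runnable picture of the macro facility exercised by testMacro above, a minimal sketch; the class name is invented, while the graph, the instance-style compileStream call, and the matcher/getNode usage mirror the test code:

import java.io.ByteArrayInputStream;
import java.nio.charset.StandardCharsets;
import java.util.List;

import edu.stanford.nlp.semgraph.SemanticGraph;
import edu.stanford.nlp.semgraph.semgrex.SemgrexBatchParser;
import edu.stanford.nlp.semgraph.semgrex.SemgrexMatcher;
import edu.stanford.nlp.semgraph.semgrex.SemgrexPattern;

public class SemgrexMacroSketch {
  public static void main(String[] args) throws Exception {
    SemanticGraph sg = SemanticGraph.valueOf(
        "[married/VBN nsubjpass>Hughes/NNP auxpass>was/VBD nmod:to>Gracia/NNP]");

    // A macro definition followed by a pattern that expands it, as in testMacro.
    String rules = "macro WORD = married\n"
                 + "({word:${WORD}}=parent >>nsubjpass {}=node)";

    SemgrexBatchParser parser = new SemgrexBatchParser();
    List<SemgrexPattern> patterns = parser.compileStream(
        new ByteArrayInputStream(rules.getBytes(StandardCharsets.UTF_8)));

    boolean ignoreCase = true;
    SemgrexMatcher matcher = patterns.get(0).matcher(sg, ignoreCase);
    while (matcher.find()) {
      // Expected: married -> Hughes
      System.out.println(matcher.getNode("parent").word() + " -> " + matcher.getNode("node").word());
    }
  }
}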