
Merge branch 'java8' into kbp
gangeli authored and Stanford NLP committed Mar 12, 2015
1 parent a773050 commit 703ce23
Showing 45 changed files with 950 additions and 301 deletions.
26 changes: 25 additions & 1 deletion build.xml
@@ -389,5 +389,29 @@
     </war>
   </target>

-</project>
+  <property environment="env" />
+
+  <condition property="version1.7">
+    <equals arg1="${ant.java.version}" arg2="1.7" />
+  </condition>
+
+  <target name="tregex-osx" if="version1.7" depends="jar"
+          description="Build an OS X app for TregexGUI">
+    <fail unless="env.JAVA_HOME"
+          message="Environment variable JAVA_HOME not set." />
+
+    <taskdef name="bundleapp"
+             classname="com.oracle.appbundler.AppBundlerTask"
+             classpath="lib/appbundler-1.0.jar" />
+
+    <bundleapp outputdirectory="."
+               name="Tregex GUI"
+               displayname="Tregex GUI"
+               identifier="edu.stanford.nlp.trees.tregex.gui.TregexGUI"
+               mainclassname="edu.stanford.nlp.trees.tregex.gui.TregexGUI">
+      <runtime dir="${env.JAVA_HOME}" />
+      <classpath file="javanlp-core.jar" />
+    </bundleapp>
+  </target>
+
+</project>
20 changes: 20 additions & 0 deletions itest/src/edu/stanford/nlp/pipeline/PosParserTagCompatibilityITest.java
@@ -0,0 +1,20 @@
package edu.stanford.nlp.pipeline;

import junit.framework.TestCase;

import edu.stanford.nlp.parser.lexparser.LexicalizedParser;
import edu.stanford.nlp.tagger.maxent.MaxentTagger;

/**
* @author Christopher Manning
*/
public class PosParserTagCompatibilityITest extends TestCase {

public void testEnglishTagSet() {
LexicalizedParser lp = LexicalizedParser.loadModel("edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz");
MaxentTagger tagger = new MaxentTagger("edu/stanford/nlp/models/pos-tagger/english-left3words/english-left3words-distsim.tagger");
assertEquals("English (PCFG/left3words) tagger/parser tag set mismatch",
lp.getLexicon().tagSet(lp.treebankLanguagePack().getBasicCategoryFunction()), tagger.getTags().tagSet());
}

}
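Note: if this assertion ever fails, printing both tag inventories side by side shows the offending tags directly. A minimal sketch reusing exactly the calls from the test above (it assumes the same model files are on the classpath; the class name TagSetDebug is hypothetical, not part of the commit):

import edu.stanford.nlp.parser.lexparser.LexicalizedParser;
import edu.stanford.nlp.tagger.maxent.MaxentTagger;

public class TagSetDebug {
  public static void main(String[] args) {
    LexicalizedParser lp = LexicalizedParser.loadModel(
        "edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz");
    MaxentTagger tagger = new MaxentTagger(
        "edu/stanford/nlp/models/pos-tagger/english-left3words/english-left3words-distsim.tagger");
    // The basic-category tag set known to the parser's lexicon ...
    System.out.println("parser tags: "
        + lp.getLexicon().tagSet(lp.treebankLanguagePack().getBasicCategoryFunction()));
    // ... and the tag set the tagger was trained with; the test asserts these are equal.
    System.out.println("tagger tags: " + tagger.getTags().tagSet());
  }
}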
19 changes: 12 additions & 7 deletions itest/src/edu/stanford/nlp/pipeline/StanfordCoreNLPITest.java
@@ -1,5 +1,7 @@
 package edu.stanford.nlp.pipeline;

+import edu.stanford.nlp.ie.machinereading.structure.MachineReadingAnnotations;
+import edu.stanford.nlp.ie.machinereading.structure.RelationMention;
 import edu.stanford.nlp.ling.CoreAnnotations;
 import edu.stanford.nlp.ling.CoreLabel;
 import edu.stanford.nlp.trees.TreeCoreAnnotations;
@@ -173,15 +175,18 @@ public void testRelationExtractor() throws Exception {
   // Check the regexner is integrated with the StanfordCoreNLP
   Properties props = new Properties();
   props.setProperty("annotators", "tokenize,ssplit,pos,lemma,ner,parse,relation");

-  String text = "Barack Obama is the 44th President of the United States. He is the first African American president.";
+  //props.setProperty("sup.relation.model", "/home/sonalg/javanlp/tmp/roth_relation_model_pipeline.ser");
+  String text = "Barack Obama, a Yale professor, is president.";
   Annotation document = new Annotation(text);
   StanfordCoreNLP pipeline = new StanfordCoreNLP(props);
   pipeline.annotate(document);

-  StringWriter stringWriter = new StringWriter();
-  pipeline.prettyPrint(document, new PrintWriter(stringWriter));
-  String result = stringWriter.getBuffer().toString();
+  CoreMap sentence = document.get(CoreAnnotations.SentencesAnnotation.class).get(0);
+  List<RelationMention> rel = sentence.get(MachineReadingAnnotations.RelationMentionsAnnotation.class);
+  assertEquals(rel.get(0).getType(), "Work_For");
+  // StringWriter stringWriter = new StringWriter();
+  // pipeline.prettyPrint(document, new PrintWriter(stringWriter));
+  // String result = stringWriter.getBuffer().toString();
+  // System.out.println(result);
 }




@@ -230,7 +235,7 @@ private boolean contains(String string, String regexp) {
   Matcher matcher = pattern.matcher(string);
   return matcher.find();
 }

 public void testSerialization()
   throws Exception
 {
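Note: the rewritten testRelationExtractor above is also a compact recipe for pulling relation mentions out of the pipeline. A standalone sketch of the same API usage (the driver class RelationDemo is hypothetical; the annotator list and the example sentence are the ones the test itself configures):

import java.util.List;
import java.util.Properties;

import edu.stanford.nlp.ie.machinereading.structure.MachineReadingAnnotations;
import edu.stanford.nlp.ie.machinereading.structure.RelationMention;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.StanfordCoreNLP;
import edu.stanford.nlp.util.CoreMap;

public class RelationDemo {
  public static void main(String[] args) {
    Properties props = new Properties();
    props.setProperty("annotators", "tokenize,ssplit,pos,lemma,ner,parse,relation");
    StanfordCoreNLP pipeline = new StanfordCoreNLP(props);

    Annotation document = new Annotation("Barack Obama, a Yale professor, is president.");
    pipeline.annotate(document);

    // Relation mentions hang off each sentence, not off the document.
    for (CoreMap sentence : document.get(CoreAnnotations.SentencesAnnotation.class)) {
      List<RelationMention> rels =
          sentence.get(MachineReadingAnnotations.RelationMentionsAnnotation.class);
      if (rels != null) {
        for (RelationMention rm : rels) {
          System.out.println(rm.getType() + ": " + rm);  // e.g. "Work_For"
        }
      }
    }
  }
}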
Binary file added lib/appbundler-1.0.jar
43 changes: 35 additions & 8 deletions src/edu/stanford/nlp/classify/ColumnDataClassifier.java
@@ -36,9 +36,13 @@
 import edu.stanford.nlp.ling.BasicDatum;
 import edu.stanford.nlp.ling.Datum;
 import edu.stanford.nlp.ling.RVFDatum;
+import edu.stanford.nlp.ling.Word;
 import edu.stanford.nlp.objectbank.ObjectBank;
 import edu.stanford.nlp.optimization.DiffFunction;
 import edu.stanford.nlp.optimization.Minimizer;
+import edu.stanford.nlp.process.PTBTokenizer;
+import edu.stanford.nlp.process.Tokenizer;
+import edu.stanford.nlp.process.TokenizerFactory;
 import edu.stanford.nlp.process.WordShapeClassifier;
 import edu.stanford.nlp.stats.*;
 import edu.stanford.nlp.util.*;
@@ -149,6 +153,7 @@
 * <tr><td> splitWordsRegexp</td><td>String</td><td>null</td><td>If defined, use this as a regular expression on which to split the whole string (as in the String.split() function, which will return the things between delimiters, and discard the delimiters). The resulting split-up "words" will be used in classifier features iff one of the other "useSplit" options is turned on.</td></tr>
 * <tr><td> splitWordsTokenizerRegexp</td><td>String</td><td>null</td><td>If defined, use this as a regular expression to cut initial pieces off a String. Either this regular expression or <code>splitWordsIgnoreRegexp</code> <i>should always match</i> the start of the String, and the size of the token is the number of characters matched. So, for example, one can group letter and number characters but do nothing else with a regular expression like <code>([A-Za-z]+|[0-9]+|.)</code>, where the last disjunct will match any other single character. (If neither regular expression matches, the first character of the string is treated as a one character word, and then matching is tried again, but in this case a warning message is printed.) Note that, for Java regular expressions with disjunctions like this, the match is the first matching disjunction, not the longest matching disjunction, so patterns with common prefixes need to be ordered from most specific (longest) to least specific (shortest). The resulting split up "words" will be used in classifier features iff one of the other "useSplit" options is turned on. Note that as usual for Java String processing, backslashes must be doubled in the regular expressions that you write.</td></tr>
 * <tr><td> splitWordsIgnoreRegexp</td><td>String</td><td>\\s+</td><td>If non-empty, this regexp is used to determine character sequences which should not be returned as tokens when using <code>splitWordsTokenizerRegexp</code> or <code>splitWordsRegexp</code>. With the former, first the program attempts to match this regular expression at the start of the string (with <code>lookingAt()</code>) and if it matches, those characters are discarded, but if it doesn't match then <code>splitWordsTokenizerRegexp</code> is tried. With <code>splitWordsRegexp</code>, this is used to filter tokens (with <code>matches()</code>) resulting from the splitting. By default this regular expression is set to be all whitespace tokens (i.e., \\s+). Set it to an empty string to get all tokens returned.</td></tr>
+* <tr><td> splitWordsWithPTBTokenizer</td><td>boolean</td><td>false</td><td>If true, and <code>splitWordsRegexp</code> and <code>splitWordsTokenizerRegexp</code> are false, then will tokenize using the <code>PTBTokenizer</code></td></tr>
 * <tr><td> useSplitWords</td><td>boolean</td><td>false</td><td>Make features from the "words" that are returned by dividing the string on splitWordsRegexp or splitWordsTokenizerRegexp. Requires splitWordsRegexp or splitWordsTokenizerRegexp.</td><td>SW-<i>str</i></td></tr>
 * <tr><td> useLowercaseSplitWords</td><td>boolean</td><td>false</td><td>Make features from the "words" that are returned by dividing the string on splitWordsRegexp or splitWordsTokenizerRegexp and then lowercasing the result. Requires splitWordsRegexp or splitWordsTokenizerRegexp. Note that this can be specified independently of useSplitWords. You can put either or both original cased and lowercased words in as features.</td><td>SW-<i>str</i></td></tr>
 * <tr><td> useSplitWordPairs</td><td>boolean</td><td>false</td><td>Make features from the pairs of adjacent "words" that are returned by dividing the string into splitWords. Requires splitWordsRegexp or splitWordsTokenizerRegexp.</td><td>SWP-<i>str1</i>-<i>str2</i></td></tr>
@@ -225,6 +230,7 @@ public class ColumnDataClassifier {
 private final Flags[] flags;
 private final Flags globalFlags; // simply points to flags[0]
 private Classifier<String,String> classifier; // really only assigned once too (either in train or load in setProperties)
+private TokenizerFactory<Word> ptbFactory;

 /**
@@ -236,7 +242,7 @@ public class ColumnDataClassifier {
  * @return A Datum (may be an RVFDatum; never null)
  */
 public Datum<String,String> makeDatumFromLine(String line) {
-  return makeDatumFromStrings(tab.split(line));
+  return makeDatumFromStrings(splitLineToFields(line));
 }

@@ -245,7 +251,7 @@ public Datum<String,String> makeDatumFromLine(String line) {
  * If real-valued features are used, this method accesses makeRVFDatumFromLine
  * and returns an RVFDatum; otherwise, categorical features are used.
  *
- * @param strings The elements that features a made from (the tab-split columns of a TSV file)
+ * @param strings The elements that features are made from (the columns of a TSV/CSV file)
  * @return A Datum (may be an RVFDatum; never null)
  */
 public Datum<String,String> makeDatumFromStrings(String[] strings) {
@@ -846,7 +852,8 @@ private static <F> void addFeature(Object features, F newFeature, double value)
  * @param cWord The String to extract data from
  */
 private void makeDatum(String cWord, Flags flags, Object featuresC, String goldAns) {
   //System.err.println("Making features for " + cWord + " flags " + flags);
   if (flags == null) {
     // no features for this column
     return;
@@ -918,12 +925,15 @@ private void makeDatum(String cWord, Flags flags, Object featuresC, String goldA
     addFeature(featuresC, featureName, DEFAULT_VALUE);
   }
 }
-if (flags.splitWordsPattern != null || flags.splitWordsTokenizerPattern != null) {
+if (flags.splitWordsPattern != null || flags.splitWordsTokenizerPattern != null ||
+    flags.splitWordsWithPTBTokenizer) {
   String[] bits;
   if (flags.splitWordsTokenizerPattern != null) {
     bits = regexpTokenize(flags.splitWordsTokenizerPattern, flags.splitWordsIgnorePattern, cWord);
-  } else {
+  } else if (flags.splitWordsPattern != null) {
     bits = splitTokenize(flags.splitWordsPattern, flags.splitWordsIgnorePattern, cWord);
+  } else { //PTB tokenizer
+    bits = ptbTokenize(cWord);
   }
   if (flags.showTokenization) {
     System.err.print("Tokenization: ");
@@ -1021,6 +1031,18 @@ private void makeDatum(String cWord, Flags flags, Object featuresC, String goldA
//System.err.println("Made featuresC " + featuresC); //System.err.println("Made featuresC " + featuresC);
} //end makeDatum } //end makeDatum


+  // return the tokens using the PTB tokenizer
+  private String[] ptbTokenize(String cWord) {
+    if (ptbFactory == null) {
+      ptbFactory = PTBTokenizer.factory();  // lazily create the shared factory
+    }
+    Tokenizer<Word> tokenizer = ptbFactory.getTokenizer(new StringReader(cWord));
+    List<Word> words = tokenizer.tokenize();
+    String[] res = new String[words.size()];
+    for (int i = 0; i < words.size(); ++i) {
+      res[i] = words.get(i).word();
+    }
+    return res;
+  }


 /**
* Caches a hash of word to all substring features. A <i>lot</i> of memory! * Caches a hash of word to all substring features. A <i>lot</i> of memory!
@@ -1390,8 +1412,9 @@ private Flags[] setProperties(Properties props) {
 for (Enumeration<?> e = props.propertyNames(); e.hasMoreElements();) {
   String key = (String) e.nextElement();
   String val = props.getProperty(key);
+
   int col = 0; // the default (first after class)
-  // System.err.println(key + " = " + val);
+  System.err.println(key + " = " + val);
   Matcher matcher = prefix.matcher(key);
   if (matcher.matches()) {
     col = Integer.parseInt(matcher.group(1));
@@ -1663,8 +1686,10 @@ private Flags[] setProperties(Properties props) {
} else if (key.equals("shuffleSeed")) { } else if (key.equals("shuffleSeed")) {
myFlags[col].shuffleSeed = Long.parseLong(val); myFlags[col].shuffleSeed = Long.parseLong(val);
} else if (key.equals("csvFormat")) { } else if (key.equals("csvFormat")) {
myFlags[col].csvFormat=true; myFlags[col].csvFormat= Boolean.parseBoolean(val);

} else if (key.equals("splitWordsWithPTBTokenizer")) {
System.out.println("splitting with ptb tokenizer");
myFlags[col].splitWordsWithPTBTokenizer=Boolean.parseBoolean(val);
} else if ( ! key.isEmpty() && ! key.equals("prop")) { } else if ( ! key.isEmpty() && ! key.equals("prop")) {
System.err.println("Unknown property: |" + key + '|'); System.err.println("Unknown property: |" + key + '|');
} }
@@ -2014,7 +2039,9 @@ static class Flags implements Serializable {
 int crossValidationFolds = -1;
 boolean shuffleTrainingData = false;
 long shuffleSeed = 0;

 static boolean csvFormat = false; //train and test files are in csv format
+boolean splitWordsWithPTBTokenizer = false;

 @Override
 public String toString() {
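Note: putting the new flag together with the existing ColumnDataClassifier API, a hedged end-to-end sketch. The train.tsv/test.tsv file names and the one-text-column data layout are placeholder assumptions; makeDatumFromLine is the method touched in this diff, and the Properties constructor, readTrainingExamples, and makeClassifier follow the class's documented usage pattern:

import java.util.Properties;

import edu.stanford.nlp.classify.Classifier;
import edu.stanford.nlp.classify.ColumnDataClassifier;
import edu.stanford.nlp.ling.Datum;
import edu.stanford.nlp.objectbank.ObjectBank;

public class CdcPtbDemo {
  public static void main(String[] args) {
    Properties props = new Properties();
    // Tab-separated data assumed: column 0 is the class, column 1 is raw text.
    props.setProperty("useClassFeature", "true");
    props.setProperty("1.useSplitWords", "true");
    // New in this commit: with neither splitWordsRegexp nor
    // splitWordsTokenizerRegexp set, fall back to the PTBTokenizer.
    props.setProperty("1.splitWordsWithPTBTokenizer", "true");

    ColumnDataClassifier cdc = new ColumnDataClassifier(props);
    Classifier<String,String> cl =
        cdc.makeClassifier(cdc.readTrainingExamples("train.tsv"));
    for (String line : ObjectBank.getLineIterator("test.tsv", "utf-8")) {
      Datum<String,String> d = cdc.makeDatumFromLine(line);
      System.out.println(line + "  ==>  " + cl.classOf(d));
    }
  }
}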
3 changes: 3 additions & 0 deletions src/edu/stanford/nlp/ie/NERClassifierCombiner.java
@@ -10,6 +10,7 @@
 import edu.stanford.nlp.pipeline.Annotation;
 import edu.stanford.nlp.util.CoreMap;
 import edu.stanford.nlp.util.PropertiesUtils;
+import edu.stanford.nlp.util.RuntimeInterruptedException;
 import edu.stanford.nlp.util.StringUtils;


 /**
@@ -114,6 +115,8 @@ public List<CoreLabel> classifyWithGlobalInformation(List<CoreLabel> tokens, fin
   // note: requires TextAnnotation, PartOfSpeechTagAnnotation, and AnswerAnnotation
   // note: this sets AnswerAnnotation!
   recognizeNumberSequences(output, document, sentence);
+} catch (RuntimeInterruptedException e) {
+  throw e;
 } catch (Exception e) {
   System.err.println("Ignored an exception in NumberSequenceClassifier: (result is that some numbers were not classified)");
   System.err.println("Tokens: " + StringUtils.joinWords(tokens, " "));
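Note: the change above encodes a general pattern worth naming: when a catch-all exists to make a step best-effort, any exception used to signal cancellation or interruption must be re-thrown before the broad catch can swallow it. Schematically (class and method names here are stand-ins, not the real ones):

import edu.stanford.nlp.util.RuntimeInterruptedException;

public class BestEffortStep {
  void runBestEffort() {
    try {
      riskyWork();  // stand-in for recognizeNumberSequences(...)
    } catch (RuntimeInterruptedException e) {
      throw e;  // interruption must propagate; do not treat it as a soft failure
    } catch (Exception e) {
      // Everything else is logged and ignored, so the pipeline keeps going.
      System.err.println("Ignored an exception: " + e.getMessage());
    }
  }

  void riskyWork() { /* ... */ }
}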
37 changes: 35 additions & 2 deletions src/edu/stanford/nlp/ie/machinereading/GenericDataSetReader.java
@@ -4,9 +4,9 @@
 import java.util.ArrayList;
 import java.util.Collections;
 import java.util.List;
+import java.util.Properties;
 import java.util.logging.Level;
 import java.util.logging.Logger;
-import java.util.regex.Pattern;


 import edu.stanford.nlp.ie.machinereading.common.NoPunctuationHeadFinder;
 import edu.stanford.nlp.ie.machinereading.structure.EntityMention;
@@ -15,6 +15,9 @@
 import edu.stanford.nlp.ling.CoreAnnotations;
 import edu.stanford.nlp.ling.CoreLabel;
 import edu.stanford.nlp.ling.Label;
+import edu.stanford.nlp.stats.ClassicCounter;
+import edu.stanford.nlp.stats.Counter;
+import edu.stanford.nlp.stats.Counters;
 import edu.stanford.nlp.trees.TreeCoreAnnotations;
 import edu.stanford.nlp.parser.common.ParserAnnotations;
 import edu.stanford.nlp.parser.common.ParserConstraint;
@@ -135,10 +138,38 @@ public final Annotation parse(String path) throws IOException {


 if (preProcessSentences) {
   preProcessSentences(retVal);
+  if (MachineReadingProperties.trainUsePipelineNER) {
+    logger.severe("Changing NER tags using the CoreNLP pipeline.");
+    modifyUsingCoreNLPNER(retVal);
+  }
 }

 return retVal;
 }

+private void modifyUsingCoreNLPNER(Annotation doc) {
+  Properties ann = new Properties();
+  ann.setProperty("annotators", "pos, lemma, ner");
+  StanfordCoreNLP pipeline = new StanfordCoreNLP(ann, false);
+  pipeline.annotate(doc);
+  for (CoreMap sentence : doc.get(CoreAnnotations.SentencesAnnotation.class)) {
+    List<EntityMention> entities = sentence.get(MachineReadingAnnotations.EntityMentionsAnnotation.class);
+    if (entities != null) {
+      List<CoreLabel> tokens = sentence.get(CoreAnnotations.TokensAnnotation.class);
+      for (EntityMention en : entities) {
+        //System.out.println("old ner tag for " + en.getExtentString() + " was " + en.getType());
+        Span s = en.getExtent();
+        // majority vote over the pipeline's NER tags for the tokens in this span
+        Counter<String> allNertagforSpan = new ClassicCounter<String>();
+        for (int i = s.start(); i < s.end(); i++) {
+          allNertagforSpan.incrementCount(tokens.get(i).ner());
+        }
+        String entityNertag = Counters.argmax(allNertagforSpan);
+        en.setType(entityNertag);
+        //System.out.println("new ner tag is " + entityNertag);
+      }
+    }
+  }
+}


 public Annotation read(String path) throws Exception {
   return null;
@@ -466,6 +497,8 @@ private static CoreLabel initCoreLabel(String token) {
   label.setWord(token);
   label.setValue(token);
   label.set(CoreAnnotations.TextAnnotation.class, token);
+  label.set(CoreAnnotations.ValueAnnotation.class, token);
+
   return label;
 }


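Note: modifyUsingCoreNLPNER above retypes each entity mention by majority vote over the pipeline's per-token NER tags inside the mention's extent. A self-contained illustration of that counting step, with made-up tags for a hypothetical three-token mention:

import edu.stanford.nlp.stats.ClassicCounter;
import edu.stanford.nlp.stats.Counter;
import edu.stanford.nlp.stats.Counters;

public class MajorityVoteDemo {
  public static void main(String[] args) {
    // Suppose the pipeline tagged the mention's three tokens like this:
    String[] tokenTags = { "PERSON", "PERSON", "O" };

    Counter<String> votes = new ClassicCounter<String>();
    for (String tag : tokenTags) {
      votes.incrementCount(tag);
    }

    // argmax returns the highest-count key: "PERSON" wins 2-1,
    // so the whole mention would be retyped as PERSON.
    System.out.println(Counters.argmax(votes));
  }
}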
src/edu/stanford/nlp/ie/machinereading/MachineReadingProperties.java
@@ -144,6 +144,9 @@ public class MachineReadingProperties {


@Option(name="loadModel",gloss="if true, load a serialized model rather than training a new one") @Option(name="loadModel",gloss="if true, load a serialized model rather than training a new one")
static protected boolean loadModel = false; static protected boolean loadModel = false;

@Option(name="trainUsePipelineNER", gloss="during training, use NER generated by the CoreNLP pipeline")
static public boolean trainUsePipelineNER = false;


 /**
  * evaluation options (ignored if trainOnly is true)
@@ -250,6 +250,7 @@ private List<CoreMap> readDocument(String prefix, Annotation corpus) throws IOEx
 for(int i = 0; i < tokens.size(); i ++){
   CoreLabel l = new CoreLabel();
   l.setWord(tokens.get(i).getLiteral());
+  l.set(CoreAnnotations.ValueAnnotation.class, l.word());
   l.set(CoreAnnotations.CharacterOffsetBeginAnnotation.class, tokens.get(i).getByteStart());
   l.set(CoreAnnotations.CharacterOffsetEndAnnotation.class, tokens.get(i).getByteEnd());
   words.add(l);
