diff --git a/JavaNLP-core.iml b/JavaNLP-core.iml
index 07016325ce..fb08673c08 100644
--- a/JavaNLP-core.iml
+++ b/JavaNLP-core.iml
@@ -1,5 +1,5 @@
-
+
@@ -311,5 +311,4 @@
-
-
+
\ No newline at end of file
diff --git a/itest/src/edu/stanford/nlp/ie/NumberSequenceClassifierITest.java b/itest/src/edu/stanford/nlp/ie/NumberSequenceClassifierITest.java
index 47c2e401d6..47ad36b348 100644
--- a/itest/src/edu/stanford/nlp/ie/NumberSequenceClassifierITest.java
+++ b/itest/src/edu/stanford/nlp/ie/NumberSequenceClassifierITest.java
@@ -36,30 +36,30 @@ private static void checkLabels(StanfordCoreNLP pipe, String text, String [] lab
CoreMap sent = doc.get(CoreAnnotations.SentencesAnnotation.class).get(0);
assertTrue(sent.get(CoreAnnotations.TokensAnnotation.class) != null);
List tokens = sent.get(CoreAnnotations.TokensAnnotation.class);
- if(VERBOSE){
+ if (VERBOSE) {
for(CoreLabel token: tokens) {
- System.out.println("\t" + token.word() + " " +
- token.tag() + " " +
- token.ner() + " " +
+ System.out.println('\t' + token.word() + ' ' +
+ token.tag() + ' ' +
+ token.ner() + ' ' +
(token.containsKey(CoreAnnotations.NumericCompositeTypeAnnotation.class) ? token.get(CoreAnnotations.NumericCompositeValueAnnotation.class) + " " : "") +
- (token.containsKey(TimeAnnotations.TimexAnnotation.class) ? token.get(TimeAnnotations.TimexAnnotation.class) + " " : ""));
+ (token.containsKey(TimeAnnotations.TimexAnnotation.class) ? token.get(TimeAnnotations.TimexAnnotation.class) + " " : ""));
}
}
-
+
// check NER labels
assertTrue(tokens.size() == labels.length);
- for(int i = 0; i < labels.length; i ++){
+ for (int i = 0; i < labels.length; i ++) {
if(labels[i] == null){
assertTrue(tokens.get(i).ner() == null);
} else {
Pattern p = Pattern.compile(labels[i]);
System.err.println("COMPARING NER " + labels[i] + " with " + tokens.get(i).ner());
System.err.flush();
- assertTrue(tokens.get(i).ner() != null);
+ assertTrue("NER should not be null for token " + tokens.get(i) + " in sentence " + tokens, tokens.get(i).ner() != null);
assertTrue(tokens.get(i).ner() + " does not match " + p + " for token " + tokens.get(i) + " in sentence " + tokens, p.matcher(tokens.get(i).ner()).matches());
}
}
-
+
// check normalized values, if gold is given
if(normed != null){
assertTrue(tokens.size() == normed.length);
@@ -70,8 +70,8 @@ private static void checkLabels(StanfordCoreNLP pipe, String text, String [] lab
Pattern p = Pattern.compile(normed[i]);
String n = tokens.get(i).get(CoreAnnotations.NormalizedNamedEntityTagAnnotation.class);
String message = "COMPARING NORMED \"" + normed[i] + "\" with \"" + n + "\"";
- assertTrue(message, n != null);
- assertTrue(message, p.matcher(n).matches());
+ assertTrue(message + "; latter should not be null", n != null);
+ assertTrue(message + "; latter should match", p.matcher(n).matches());
}
}
}
@@ -83,8 +83,8 @@ private static void run(String header, String [] texts, String [][] answers, Str
if(VERBOSE) {
System.out.println("Running test " + header + " for text: " + texts[i]);
}
- checkLabels(pipe,
- texts[i],
+ checkLabels(pipe,
+ texts[i],
answers[i],
normed != null ? normed[i] : null);
}
@@ -100,10 +100,10 @@ private static void run(String header, String [] texts, String [][] answers, Str
"It cost four million dollars",
"It cost $1m",
"It cost 50 cents",
- "It cost # 1500",
+ "It cost £ 1500",
"It cost \u00A3 1500",
"It cost \u00A3 .50",
- "It cost # .50",
+ "It cost € .50",
"It cost $ 1500",
"It cost $1500",
"It cost $ 1,500",
@@ -111,8 +111,10 @@ private static void run(String header, String [] texts, String [][] answers, Str
"It cost $48.75",
"It cost $ 57.60",
"It cost $8 thousand",
- "It cost $42,33"
+ "It cost $42,33",
+// "It cost ₩1500", // TODO: Add won symbol to PTBTokenizer
};
+
private static final String [][] moneyAnswers = {
{ null, null, "MONEY", "MONEY" },
{ null, null, "MONEY", "MONEY" },
@@ -134,8 +136,10 @@ private static void run(String header, String [] texts, String [][] answers, Str
{ null, null, "MONEY", "MONEY" },
{ null, null, "MONEY", "MONEY" },
{ null, null, "MONEY", "MONEY", "MONEY" },
- { null, null, "MONEY", "MONEY" }
+ { null, null, "MONEY", "MONEY" },
+// { null, null, "MONEY", "MONEY" },
};
+
private static final String [][] moneyNormed = {
{ null, null, "\\$5.0", "\\$5.0" },
{ null, null, "\\$0.24", "\\$0.24" },
@@ -149,7 +153,7 @@ private static void run(String header, String [] texts, String [][] answers, Str
{ null, null, "\u00A31500.0", "\u00A31500.0" },
{ null, null, "\u00A31500.0", "\u00A31500.0" },
{ null, null, "\u00A30.5", "\u00A30.5" },
- { null, null, "\u00A30.5", "\u00A30.5" },
+ { null, null, "\\$0.5", "\\$0.5" }, // TODO: Fix PTBTokenizer to really normalize it to Euro €
{ null, null, "\\$1500.0", "\\$1500.0" },
{ null, null, "\\$1500.0", "\\$1500.0" },
{ null, null, "\\$1500.0", "\\$1500.0" },
@@ -157,8 +161,10 @@ private static void run(String header, String [] texts, String [][] answers, Str
{ null, null, "\\$48.75", "\\$48.75" },
{ null, null, "\\$57.6", "\\$57.6" },
{ null, null, "\\$8000.0", "\\$8000.0", "\\$8000.0" },
- { null, null, "\\$4233.0", "\\$4233.0" }
+ { null, null, "\\$4233.0", "\\$4233.0" },
+// { null, null, "₩4233.0", "₩4233.0" },
};
+
public void testMoney() {
run("MONEY", moneyStrings, moneyAnswers, moneyNormed);
}
@@ -185,7 +191,7 @@ public void testMoney() {
{ null, null, null, "1000.0", null },
};
public void testOrdinal() {
- run("ORDINAL", ordinalStrings, ordinalAnswers, ordinalNormed);
+ run("ORDINAL", ordinalStrings, ordinalAnswers, ordinalNormed);
}
private static final String [] dateStrings = {
@@ -243,7 +249,7 @@ public void testOrdinal() {
{ "2008-06-06" , "2008-06-06", "2008-06-06", null, "2008-06-07" , "2008-06-07", "2008-06-07" },
};
public void testDate() {
- run("DATE", dateStrings, dateAnswers, dateNormed);
+ run("DATE", dateStrings, dateAnswers, dateNormed);
}
private static final String [] numberStrings = {
@@ -280,9 +286,9 @@ public void testDate() {
{ "801.0", null, "123.0", null }
};
public void testNumber() {
- run("NUMBER", numberStrings, numberAnswers, numberNormed);
+ run("NUMBER", numberStrings, numberAnswers, numberNormed);
}
-
+
private static final String [] timeStrings = {
"the time was 10:20",
"12:29 p.m.",
diff --git a/itest/src/edu/stanford/nlp/pipeline/NERCombinerAnnotatorITest.java b/itest/src/edu/stanford/nlp/pipeline/NERCombinerAnnotatorITest.java
index fd56fc70e1..1030faa4c9 100644
--- a/itest/src/edu/stanford/nlp/pipeline/NERCombinerAnnotatorITest.java
+++ b/itest/src/edu/stanford/nlp/pipeline/NERCombinerAnnotatorITest.java
@@ -13,7 +13,7 @@
import java.util.Properties;
-/**
+/**
* @author Angel Chang
* @author John Bauer
*/
@@ -23,9 +23,9 @@ public class NERCombinerAnnotatorITest extends TestCase {
public static final String NER_7CLASS = DefaultPaths.DEFAULT_NER_MUC_MODEL;
public static final String NER_MISCCLASS = DefaultPaths.DEFAULT_NER_CONLL_MODEL;
- static NERCombinerAnnotator nerAnnotator = null;
- static AnnotationPipeline unthreadedPipeline = null;
- static AnnotationPipeline threaded4Pipeline = null;
+ private static NERCombinerAnnotator nerAnnotator = null;
+ private static AnnotationPipeline unthreadedPipeline = null;
+ private static AnnotationPipeline threaded4Pipeline = null;
/**
* Creates the tagger annotator if it isn't already created
@@ -42,7 +42,7 @@ public void setUp()
props.setProperty("ner.applyNumericClassifiers", "false");
props.setProperty("ner.useSUTime", "false");
props.setProperty("ner.model", NER_3CLASS);
- NERClassifierCombiner ner = NERCombinerAnnotator.createNERClassifierCombiner("ner", props);
+ NERClassifierCombiner ner = NERClassifierCombiner.createNERClassifierCombiner("ner", props);
NERCombinerAnnotator threaded4Annotator = new NERCombinerAnnotator(ner, false, 4, -1);
threaded4Pipeline = new AnnotationPipeline();
@@ -75,7 +75,7 @@ public void testThreadedAnnotator() {
verifyAnswers(ANSWERS, document);
}
- public void verifyAnswers(String[][] expected, Annotation document) {
+ public static void verifyAnswers(String[][] expected, Annotation document) {
int sentenceIndex = 0;
for (CoreMap sentence : document.get(CoreAnnotations.SentencesAnnotation.class)) {
List tokens = sentence.get(CoreAnnotations.TokensAnnotation.class);
@@ -100,7 +100,7 @@ public void verifyAnswers(String[][] expected, Annotation document) {
private static Iterator getTestData(String inputString, boolean includeAnswer)
{
- ColumnTabDocumentReaderWriter colReader = new ColumnTabDocumentReaderWriter();
+ ColumnTabDocumentReaderWriter colReader = new ColumnTabDocumentReaderWriter<>();
if (includeAnswer) {
colReader.init("word=0,tag=1,answer=2");
} else {
diff --git a/src/edu/stanford/nlp/ie/AbstractSequenceClassifier.java b/src/edu/stanford/nlp/ie/AbstractSequenceClassifier.java
index 4b9e541980..7de1948549 100644
--- a/src/edu/stanford/nlp/ie/AbstractSequenceClassifier.java
+++ b/src/edu/stanford/nlp/ie/AbstractSequenceClassifier.java
@@ -108,8 +108,7 @@ public DocumentReaderAndWriter plainTextReaderAndWriter() {
* Construct a SeqClassifierFlags object based on the passed in properties,
* and then call the other constructor.
*
- * @param props
- * See SeqClassifierFlags for known properties.
+ * @param props See SeqClassifierFlags for known properties.
*/
public AbstractSequenceClassifier(Properties props) {
this(new SeqClassifierFlags(props));
@@ -124,14 +123,13 @@ public AbstractSequenceClassifier(Properties props) {
public AbstractSequenceClassifier(SeqClassifierFlags flags) {
this.flags = flags;
- // try {
// Thang Sep13: allow for multiple feature factories.
this.featureFactories = Generics.newArrayList();
if (flags.featureFactory != null) {
FeatureFactory factory = new MetaClass(flags.featureFactory).createInstance(flags.featureFactoryArgs); // for compatibility
featureFactories.add(factory);
}
- if(flags.featureFactories!=null){
+ if (flags.featureFactories != null) {
for (int i = 0; i < flags.featureFactories.length; i++) {
FeatureFactory indFeatureFactory = new MetaClass(flags.featureFactories[i]).
createInstance(flags.featureFactoriesArgs.get(i));
@@ -142,11 +140,7 @@ public AbstractSequenceClassifier(SeqClassifierFlags flags) {
tokenFactory = (CoreTokenFactory) new CoreLabelTokenFactory();
} else {
this.tokenFactory = new MetaClass(flags.tokenFactory).createInstance(flags.tokenFactoryArgs);
- // this.tokenFactory = (CoreTokenFactory) Class.forName(flags.tokenFactory).newInstance();
}
- // } catch (Exception e) {
- // throw new RuntimeException(e);
- // }
pad = tokenFactory.makeToken();
windowSize = flags.maxLeft + 1;
reinit();
@@ -281,7 +275,7 @@ public List classifySentence(List extends HasWord> sentence) {
i++;
}
- // TODO get rid of objectbankwrapper
+ // TODO get rid of ObjectBankWrapper
ObjectBankWrapper wrapper = new ObjectBankWrapper(flags, null, knownLCWords);
wrapper.processDocument(document);
@@ -320,7 +314,7 @@ public List classifySentenceWithGlobalInformation(List extends HasWord> to
i++;
}
- // TODO get rid of objectbankwrapper
+ // TODO get rid of ObjectBankWrapper
ObjectBankWrapper wrapper = new ObjectBankWrapper(flags, null, knownLCWords);
wrapper.processDocument(document);
@@ -402,7 +396,7 @@ public DFSA getViterbiSearchGraph(List doc, Class extends
if (doc.isEmpty()) {
return new DFSA(null);
}
- // TODO get rid of objectbankwrapper
+ // TODO get rid of ObjectBankWrapper
ObjectBankWrapper obw = new ObjectBankWrapper(flags, null, knownLCWords);
doc = obw.processDocument(doc);
SequenceModel model = getSequenceModel(doc);
@@ -441,8 +435,7 @@ public List> classify(String str) {
* Classify the tokens in a String. Each sentence becomes a separate document.
* Doesn't override default readerAndWriter.
*
- * @param str
- * A String with tokens in one or more sentences of text to be
+ * @param str A String with tokens in one or more sentences of text to be
* classified.
* @return {@link List} of classified sentences (each a List of something that
* extends {@link CoreMap}).
@@ -563,7 +556,7 @@ public String classifyToString(String sentences, String outputFormat, boolean pr
plainTextReaderAndWriter.printAnswers(docOutput, pw);
pw.flush();
sb.append(sw.toString());
- sb.append("\n");
+ sb.append('\n');
}
}
return sb.toString();
@@ -791,8 +784,7 @@ public void train(String[] trainFileList,
* Trains a classifier from a Collection of sequences.
* Note that the Collection can be (and usually is) an ObjectBank.
*
- * @param docs
- * An Objectbank or a collection of sequences of IN
+ * @param docs An ObjectBank or a collection of sequences of IN
*/
public void train(Collection> docs) {
train(docs, defaultReaderAndWriter);
@@ -802,10 +794,8 @@ public void train(Collection> docs) {
* Trains a classifier from a Collection of sequences.
* Note that the Collection can be (and usually is) an ObjectBank.
*
- * @param docs
- * An ObjectBank or a collection of sequences of IN
- * @param readerAndWriter
- * A DocumentReaderAndWriter to use when loading test files
+ * @param docs An ObjectBank or a collection of sequences of IN
+ * @param readerAndWriter A DocumentReaderAndWriter to use when loading test files
*/
public abstract void train(Collection> docs,
DocumentReaderAndWriter readerAndWriter);
@@ -858,9 +848,8 @@ public ObjectBank> makeObjectBankFromFiles(String[] trainFileList,
File f = new File(trainFile);
files.add(f);
}
- // System.err.printf("trainFileList contains %d file%s.\n", files.size(),
- // files.size() == 1 ? "": "s");
- // TODO get rid of objectbankwrapper
+ // System.err.printf("trainFileList contains %d file%s in encoding %s.%n", files.size(), files.size() == 1 ? "": "s", flags.inputEncoding);
+ // TODO get rid of ObjectBankWrapper
// return new ObjectBank>(new
// ResettableReaderIteratorFactory(files), readerAndWriter);
return new ObjectBankWrapper(flags, new ObjectBank>(new ResettableReaderIteratorFactory(files, flags.inputEncoding),
@@ -892,7 +881,7 @@ public ObjectBank> makeObjectBankFromFiles(String baseDir, String fileP
// return new ObjectBank>(new
// ResettableReaderIteratorFactory(files, flags.inputEncoding),
// readerAndWriter);
- // TODO get rid of objectbankwrapper
+ // TODO get rid of ObjectBankWrapper
return new ObjectBankWrapper(flags, new ObjectBank>(new ResettableReaderIteratorFactory(files,
flags.inputEncoding), readerAndWriter), knownLCWords);
}
@@ -905,7 +894,7 @@ public ObjectBank> makeObjectBankFromFiles(Collection files,
// return new ObjectBank>(new
// ResettableReaderIteratorFactory(files, flags.inputEncoding),
// readerAndWriter);
- // TODO get rid of objectbankwrapper
+ // TODO get rid of ObjectBankWrapper
return new ObjectBankWrapper(flags, new ObjectBank>(new ResettableReaderIteratorFactory(files,
flags.inputEncoding), readerAndWriter), knownLCWords);
}
@@ -929,7 +918,7 @@ public ObjectBank> makeObjectBankFromReader(BufferedReader in,
if (flags.announceObjectBankEntries) {
System.err.println("Reading data using " + readerAndWriter.getClass());
}
- // TODO get rid of objectbankwrapper
+ // TODO get rid of ObjectBankWrapper
// return new ObjectBank>(new ResettableReaderIteratorFactory(in),
// readerAndWriter);
return new ObjectBankWrapper(flags, new ObjectBank>(new ResettableReaderIteratorFactory(in),
@@ -956,8 +945,7 @@ public void printProbs(String filename,
* Takes a {@link List} of documents and prints the likelihood of each
* possible label at each point.
*
- * @param documents
- * A {@link List} of {@link List} of something that extends
+ * @param documents A {@link List} of {@link List} of something that extends
* {@link CoreMap}.
*/
public void printProbsDocuments(ObjectBank> documents) {
@@ -1076,9 +1064,9 @@ public void classifyAndWriteAnswers(Collection> documents,
Timing timer = new Timing();
- Counter entityTP = new ClassicCounter();
- Counter entityFP = new ClassicCounter();
- Counter entityFN = new ClassicCounter();
+ Counter entityTP = new ClassicCounter<>();
+ Counter entityFP = new ClassicCounter<>();
+ Counter entityFN = new ClassicCounter<>();
boolean resultsCounted = outputScores;
int numWords = 0;
int numDocs = 0;
@@ -1150,7 +1138,7 @@ public ThreadsafeProcessor, List> newInstance() {
*
* @param testFile The name of the file to test on.
* @param k How many best to print
- * @param readerAndWriter
+ * @param readerAndWriter Class to be used for printing answers
*/
public void classifyAndWriteAnswersKBest(String testFile, int k,
DocumentReaderAndWriter readerAndWriter)
diff --git a/src/edu/stanford/nlp/ie/ClassifierCombiner.java b/src/edu/stanford/nlp/ie/ClassifierCombiner.java
index 05e4859196..73f12e167a 100644
--- a/src/edu/stanford/nlp/ie/ClassifierCombiner.java
+++ b/src/edu/stanford/nlp/ie/ClassifierCombiner.java
@@ -101,28 +101,39 @@ else if((loadPath1 = p.getProperty("loadClassifier")) != null && (loadPath2 = p.
}
}
- /** Loads a series of base classifiers from the paths specified.
+ /** Loads a series of base classifiers from the paths specified using the
+ * Properties specified.
*
- * @param loadPaths Paths to the base classifiers
- * @throws FileNotFoundException If classifier files not found
+ * @param props Properties for the classifier to use (encodings, output format, etc.)
+ * @param combinationMode How to handle multiple classifiers specifying the same entity type
+ * @param loadPaths Paths to the base classifiers
+ * @throws IOException If IO errors in loading classifier files
*/
- public ClassifierCombiner(CombinationMode combinationMode, String... loadPaths) throws IOException {
- super(new Properties());
+ public ClassifierCombiner(Properties props, CombinationMode combinationMode, String... loadPaths) throws IOException {
+ super(props);
this.combinationMode = combinationMode;
List paths = new ArrayList<>(Arrays.asList(loadPaths));
loadClassifiers(paths);
}
+ /** Loads a series of base classifiers from the paths specified using the
+ * Properties specified.
+ *
+ * @param combinationMode How to handle multiple classifiers specifying the same entity type
+ * @param loadPaths Paths to the base classifiers
+ * @throws IOException If IO errors in loading classifier files
+ */
+ public ClassifierCombiner(CombinationMode combinationMode, String... loadPaths) throws IOException {
+ this(new Properties(), combinationMode, loadPaths);
+ }
+
/** Loads a series of base classifiers from the paths specified.
*
* @param loadPaths Paths to the base classifiers
* @throws FileNotFoundException If classifier files not found
*/
public ClassifierCombiner(String... loadPaths) throws IOException {
- super(new Properties());
- this.combinationMode = DEFAULT_COMBINATION_MODE;
- List paths = new ArrayList<>(Arrays.asList(loadPaths));
- loadClassifiers(paths);
+ this(DEFAULT_COMBINATION_MODE, loadPaths);
}
diff --git a/src/edu/stanford/nlp/ie/NERClassifierCombiner.java b/src/edu/stanford/nlp/ie/NERClassifierCombiner.java
index f63a266fe3..e547db3fed 100644
--- a/src/edu/stanford/nlp/ie/NERClassifierCombiner.java
+++ b/src/edu/stanford/nlp/ie/NERClassifierCombiner.java
@@ -5,9 +5,11 @@
import java.util.Properties;
import edu.stanford.nlp.ie.regexp.NumberSequenceClassifier;
+import edu.stanford.nlp.io.RuntimeIOException;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.pipeline.Annotation;
+import edu.stanford.nlp.pipeline.DefaultPaths;
import edu.stanford.nlp.util.CoreMap;
import edu.stanford.nlp.util.PropertiesUtils;
import edu.stanford.nlp.util.RuntimeInterruptedException;
@@ -64,7 +66,7 @@ public NERClassifierCombiner(boolean applyNumericClassifiers,
String... loadPaths)
throws IOException
{
- super(ClassifierCombiner.extractCombinationModeSafe(nscProps), loadPaths);
+ super(nscProps, ClassifierCombiner.extractCombinationModeSafe(nscProps), loadPaths);
this.applyNumericClassifiers = applyNumericClassifiers;
this.useSUTime = useSUTime;
this.nsc = new NumberSequenceClassifier(new Properties(), useSUTime, nscProps);
@@ -89,6 +91,51 @@ public NERClassifierCombiner(boolean applyNumericClassifiers,
this.nsc = new NumberSequenceClassifier(useSUTime);
}
+ /** This factory method is used to create the NERClassifierCombiner used in NERCombinerAnnotator
+ * (and, thence, in StanfordCoreNLP).
+ *
+ * @param name A "x.y" format property name prefix (the "x" part). This is commonly null,
+ * and then "ner" is used. If it is the empty string, then no property prefix is used.
+ * @param properties Various properties, including a list in "ner.model".
+ * The used ones start with name + "."
+ * @return An NERClassifierCombiner with the given properties
+ */
+ public static NERClassifierCombiner createNERClassifierCombiner(String name, Properties properties) {
+ String prefix = (name != null)? name + '.' : "ner.";
+ String modelNames = properties.getProperty(prefix + "model");
+ if (modelNames == null) {
+ modelNames = DefaultPaths.DEFAULT_NER_THREECLASS_MODEL + ',' + DefaultPaths.DEFAULT_NER_MUC_MODEL + ',' + DefaultPaths.DEFAULT_NER_CONLL_MODEL;
+ }
+ // but modelNames can still be empty string is set explicitly to be empty!
+ String[] models;
+ if ( ! modelNames.isEmpty()) {
+ models = modelNames.split(",");
+ } else {
+ // Allow for no real NER model - can just use numeric classifiers or SUTime
+ System.err.println("WARNING: no NER models specified");
+ models = StringUtils.EMPTY_STRING_ARRAY;
+ }
+ NERClassifierCombiner nerCombiner;
+ try {
+ // TODO: use constants for part after prefix so we can ensure consistent options
+ boolean applyNumericClassifiers =
+ PropertiesUtils.getBool(properties,
+ prefix + "applyNumericClassifiers",
+ APPLY_NUMERIC_CLASSIFIERS_DEFAULT);
+ boolean useSUTime =
+ PropertiesUtils.getBool(properties,
+ prefix + "useSUTime",
+ NumberSequenceClassifier.USE_SUTIME_DEFAULT);
+ // TODO: properties are passed in as is for number sequence classifiers (don't care about the prefix)
+ nerCombiner = new NERClassifierCombiner(applyNumericClassifiers,
+ useSUTime, properties, models);
+ } catch (IOException e) {
+ throw new RuntimeIOException(e);
+ }
+
+ return nerCombiner;
+ }
+
public boolean appliesNumericClassifiers() {
return applyNumericClassifiers;
}
@@ -180,5 +227,17 @@ public void finalizeAnnotation(Annotation annotation) {
nsc.finalizeClassification(annotation);
}
+ /** The main method. Very basic, could usefully be expanded and common code shared with other methods. */
+ public static void main(String[] args) throws Exception {
+ StringUtils.printErrInvocationString("NERClassifierCombiner", args);
+ Properties props = StringUtils.argsToProperties(args);
+ NERClassifierCombiner ncc = createNERClassifierCombiner("", props);
+
+ String textFile = props.getProperty("textFile");
+ if (textFile != null) {
+ ncc.classifyAndWriteAnswers(textFile);
+ }
+ }
+
}
diff --git a/src/edu/stanford/nlp/ie/regexp/NumberSequenceClassifier.java b/src/edu/stanford/nlp/ie/regexp/NumberSequenceClassifier.java
index 1dd3327af8..80803bba16 100644
--- a/src/edu/stanford/nlp/ie/regexp/NumberSequenceClassifier.java
+++ b/src/edu/stanford/nlp/ie/regexp/NumberSequenceClassifier.java
@@ -200,9 +200,10 @@ private List classifyWithSUTime(List tokenSequence, final
}
// everything tagged as CD is also a number
// NumberNormalizer probably catches these but let's be safe
- for(CoreLabel token: tokenSequence) {
- if(token.tag().equals("CD") &&
- token.get(CoreAnnotations.AnswerAnnotation.class).equals(flags.backgroundSymbol)){
+ // use inverted "CD".equals() because tag could be null (if no POS info available)
+ for (CoreLabel token: tokenSequence) {
+ if ("CD".equals(token.tag()) &&
+ token.get(CoreAnnotations.AnswerAnnotation.class).equals(flags.backgroundSymbol)) {
token.set(CoreAnnotations.AnswerAnnotation.class, "NUMBER");
}
}
@@ -343,9 +344,11 @@ private List runSUTime(CoreMap sentence, final CoreMap document) {
}
/**
- * Recognizes money and percents
- * This accepts currency symbols (e.g., $) both before and after numbers; but it accepts units (e.g., "dollar") only after
- * @param tokenSequence
+ * Recognizes money and percents.
+ * This accepts currency symbols (e.g., $) both before and after numbers; but it accepts units
+ * (e.g., "dollar") only after numbers.
+ *
+ * @param tokenSequence The list of tokens to find money and percents in
*/
private void moneyAndPercentRecognizer(List tokenSequence) {
for(int i = 0; i < tokenSequence.size(); i ++){
@@ -354,8 +357,8 @@ private void moneyAndPercentRecognizer(List tokenSequence) {
CoreLabel prev = (i > 0 ? tokenSequence.get(i - 1) : null);
// $5
- if(CURRENCY_SYMBOL_PATTERN.matcher(crt.word()).matches() && next != null &&
- (next.get(CoreAnnotations.AnswerAnnotation.class).equals("NUMBER") || next.tag().equals("CD"))){
+ if (CURRENCY_SYMBOL_PATTERN.matcher(crt.word()).matches() && next != null &&
+ (next.get(CoreAnnotations.AnswerAnnotation.class).equals("NUMBER") || "CD".equals(next.tag()))) {
crt.set(CoreAnnotations.AnswerAnnotation.class, "MONEY");
i = changeLeftToRight(tokenSequence, i + 1,
next.get(CoreAnnotations.AnswerAnnotation.class),
@@ -367,8 +370,8 @@ else if((CURRENCY_WORD_PATTERN.matcher(crt.word()).matches() ||
CURRENCY_SYMBOL_PATTERN.matcher(crt.word()).matches()) &&
prev != null &&
(prev.get(CoreAnnotations.AnswerAnnotation.class).equals("NUMBER") ||
- prev.tag().equals("CD")) &&
- ! leftScanFindsWeightWord(tokenSequence, i)){
+ "CD".equals(prev.tag())) &&
+ ! leftScanFindsWeightWord(tokenSequence, i)) {
crt.set(CoreAnnotations.AnswerAnnotation.class, "MONEY");
changeRightToLeft(tokenSequence, i - 1,
prev.get(CoreAnnotations.AnswerAnnotation.class),
@@ -376,11 +379,11 @@ else if((CURRENCY_WORD_PATTERN.matcher(crt.word()).matches() ||
}
// 5%, 5 percent
- else if((PERCENT_WORD_PATTERN.matcher(crt.word()).matches() ||
+ else if ((PERCENT_WORD_PATTERN.matcher(crt.word()).matches() ||
PERCENT_SYMBOL_PATTERN.matcher(crt.word()).matches()) &&
prev != null &&
(prev.get(CoreAnnotations.AnswerAnnotation.class).equals("NUMBER") ||
- prev.tag().equals("CD"))){
+ "CD".equals(prev.tag()))) {
crt.set(CoreAnnotations.AnswerAnnotation.class, "PERCENT");
changeRightToLeft(tokenSequence, i - 1,
prev.get(CoreAnnotations.AnswerAnnotation.class),
@@ -581,7 +584,13 @@ private static CoreLabel copyCoreLabel(CoreLabel src, Integer startOffset, Integ
private static final Pattern AM_PM = Pattern.compile("(a\\.?m\\.?)|(p\\.?m\\.?)", Pattern.CASE_INSENSITIVE);
public static final Pattern CURRENCY_WORD_PATTERN = Pattern.compile("(?:dollar|cent|euro|pound)s?|penny|pence|yen|yuan|won", Pattern.CASE_INSENSITIVE);
- public static final Pattern CURRENCY_SYMBOL_PATTERN = Pattern.compile("\\$|£|\u00A3|\u00A5|#|\u20AC|US\\$|HK\\$|A\\$", Pattern.CASE_INSENSITIVE);
+
+ // pattern matches: dollar, pound sign XML escapes; pound sign, yen sign, euro, won; other country dollars; now omit # for pound
+ // TODO: Delete # as currency. But doing this involves changing PTBTokenizer currency normalization rules
+ // Code \u0023 '#' was used for pound '£' in the ISO version of ASCII (ISO 646), and this is found in very old materials
+ // e.g., the 1999 Penn Treebank, but we now don't recognize this, as it now doesn't occur and wrongly recognizes
+ // currency whenever someone refers to the #4 country etc.
+ public static final Pattern CURRENCY_SYMBOL_PATTERN = Pattern.compile("\\$|#|£|£|\u00A3|\u00A5|\u20AC|\u20A9|(?:US|HK|A|C|NT|S|NZ)\\$", Pattern.CASE_INSENSITIVE); // TODO: No longer include archaic # for pound
public static final Pattern ORDINAL_PATTERN = Pattern.compile("(?i)[2-9]?1st|[2-9]?2nd|[2-9]?3rd|1[0-9]th|[2-9]?[04-9]th|100+th|zeroth|first|second|third|fourth|fifth|sixth|seventh|eighth|ninth|tenth|eleventh|twelfth|thirteenth|fourteenth|fifteenth|sixteenth|seventeenth|eighteenth|nineteenth|twentieth|twenty-first|twenty-second|twenty-third|twenty-fourth|twenty-fifth|twenty-sixth|twenty-seventh|twenty-eighth|twenty-ninth|thirtieth|thirty-first|fortieth|fiftieth|sixtieth|seventieth|eightieth|ninetieth|hundredth|thousandth|millionth");
diff --git a/src/edu/stanford/nlp/pipeline/NERCombinerAnnotator.java b/src/edu/stanford/nlp/pipeline/NERCombinerAnnotator.java
index 27127dd334..9d200e7593 100644
--- a/src/edu/stanford/nlp/pipeline/NERCombinerAnnotator.java
+++ b/src/edu/stanford/nlp/pipeline/NERCombinerAnnotator.java
@@ -2,13 +2,11 @@
import edu.stanford.nlp.ie.NERClassifierCombiner;
import edu.stanford.nlp.ie.regexp.NumberSequenceClassifier;
-import edu.stanford.nlp.io.RuntimeIOException;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.util.CoreMap;
import edu.stanford.nlp.util.PropertiesUtils;
import edu.stanford.nlp.util.RuntimeInterruptedException;
-import edu.stanford.nlp.util.StringUtils;
import java.io.IOException;
import java.util.*;
@@ -64,48 +62,11 @@ public NERCombinerAnnotator(NERClassifierCombiner ner, boolean verbose, int nThr
}
public NERCombinerAnnotator(String name, Properties properties) {
- this(createNERClassifierCombiner(name, properties), false,
+ this(NERClassifierCombiner.createNERClassifierCombiner(name, properties), false,
PropertiesUtils.getInt(properties, name + ".nthreads", PropertiesUtils.getInt(properties, "nthreads", 1)),
PropertiesUtils.getLong(properties, name + ".maxtime", -1));
}
- static NERClassifierCombiner createNERClassifierCombiner(String name, Properties properties) {
- // TODO: Move function into NERClassifierCombiner?
- String prefix = (name != null)? name + '.' : "ner.";
- String modelNames = properties.getProperty(prefix + "model");
- if (modelNames == null) {
- modelNames = DefaultPaths.DEFAULT_NER_THREECLASS_MODEL + ',' + DefaultPaths.DEFAULT_NER_MUC_MODEL + ',' + DefaultPaths.DEFAULT_NER_CONLL_MODEL;
- }
- // but modelNames can still be empty string is set explicitly to be empty!
- String[] models;
- if ( ! modelNames.isEmpty()) {
- models = modelNames.split(",");
- } else {
- // Allow for no real NER model - can just use numeric classifiers or SUTime
- System.err.println("WARNING: no NER models specified");
- models = StringUtils.EMPTY_STRING_ARRAY;
- }
- NERClassifierCombiner nerCombiner;
- try {
- // TODO: use constants for part after prefix so we can ensure consistent options
- boolean applyNumericClassifiers =
- PropertiesUtils.getBool(properties,
- prefix + "applyNumericClassifiers",
- NERClassifierCombiner.APPLY_NUMERIC_CLASSIFIERS_DEFAULT);
- boolean useSUTime =
- PropertiesUtils.getBool(properties,
- prefix + "useSUTime",
- NumberSequenceClassifier.USE_SUTIME_DEFAULT);
- // TODO: properties are passed in as is for number sequence classifiers (don't care about the prefix)
- nerCombiner = new NERClassifierCombiner(applyNumericClassifiers,
- useSUTime, properties, models);
- } catch (IOException e) {
- throw new RuntimeIOException(e);
- }
-
- return nerCombiner;
- }
-
@Override
protected int nThreads() {
return nThreads;