diff --git a/JavaNLP-core.iml b/JavaNLP-core.iml
index 07016325ce..fb08673c08 100644
--- a/JavaNLP-core.iml
+++ b/JavaNLP-core.iml
@@ -1,5 +1,5 @@
-
+
@@ -311,5 +311,4 @@
-
-
+
\ No newline at end of file
diff --git a/itest/src/edu/stanford/nlp/ie/NumberSequenceClassifierITest.java b/itest/src/edu/stanford/nlp/ie/NumberSequenceClassifierITest.java
index 47c2e401d6..47ad36b348 100644
--- a/itest/src/edu/stanford/nlp/ie/NumberSequenceClassifierITest.java
+++ b/itest/src/edu/stanford/nlp/ie/NumberSequenceClassifierITest.java
@@ -36,30 +36,30 @@ private static void checkLabels(StanfordCoreNLP pipe, String text, String [] lab
     CoreMap sent = doc.get(CoreAnnotations.SentencesAnnotation.class).get(0);
     assertTrue(sent.get(CoreAnnotations.TokensAnnotation.class) != null);
     List tokens = sent.get(CoreAnnotations.TokensAnnotation.class);
-    if(VERBOSE){
+    if (VERBOSE) {
       for(CoreLabel token: tokens) {
-        System.out.println("\t" + token.word() + " " +
-            token.tag() + " " +
-            token.ner() + " " +
+        System.out.println('\t' + token.word() + ' ' +
+            token.tag() + ' ' +
+            token.ner() + ' ' +
             (token.containsKey(CoreAnnotations.NumericCompositeTypeAnnotation.class) ? token.get(CoreAnnotations.NumericCompositeValueAnnotation.class) + " " : "") +
-            (token.containsKey(TimeAnnotations.TimexAnnotation.class) ? token.get(TimeAnnotations.TimexAnnotation.class) + " " : "")); 
+            (token.containsKey(TimeAnnotations.TimexAnnotation.class) ? token.get(TimeAnnotations.TimexAnnotation.class) + " " : ""));
       }
     }
-    
+
     // check NER labels
     assertTrue(tokens.size() == labels.length);
-    for(int i = 0; i < labels.length; i ++){
+    for (int i = 0; i < labels.length; i ++) {
       if(labels[i] == null){
         assertTrue(tokens.get(i).ner() == null);
       } else {
         Pattern p = Pattern.compile(labels[i]);
         System.err.println("COMPARING NER " + labels[i] + " with " + tokens.get(i).ner());
         System.err.flush();
-        assertTrue(tokens.get(i).ner() != null);
+        assertTrue("NER should not be null for token " + tokens.get(i) + " in sentence " + tokens, tokens.get(i).ner() != null);
         assertTrue(tokens.get(i).ner() + " does not match " + p + " for token " + tokens.get(i) + " in sentence " + tokens, p.matcher(tokens.get(i).ner()).matches());
       }
     }
-    
+
     // check normalized values, if gold is given
     if(normed != null){
       assertTrue(tokens.size() == normed.length);
@@ -70,8 +70,8 @@ private static void checkLabels(StanfordCoreNLP pipe, String text, String [] lab
         Pattern p = Pattern.compile(normed[i]);
         String n = tokens.get(i).get(CoreAnnotations.NormalizedNamedEntityTagAnnotation.class);
         String message = "COMPARING NORMED \"" + normed[i] + "\" with \"" + n + "\"";
-        assertTrue(message, n != null);
-        assertTrue(message, p.matcher(n).matches());
+        assertTrue(message + "; latter should not be null", n != null);
+        assertTrue(message + "; latter should match", p.matcher(n).matches());
       }
     }
   }
@@ -83,8 +83,8 @@ private static void run(String header, String [] texts, String [][] answers, Str
       if(VERBOSE) {
         System.out.println("Running test " + header + " for text: " + texts[i]);
       }
-      checkLabels(pipe, 
-                  texts[i], 
+      checkLabels(pipe,
+                  texts[i],
                   answers[i],
                   normed != null ? normed[i] : null);
     }
@@ -100,10 +100,10 @@ private static void run(String header, String [] texts, String [][] answers, Str
     "It cost four million dollars",
     "It cost $1m",
     "It cost 50 cents",
-    "It cost # 1500",
+    "It cost £ 1500",
     "It cost \u00A3 1500",
     "It cost \u00A3 .50",
-    "It cost # .50",
+    "It cost € .50",
     "It cost $ 1500",
     "It cost $1500",
     "It cost $ 1,500",
@@ -111,8 +111,10 @@
     "It cost $48.75",
     "It cost $ 57.60",
     "It cost $8 thousand",
-    "It cost $42,33"
+    "It cost $42,33",
+//    "It cost ₩1500", // TODO: Add won symbol to PTBTokenizer
   };
+
   private static final String [][] moneyAnswers = {
     { null, null, "MONEY", "MONEY" },
     { null, null, "MONEY", "MONEY" },
@@ -134,8 +136,10 @@ private static void run(String header, String [] texts, String [][] answers, Str
     { null, null, "MONEY", "MONEY" },
     { null, null, "MONEY", "MONEY" },
     { null, null, "MONEY", "MONEY", "MONEY" },
-    { null, null, "MONEY", "MONEY" }
+    { null, null, "MONEY", "MONEY" },
+//    { null, null, "MONEY", "MONEY" },
   };
+
   private static final String [][] moneyNormed = {
     { null, null, "\\$5.0", "\\$5.0" },
     { null, null, "\\$0.24", "\\$0.24" },
@@ -149,7 +153,7 @@
     { null, null, "\u00A31500.0", "\u00A31500.0" },
     { null, null, "\u00A31500.0", "\u00A31500.0" },
     { null, null, "\u00A30.5", "\u00A30.5" },
-    { null, null, "\u00A30.5", "\u00A30.5" },
+    { null, null, "\\$0.5", "\\$0.5" }, // TODO: Fix PTBTokenizer to really normalize it to Euro €
     { null, null, "\\$1500.0", "\\$1500.0" },
     { null, null, "\\$1500.0", "\\$1500.0" },
     { null, null, "\\$1500.0", "\\$1500.0" },
@@ -157,8 +161,10 @@
     { null, null, "\\$48.75", "\\$48.75" },
     { null, null, "\\$57.6", "\\$57.6" },
     { null, null, "\\$8000.0", "\\$8000.0", "\\$8000.0" },
-    { null, null, "\\$4233.0", "\\$4233.0" }
+    { null, null, "\\$4233.0", "\\$4233.0" },
+//    { null, null, "₩4233.0", "₩4233.0" },
   };
+
   public void testMoney() {
     run("MONEY", moneyStrings, moneyAnswers, moneyNormed);
   }
@@ -185,7 +191,7 @@ public void testMoney() {
     { null, null, null, "1000.0", null },
   };
   public void testOrdinal() {
-    run("ORDINAL", ordinalStrings, ordinalAnswers, ordinalNormed); 
+    run("ORDINAL", ordinalStrings, ordinalAnswers, ordinalNormed);
   }

   private static final String [] dateStrings = {
@@ -243,7 +249,7 @@ public void testOrdinal() {
     { "2008-06-06" , "2008-06-06", "2008-06-06", null, "2008-06-07" , "2008-06-07", "2008-06-07" },
   };
   public void testDate() {
-    run("DATE", dateStrings, dateAnswers, dateNormed); 
+    run("DATE", dateStrings, dateAnswers, dateNormed);
   }

   private static final String [] numberStrings = {
@@ -280,9 +286,9 @@ public void testDate() {
     { "801.0", null, "123.0", null }
   };
   public void testNumber() {
-    run("NUMBER", numberStrings, numberAnswers, numberNormed); 
+    run("NUMBER", numberStrings, numberAnswers, numberNormed);
   }
-  
+
   private static final String [] timeStrings = {
     "the time was 10:20",
     "12:29 p.m.",
diff --git a/itest/src/edu/stanford/nlp/pipeline/NERCombinerAnnotatorITest.java b/itest/src/edu/stanford/nlp/pipeline/NERCombinerAnnotatorITest.java
index fd56fc70e1..1030faa4c9 100644
--- a/itest/src/edu/stanford/nlp/pipeline/NERCombinerAnnotatorITest.java
+++ b/itest/src/edu/stanford/nlp/pipeline/NERCombinerAnnotatorITest.java
@@ -13,7 +13,7 @@
 import java.util.Properties;

-/** 
+/**
  * @author Angel Chang
  * @author John Bauer
  */
@@ -23,9 +23,9 @@ public class NERCombinerAnnotatorITest extends TestCase {
   public static final String NER_7CLASS = DefaultPaths.DEFAULT_NER_MUC_MODEL;
   public static final String NER_MISCCLASS = DefaultPaths.DEFAULT_NER_CONLL_MODEL;
-  static NERCombinerAnnotator nerAnnotator = null;
-  static AnnotationPipeline unthreadedPipeline = null;
-  static AnnotationPipeline threaded4Pipeline = null;
+  private static NERCombinerAnnotator nerAnnotator = null;
+  private static AnnotationPipeline unthreadedPipeline = null;
+  private static AnnotationPipeline threaded4Pipeline = null;

   /**
    * Creates the tagger annotator if it isn't already created
   */
@@ -42,7 +42,7 @@ public void setUp()
     props.setProperty("ner.applyNumericClassifiers", "false");
     props.setProperty("ner.useSUTime", "false");
     props.setProperty("ner.model", NER_3CLASS);
-    NERClassifierCombiner ner = NERCombinerAnnotator.createNERClassifierCombiner("ner", props);
+    NERClassifierCombiner ner = NERClassifierCombiner.createNERClassifierCombiner("ner", props);
     NERCombinerAnnotator threaded4Annotator = new NERCombinerAnnotator(ner, false, 4, -1);

     threaded4Pipeline = new AnnotationPipeline();
@@ -75,7 +75,7 @@ public void testThreadedAnnotator() {
     verifyAnswers(ANSWERS, document);
   }

-  public void verifyAnswers(String[][] expected, Annotation document) {
+  public static void verifyAnswers(String[][] expected, Annotation document) {
     int sentenceIndex = 0;
     for (CoreMap sentence : document.get(CoreAnnotations.SentencesAnnotation.class)) {
       List tokens = sentence.get(CoreAnnotations.TokensAnnotation.class);
@@ -100,7 +100,7 @@ public void verifyAnswers(String[][] expected, Annotation document) {

   private static Iterator getTestData(String inputString, boolean includeAnswer) {
-    ColumnTabDocumentReaderWriter colReader = new ColumnTabDocumentReaderWriter();
+    ColumnTabDocumentReaderWriter colReader = new ColumnTabDocumentReaderWriter<>();
     if (includeAnswer) {
       colReader.init("word=0,tag=1,answer=2");
     } else {
diff --git a/src/edu/stanford/nlp/ie/AbstractSequenceClassifier.java b/src/edu/stanford/nlp/ie/AbstractSequenceClassifier.java
index 4b9e541980..7de1948549 100644
--- a/src/edu/stanford/nlp/ie/AbstractSequenceClassifier.java
+++ b/src/edu/stanford/nlp/ie/AbstractSequenceClassifier.java
@@ -108,8 +108,7 @@ public DocumentReaderAndWriter plainTextReaderAndWriter() {
    * Construct a SeqClassifierFlags object based on the passed in properties,
    * and then call the other constructor.
    *
-   * @param props
-   *          See SeqClassifierFlags for known properties.
+   * @param props See SeqClassifierFlags for known properties.
    */
   public AbstractSequenceClassifier(Properties props) {
     this(new SeqClassifierFlags(props));
   }
@@ -124,14 +123,13 @@ public AbstractSequenceClassifier(Properties props) {
   public AbstractSequenceClassifier(SeqClassifierFlags flags) {
     this.flags = flags;

-    // try {
     // Thang Sep13: allow for multiple feature factories.
     this.featureFactories = Generics.newArrayList();
     if (flags.featureFactory != null) {
       FeatureFactory factory = new MetaClass(flags.featureFactory).createInstance(flags.featureFactoryArgs); // for compatibility
       featureFactories.add(factory);
     }
-    if(flags.featureFactories!=null){
+    if (flags.featureFactories != null) {
       for (int i = 0; i < flags.featureFactories.length; i++) {
         FeatureFactory indFeatureFactory = new MetaClass(flags.featureFactories[i]).
             createInstance(flags.featureFactoriesArgs.get(i));
@@ -142,11 +140,7 @@ public AbstractSequenceClassifier(SeqClassifierFlags flags) {
       tokenFactory = (CoreTokenFactory) new CoreLabelTokenFactory();
     } else {
       this.tokenFactory = new MetaClass(flags.tokenFactory).createInstance(flags.tokenFactoryArgs);
-      // this.tokenFactory = (CoreTokenFactory) Class.forName(flags.tokenFactory).newInstance();
     }
-    // } catch (Exception e) {
-    //   throw new RuntimeException(e);
-    // }
     pad = tokenFactory.makeToken();
     windowSize = flags.maxLeft + 1;
     reinit();
   }
@@ -281,7 +275,7 @@ public List classifySentence(List sentence) {
       i++;
     }

-    // TODO get rid of objectbankwrapper
+    // TODO get rid of ObjectBankWrapper
     ObjectBankWrapper wrapper = new ObjectBankWrapper(flags, null, knownLCWords);
     wrapper.processDocument(document);

@@ -320,7 +314,7 @@ public List classifySentenceWithGlobalInformation(List to
       i++;
     }

-    // TODO get rid of objectbankwrapper
+    // TODO get rid of ObjectBankWrapper
     ObjectBankWrapper wrapper = new ObjectBankWrapper(flags, null, knownLCWords);
     wrapper.processDocument(document);

@@ -402,7 +396,7 @@ public DFSA getViterbiSearchGraph(List doc, Class(null);
     }
-    // TODO get rid of objectbankwrapper
+    // TODO get rid of ObjectBankWrapper
     ObjectBankWrapper obw = new ObjectBankWrapper(flags, null, knownLCWords);
     doc = obw.processDocument(doc);
     SequenceModel model = getSequenceModel(doc);
@@ -441,8 +435,7 @@ public List> classify(String str) {
    * Classify the tokens in a String. Each sentence becomes a separate document.
    * Doesn't override default readerAndWriter.
    *
-   * @param str
-   *          A String with tokens in one or more sentences of text to be
+   * @param str A String with tokens in one or more sentences of text to be
    *          classified.
    * @return {@link List} of classified sentences (each a List of something that
    *         extends {@link CoreMap}).
@@ -563,7 +556,7 @@ public String classifyToString(String sentences, String outputFormat, boolean pr
         plainTextReaderAndWriter.printAnswers(docOutput, pw);
         pw.flush();
         sb.append(sw.toString());
-        sb.append("\n");
+        sb.append('\n');
       }
     }
     return sb.toString();
@@ -791,8 +784,7 @@ public void train(String[] trainFileList,
    * Trains a classifier from a Collection of sequences.
    * Note that the Collection can be (and usually is) an ObjectBank.
    *
-   * @param docs
-   *          An Objectbank or a collection of sequences of IN
+   * @param docs An ObjectBank or a collection of sequences of IN
    */
   public void train(Collection> docs) {
     train(docs, defaultReaderAndWriter);
   }
@@ -802,10 +794,8 @@ public void train(Collection> docs) {
    * Trains a classifier from a Collection of sequences.
    * Note that the Collection can be (and usually is) an ObjectBank.
    *
-   * @param docs
-   *          An ObjectBank or a collection of sequences of IN
-   * @param readerAndWriter
-   *          A DocumentReaderAndWriter to use when loading test files
+   * @param docs An ObjectBank or a collection of sequences of IN
+   * @param readerAndWriter A DocumentReaderAndWriter to use when loading test files
    */
   public abstract void train(Collection> docs,
                              DocumentReaderAndWriter readerAndWriter);

@@ -858,9 +848,8 @@ public ObjectBank> makeObjectBankFromFiles(String[] trainFileList,
         File f = new File(trainFile);
         files.add(f);
       }
-      // System.err.printf("trainFileList contains %d file%s.\n", files.size(),
-      // files.size() == 1 ? "": "s");
-      // TODO get rid of objectbankwrapper
+      // System.err.printf("trainFileList contains %d file%s in encoding %s.%n", files.size(), files.size() == 1 ? "": "s", flags.inputEncoding);
+      // TODO get rid of ObjectBankWrapper
       // return new ObjectBank>(new
       // ResettableReaderIteratorFactory(files), readerAndWriter);
       return new ObjectBankWrapper(flags, new ObjectBank>(new ResettableReaderIteratorFactory(files, flags.inputEncoding),
@@ -892,7 +881,7 @@ public ObjectBank> makeObjectBankFromFiles(String baseDir, String fileP
     // return new ObjectBank>(new
     // ResettableReaderIteratorFactory(files, flags.inputEncoding),
     // readerAndWriter);
-    // TODO get rid of objectbankwrapper
+    // TODO get rid of ObjectBankWrapper
     return new ObjectBankWrapper(flags, new ObjectBank>(new ResettableReaderIteratorFactory(files, flags.inputEncoding),
         readerAndWriter), knownLCWords);
   }
@@ -905,7 +894,7 @@ public ObjectBank> makeObjectBankFromFiles(Collection files,
     // return new ObjectBank>(new
     // ResettableReaderIteratorFactory(files, flags.inputEncoding),
     // readerAndWriter);
-    // TODO get rid of objectbankwrapper
+    // TODO get rid of ObjectBankWrapper
     return new ObjectBankWrapper(flags, new ObjectBank>(new ResettableReaderIteratorFactory(files, flags.inputEncoding),
         readerAndWriter), knownLCWords);
   }
@@ -929,7 +918,7 @@ public ObjectBank> makeObjectBankFromReader(BufferedReader in,
     if (flags.announceObjectBankEntries) {
       System.err.println("Reading data using " + readerAndWriter.getClass());
     }
-    // TODO get rid of objectbankwrapper
+    // TODO get rid of ObjectBankWrapper
     // return new ObjectBank>(new ResettableReaderIteratorFactory(in),
    // readerAndWriter);
    return new ObjectBankWrapper(flags, new ObjectBank>(new ResettableReaderIteratorFactory(in),
@@ -956,8 +945,7 @@ public void printProbs(String filename,
    * Takes a {@link List} of documents and prints the likelihood of each
    * possible label at each point.
    *
-   * @param documents
-   *          A {@link List} of {@link List} of something that extends
+   * @param documents A {@link List} of {@link List} of something that extends
    *          {@link CoreMap}.
    */
  public void printProbsDocuments(ObjectBank> documents) {
@@ -1076,9 +1064,9 @@ public void classifyAndWriteAnswers(Collection> documents,
     Timing timer = new Timing();

-    Counter entityTP = new ClassicCounter();
-    Counter entityFP = new ClassicCounter();
-    Counter entityFN = new ClassicCounter();
+    Counter entityTP = new ClassicCounter<>();
+    Counter entityFP = new ClassicCounter<>();
+    Counter entityFN = new ClassicCounter<>();
     boolean resultsCounted = outputScores;
     int numWords = 0;
     int numDocs = 0;
@@ -1150,7 +1138,7 @@ public ThreadsafeProcessor, List> newInstance() {
    *
    * @param testFile The name of the file to test on.
    * @param k How many best to print
-   * @param readerAndWriter
+   * @param readerAndWriter Class to be used for printing answers
    */
   public void classifyAndWriteAnswersKBest(String testFile, int k,
                                            DocumentReaderAndWriter readerAndWriter)
diff --git a/src/edu/stanford/nlp/ie/ClassifierCombiner.java b/src/edu/stanford/nlp/ie/ClassifierCombiner.java
index 05e4859196..73f12e167a 100644
--- a/src/edu/stanford/nlp/ie/ClassifierCombiner.java
+++ b/src/edu/stanford/nlp/ie/ClassifierCombiner.java
@@ -101,28 +101,39 @@ else if((loadPath1 = p.getProperty("loadClassifier")) != null && (loadPath2 = p.
     }
   }

-  /** Loads a series of base classifiers from the paths specified.
+  /** Loads a series of base classifiers from the paths specified using the
+   *  Properties specified.
    *
-   * @param loadPaths Paths to the base classifiers
-   * @throws FileNotFoundException If classifier files not found
+   * @param props Properties for the classifier to use (encodings, output format, etc.)
+   * @param combinationMode How to handle multiple classifiers specifying the same entity type
+   * @param loadPaths Paths to the base classifiers
+   * @throws IOException If IO errors in loading classifier files
    */
-  public ClassifierCombiner(CombinationMode combinationMode, String... loadPaths) throws IOException {
-    super(new Properties());
+  public ClassifierCombiner(Properties props, CombinationMode combinationMode, String... loadPaths) throws IOException {
+    super(props);
     this.combinationMode = combinationMode;
     List paths = new ArrayList<>(Arrays.asList(loadPaths));
     loadClassifiers(paths);
   }

+  /** Loads a series of base classifiers from the paths specified, using the
+   *  default Properties.
+   *
+   * @param combinationMode How to handle multiple classifiers specifying the same entity type
+   * @param loadPaths Paths to the base classifiers
+   * @throws IOException If IO errors in loading classifier files
+   */
+  public ClassifierCombiner(CombinationMode combinationMode, String... loadPaths) throws IOException {
+    this(new Properties(), combinationMode, loadPaths);
+  }
+
   /** Loads a series of base classifiers from the paths specified.
    *
    * @param loadPaths Paths to the base classifiers
    * @throws FileNotFoundException If classifier files not found
    */
   public ClassifierCombiner(String... loadPaths) throws IOException {
-    super(new Properties());
-    this.combinationMode = DEFAULT_COMBINATION_MODE;
-    List paths = new ArrayList<>(Arrays.asList(loadPaths));
-    loadClassifiers(paths);
+    this(DEFAULT_COMBINATION_MODE, loadPaths);
   }

diff --git a/src/edu/stanford/nlp/ie/NERClassifierCombiner.java b/src/edu/stanford/nlp/ie/NERClassifierCombiner.java
index f63a266fe3..e547db3fed 100644
--- a/src/edu/stanford/nlp/ie/NERClassifierCombiner.java
+++ b/src/edu/stanford/nlp/ie/NERClassifierCombiner.java
@@ -5,9 +5,11 @@
 import java.util.Properties;

 import edu.stanford.nlp.ie.regexp.NumberSequenceClassifier;
+import edu.stanford.nlp.io.RuntimeIOException;
 import edu.stanford.nlp.ling.CoreLabel;
 import edu.stanford.nlp.ling.CoreAnnotations;
 import edu.stanford.nlp.pipeline.Annotation;
+import edu.stanford.nlp.pipeline.DefaultPaths;
 import edu.stanford.nlp.util.CoreMap;
 import edu.stanford.nlp.util.PropertiesUtils;
 import edu.stanford.nlp.util.RuntimeInterruptedException;
@@ -64,7 +66,7 @@ public NERClassifierCombiner(boolean applyNumericClassifiers,
                                String... loadPaths)
     throws IOException {
-    super(ClassifierCombiner.extractCombinationModeSafe(nscProps), loadPaths);
+    super(nscProps, ClassifierCombiner.extractCombinationModeSafe(nscProps), loadPaths);
     this.applyNumericClassifiers = applyNumericClassifiers;
     this.useSUTime = useSUTime;
     this.nsc = new NumberSequenceClassifier(new Properties(), useSUTime, nscProps);
@@ -89,6 +91,51 @@ public NERClassifierCombiner(boolean applyNumericClassifiers,
     this.nsc = new NumberSequenceClassifier(useSUTime);
   }

+  /** This factory method is used to create the NERClassifierCombiner used in NERCombinerAnnotator
+   *  (and, thence, in StanfordCoreNLP).
+   *
+   * @param name A "x.y" format property name prefix (the "x" part). This is commonly null,
+   *             and then "ner" is used. If it is the empty string, then no property prefix is used.
+   * @param properties Various properties, including a list in "ner.model".
+   *             The used ones start with name + "."
+   * @return An NERClassifierCombiner with the given properties
+   */
+  public static NERClassifierCombiner createNERClassifierCombiner(String name, Properties properties) {
+    String prefix = (name != null)? name + '.' : "ner.";
+    String modelNames = properties.getProperty(prefix + "model");
+    if (modelNames == null) {
+      modelNames = DefaultPaths.DEFAULT_NER_THREECLASS_MODEL + ',' + DefaultPaths.DEFAULT_NER_MUC_MODEL + ',' + DefaultPaths.DEFAULT_NER_CONLL_MODEL;
+    }
+    // but modelNames can still be empty string if set explicitly to be empty!
+    String[] models;
+    if ( ! modelNames.isEmpty()) {
+      models = modelNames.split(",");
+    } else {
+      // Allow for no real NER model - can just use numeric classifiers or SUTime
+      System.err.println("WARNING: no NER models specified");
+      models = StringUtils.EMPTY_STRING_ARRAY;
+    }
+    NERClassifierCombiner nerCombiner;
+    try {
+      // TODO: use constants for part after prefix so we can ensure consistent options
+      boolean applyNumericClassifiers =
+          PropertiesUtils.getBool(properties,
+              prefix + "applyNumericClassifiers",
+              APPLY_NUMERIC_CLASSIFIERS_DEFAULT);
+      boolean useSUTime =
+          PropertiesUtils.getBool(properties,
+              prefix + "useSUTime",
+              NumberSequenceClassifier.USE_SUTIME_DEFAULT);
+      // TODO: properties are passed in as is for number sequence classifiers (don't care about the prefix)
+      nerCombiner = new NERClassifierCombiner(applyNumericClassifiers,
+          useSUTime, properties, models);
+    } catch (IOException e) {
+      throw new RuntimeIOException(e);
+    }
+
+    return nerCombiner;
+  }
+
   public boolean appliesNumericClassifiers() {
     return applyNumericClassifiers;
   }
@@ -180,5 +227,17 @@ public void finalizeAnnotation(Annotation annotation) {
     nsc.finalizeClassification(annotation);
   }

+  /** The main method. Very basic, could usefully be expanded and common code shared with other methods. */
+  public static void main(String[] args) throws Exception {
+    StringUtils.printErrInvocationString("NERClassifierCombiner", args);
+    Properties props = StringUtils.argsToProperties(args);
+    NERClassifierCombiner ncc = createNERClassifierCombiner("", props);
+
+    String textFile = props.getProperty("textFile");
+    if (textFile != null) {
+      ncc.classifyAndWriteAnswers(textFile);
+    }
+  }
+
 }
diff --git a/src/edu/stanford/nlp/ie/regexp/NumberSequenceClassifier.java b/src/edu/stanford/nlp/ie/regexp/NumberSequenceClassifier.java
index 1dd3327af8..80803bba16 100644
--- a/src/edu/stanford/nlp/ie/regexp/NumberSequenceClassifier.java
+++ b/src/edu/stanford/nlp/ie/regexp/NumberSequenceClassifier.java
@@ -200,9 +200,10 @@ private List classifyWithSUTime(List tokenSequence, final
     }
     // everything tagged as CD is also a number
     // NumberNormalizer probably catches these but let's be safe
-    for(CoreLabel token: tokenSequence) {
-      if(token.tag().equals("CD") &&
-          token.get(CoreAnnotations.AnswerAnnotation.class).equals(flags.backgroundSymbol)){
+    // use inverted "CD".equals() because tag could be null (if no POS info available)
+    for (CoreLabel token: tokenSequence) {
+      if ("CD".equals(token.tag()) &&
+          token.get(CoreAnnotations.AnswerAnnotation.class).equals(flags.backgroundSymbol)) {
         token.set(CoreAnnotations.AnswerAnnotation.class, "NUMBER");
       }
     }
@@ -343,9 +344,11 @@ private List runSUTime(CoreMap sentence, final CoreMap document) {
   }

   /**
-   * Recognizes money and percents
-   * This accepts currency symbols (e.g., $) both before and after numbers; but it accepts units (e.g., "dollar") only after
-   * @param tokenSequence
+   * Recognizes money and percents.
+   * This accepts currency symbols (e.g., $) both before and after numbers; but it accepts units
+   * (e.g., "dollar") only after numbers.
+   *
+   * @param tokenSequence The list of tokens to find money and percents in
    */
   private void moneyAndPercentRecognizer(List tokenSequence) {
     for(int i = 0; i < tokenSequence.size(); i ++){
@@ -354,8 +357,8 @@ private void moneyAndPercentRecognizer(List tokenSequence) {
       CoreLabel prev = (i > 0 ? tokenSequence.get(i - 1) : null);

       // $5
-      if(CURRENCY_SYMBOL_PATTERN.matcher(crt.word()).matches() && next != null &&
-          (next.get(CoreAnnotations.AnswerAnnotation.class).equals("NUMBER") || next.tag().equals("CD"))){
+      if (CURRENCY_SYMBOL_PATTERN.matcher(crt.word()).matches() && next != null &&
+          (next.get(CoreAnnotations.AnswerAnnotation.class).equals("NUMBER") || "CD".equals(next.tag()))) {
         crt.set(CoreAnnotations.AnswerAnnotation.class, "MONEY");
         i = changeLeftToRight(tokenSequence, i + 1,
             next.get(CoreAnnotations.AnswerAnnotation.class),
@@ -367,8 +370,8 @@ else if((CURRENCY_WORD_PATTERN.matcher(crt.word()).matches() ||
           CURRENCY_SYMBOL_PATTERN.matcher(crt.word()).matches()) && prev != null &&
           (prev.get(CoreAnnotations.AnswerAnnotation.class).equals("NUMBER") ||
-          prev.tag().equals("CD")) &&
-          ! leftScanFindsWeightWord(tokenSequence, i)){
+          "CD".equals(prev.tag())) &&
+          ! leftScanFindsWeightWord(tokenSequence, i)) {
         crt.set(CoreAnnotations.AnswerAnnotation.class, "MONEY");
         changeRightToLeft(tokenSequence, i - 1,
             prev.get(CoreAnnotations.AnswerAnnotation.class),
@@ -376,11 +379,11 @@ else if((CURRENCY_WORD_PATTERN.matcher(crt.word()).matches() ||
       }

       // 5%, 5 percent
-      else if((PERCENT_WORD_PATTERN.matcher(crt.word()).matches() ||
+      else if ((PERCENT_WORD_PATTERN.matcher(crt.word()).matches() ||
           PERCENT_SYMBOL_PATTERN.matcher(crt.word()).matches()) &&
           prev != null &&
           (prev.get(CoreAnnotations.AnswerAnnotation.class).equals("NUMBER") ||
-          prev.tag().equals("CD"))){
+          "CD".equals(prev.tag()))) {
         crt.set(CoreAnnotations.AnswerAnnotation.class, "PERCENT");
         changeRightToLeft(tokenSequence, i - 1,
             prev.get(CoreAnnotations.AnswerAnnotation.class),
@@ -581,7 +584,13 @@ private static CoreLabel copyCoreLabel(CoreLabel src, Integer startOffset, Integ
   private static final Pattern AM_PM = Pattern.compile("(a\\.?m\\.?)|(p\\.?m\\.?)", Pattern.CASE_INSENSITIVE);

   public static final Pattern CURRENCY_WORD_PATTERN = Pattern.compile("(?:dollar|cent|euro|pound)s?|penny|pence|yen|yuan|won", Pattern.CASE_INSENSITIVE);
-  public static final Pattern CURRENCY_SYMBOL_PATTERN = Pattern.compile("\\$|&#163;|\u00A3|\u00A5|#|\u20AC|US\\$|HK\\$|A\\$", Pattern.CASE_INSENSITIVE);
+
+  // pattern matches: dollar, pound sign XML escapes; pound sign, yen sign, euro, won; other country dollars; now omit # for pound
+  // TODO: Delete # as currency. But doing this involves changing PTBTokenizer currency normalization rules
+  // Code \u0023 '#' was used for pound '£' in the ISO version of ASCII (ISO 646), and this is found in very old materials
+  // e.g., the 1999 Penn Treebank, but we now don't recognize this, as it now doesn't occur and wrongly recognizes
+  // currency whenever someone refers to the #4 country etc.
+  public static final Pattern CURRENCY_SYMBOL_PATTERN = Pattern.compile("\\$|#|&#163;|&pound;|\u00A3|\u00A5|\u20AC|\u20A9|(?:US|HK|A|C|NT|S|NZ)\\$", Pattern.CASE_INSENSITIVE); // TODO: No longer include archaic # for pound

   public static final Pattern ORDINAL_PATTERN = Pattern.compile("(?i)[2-9]?1st|[2-9]?2nd|[2-9]?3rd|1[0-9]th|[2-9]?[04-9]th|100+th|zeroth|first|second|third|fourth|fifth|sixth|seventh|eighth|ninth|tenth|eleventh|twelfth|thirteenth|fourteenth|fifteenth|sixteenth|seventeenth|eighteenth|nineteenth|twentieth|twenty-first|twenty-second|twenty-third|twenty-fourth|twenty-fifth|twenty-sixth|twenty-seventh|twenty-eighth|twenty-ninth|thirtieth|thirty-first|fortieth|fiftieth|sixtieth|seventieth|eightieth|ninetieth|hundredth|thousandth|millionth");
diff --git a/src/edu/stanford/nlp/pipeline/NERCombinerAnnotator.java b/src/edu/stanford/nlp/pipeline/NERCombinerAnnotator.java
index 27127dd334..9d200e7593 100644
--- a/src/edu/stanford/nlp/pipeline/NERCombinerAnnotator.java
+++ b/src/edu/stanford/nlp/pipeline/NERCombinerAnnotator.java
@@ -2,13 +2,11 @@

 import edu.stanford.nlp.ie.NERClassifierCombiner;
 import edu.stanford.nlp.ie.regexp.NumberSequenceClassifier;
-import edu.stanford.nlp.io.RuntimeIOException;
 import edu.stanford.nlp.ling.CoreAnnotations;
 import edu.stanford.nlp.ling.CoreLabel;
 import edu.stanford.nlp.util.CoreMap;
 import edu.stanford.nlp.util.PropertiesUtils;
 import edu.stanford.nlp.util.RuntimeInterruptedException;
-import edu.stanford.nlp.util.StringUtils;

 import java.io.IOException;
 import java.util.*;
@@ -64,48 +62,11 @@ public NERCombinerAnnotator(NERClassifierCombiner ner, boolean verbose, int nThr
   }

   public NERCombinerAnnotator(String name, Properties properties) {
-    this(createNERClassifierCombiner(name, properties), false,
+    this(NERClassifierCombiner.createNERClassifierCombiner(name, properties), false,
          PropertiesUtils.getInt(properties, name + ".nthreads",
                                 PropertiesUtils.getInt(properties, "nthreads", 1)),
          PropertiesUtils.getLong(properties, name + ".maxtime", -1));
   }

-  static NERClassifierCombiner createNERClassifierCombiner(String name, Properties properties) {
-    // TODO: Move function into NERClassifierCombiner?
-    String prefix = (name != null)? name + '.' : "ner.";
-    String modelNames = properties.getProperty(prefix + "model");
-    if (modelNames == null) {
-      modelNames = DefaultPaths.DEFAULT_NER_THREECLASS_MODEL + ',' + DefaultPaths.DEFAULT_NER_MUC_MODEL + ',' + DefaultPaths.DEFAULT_NER_CONLL_MODEL;
-    }
-    // but modelNames can still be empty string is set explicitly to be empty!
-    String[] models;
-    if ( ! modelNames.isEmpty()) {
-      models = modelNames.split(",");
-    } else {
-      // Allow for no real NER model - can just use numeric classifiers or SUTime
-      System.err.println("WARNING: no NER models specified");
-      models = StringUtils.EMPTY_STRING_ARRAY;
-    }
-    NERClassifierCombiner nerCombiner;
-    try {
-      // TODO: use constants for part after prefix so we can ensure consistent options
-      boolean applyNumericClassifiers =
-          PropertiesUtils.getBool(properties,
-              prefix + "applyNumericClassifiers",
-              NERClassifierCombiner.APPLY_NUMERIC_CLASSIFIERS_DEFAULT);
-      boolean useSUTime =
-          PropertiesUtils.getBool(properties,
-              prefix + "useSUTime",
-              NumberSequenceClassifier.USE_SUTIME_DEFAULT);
-      // TODO: properties are passed in as is for number sequence classifiers (don't care about the prefix)
-      nerCombiner = new NERClassifierCombiner(applyNumericClassifiers,
-          useSUTime, properties, models);
-    } catch (IOException e) {
-      throw new RuntimeIOException(e);
-    }
-
-    return nerCombiner;
-  }
-
   @Override
   protected int nThreads() {
     return nThreads;
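
Reviewer note on the factory relocation above: a minimal usage sketch, not part of the patch, showing how the relocated NERClassifierCombiner.createNERClassifierCombiner factory and the NERCombinerAnnotator constructor exercised in the threaded integration test fit together. The property keys (ner.model, ner.applyNumericClassifiers, ner.useSUTime) are taken from the hunks above; the model path and the example class name are made-up placeholders.

import java.util.Properties;

import edu.stanford.nlp.ie.NERClassifierCombiner;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.NERCombinerAnnotator;

public class NerCombinerFactoryExample {
  public static void main(String[] args) throws Exception {
    Properties props = new Properties();
    // "ner.model" may list several serialized classifiers separated by commas;
    // if the property is absent, the factory falls back to the three default models wired in above.
    props.setProperty("ner.model", "/path/to/english.all.3class.crf.ser.gz");  // placeholder path
    props.setProperty("ner.applyNumericClassifiers", "false");
    props.setProperty("ner.useSUTime", "false");

    // After this patch the factory lives on NERClassifierCombiner itself, so callers
    // other than NERCombinerAnnotator (e.g., the new main method) can reuse it.
    NERClassifierCombiner ner = NERClassifierCombiner.createNERClassifierCombiner("ner", props);

    // Same constructor the threaded integration test uses: verbose=false, 4 threads, no time limit.
    NERCombinerAnnotator annotator = new NERCombinerAnnotator(ner, false, 4, -1);

    // A tokenize/ssplit/pos pipeline would normally populate the Annotation before NER runs.
    Annotation doc = new Annotation("Barack Obama visited Stanford in 2014 .");
    // annotator.annotate(doc);
  }
}

The same factory is reachable from the command line through the new main method, which reads the textFile property per the hunk above.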
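
Likewise, a hedged sketch of the reworked constructor chaining in ClassifierCombiner.java: the Properties-taking constructor is now the primary one and the older signatures delegate to it. The constructor signatures come from the diff; the model paths are placeholders, and CombinationMode.NORMAL is an assumed constant name that does not itself appear in this diff.

import java.util.Properties;

import edu.stanford.nlp.ie.ClassifierCombiner;

public class ClassifierCombinerChainingExample {
  public static void main(String[] args) throws Exception {
    // Placeholder paths to serialized base classifiers.
    String model1 = "/path/to/ner.model.1.ser.gz";
    String model2 = "/path/to/ner.model.2.ser.gz";

    // Old signature: still available, but after this patch it chains through
    // ClassifierCombiner(DEFAULT_COMBINATION_MODE, loadPaths) to the Properties constructor.
    ClassifierCombiner plain = new ClassifierCombiner(model1, model2);

    // New primary constructor: real Properties (encodings, output format, etc.)
    // plus an explicit combination mode.
    Properties props = new Properties();
    ClassifierCombiner withProps = new ClassifierCombiner(props,
        ClassifierCombiner.CombinationMode.NORMAL, model1, model2);  // NORMAL assumed, not shown in this diff
  }
}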
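
Finally, the "use inverted "CD".equals()" comment added in NumberSequenceClassifier.java is the standard null-safe comparison idiom; a self-contained illustration follows (the class name is hypothetical).

public class InvertedEqualsExample {
  public static void main(String[] args) {
    String tag = null;  // e.g., a token with no POS information attached

    // tag.equals("CD") would throw a NullPointerException when tag is null;
    // putting the constant on the left simply yields false instead.
    boolean isCardinal = "CD".equals(tag);
    System.out.println("CD tag? " + isCardinal);  // prints: CD tag? false
  }
}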