Commit 931c483

Merge branch 'master' into gm-quote

Grace Muzny authored and Stanford NLP committed Feb 16, 2015
1 parent 8820df2 commit 931c483

Showing 8 changed files with 161 additions and 128 deletions.
5 changes: 2 additions & 3 deletions JavaNLP-core.iml
@@ -1,5 +1,5 @@
<?xml version="1.0" encoding="UTF-8"?>
<module type="JAVA_MODULE" version="4">
<module classpath="eclipse" classpath-dir="$MODULE_DIR$" type="JAVA_MODULE" version="4">
<component name="EclipseModuleManager">
<libelement value="jar://$MODULE_DIR$/lib/ant-contrib-1.0b3.jar!/" />
<libelement value="jar://$MODULE_DIR$/lib/tomcat/el-api.jar!/" />
@@ -311,5 +311,4 @@
</library>
</orderEntry>
</component>
</module>

</module>
52 changes: 29 additions & 23 deletions itest/src/edu/stanford/nlp/ie/NumberSequenceClassifierITest.java
@@ -36,30 +36,30 @@ private static void checkLabels(StanfordCoreNLP pipe, String text, String [] lab
CoreMap sent = doc.get(CoreAnnotations.SentencesAnnotation.class).get(0);
assertTrue(sent.get(CoreAnnotations.TokensAnnotation.class) != null);
List<CoreLabel> tokens = sent.get(CoreAnnotations.TokensAnnotation.class);
if(VERBOSE){
if (VERBOSE) {
for(CoreLabel token: tokens) {
System.out.println("\t" + token.word() + " " +
token.tag() + " " +
token.ner() + " " +
System.out.println('\t' + token.word() + ' ' +
token.tag() + ' ' +
token.ner() + ' ' +
(token.containsKey(CoreAnnotations.NumericCompositeTypeAnnotation.class) ? token.get(CoreAnnotations.NumericCompositeValueAnnotation.class) + " " : "") +
(token.containsKey(TimeAnnotations.TimexAnnotation.class) ? token.get(TimeAnnotations.TimexAnnotation.class) + " " : ""));
(token.containsKey(TimeAnnotations.TimexAnnotation.class) ? token.get(TimeAnnotations.TimexAnnotation.class) + " " : ""));
}
}

// check NER labels
assertTrue(tokens.size() == labels.length);
for(int i = 0; i < labels.length; i ++){
for (int i = 0; i < labels.length; i ++) {
if(labels[i] == null){
assertTrue(tokens.get(i).ner() == null);
} else {
Pattern p = Pattern.compile(labels[i]);
System.err.println("COMPARING NER " + labels[i] + " with " + tokens.get(i).ner());
System.err.flush();
assertTrue(tokens.get(i).ner() != null);
assertTrue("NER should not be null for token " + tokens.get(i) + " in sentence " + tokens, tokens.get(i).ner() != null);
assertTrue(tokens.get(i).ner() + " does not match " + p + " for token " + tokens.get(i) + " in sentence " + tokens, p.matcher(tokens.get(i).ner()).matches());
}
}

// check normalized values, if gold is given
if(normed != null){
assertTrue(tokens.size() == normed.length);
@@ -70,8 +70,8 @@ private static void checkLabels(StanfordCoreNLP pipe, String text, String [] lab
Pattern p = Pattern.compile(normed[i]);
String n = tokens.get(i).get(CoreAnnotations.NormalizedNamedEntityTagAnnotation.class);
String message = "COMPARING NORMED \"" + normed[i] + "\" with \"" + n + "\"";
assertTrue(message, n != null);
assertTrue(message, p.matcher(n).matches());
assertTrue(message + "; latter should not be null", n != null);
assertTrue(message + "; latter should match", p.matcher(n).matches());
}
}
}
@@ -83,8 +83,8 @@ private static void run(String header, String [] texts, String [][] answers, Str
if(VERBOSE) {
System.out.println("Running test " + header + " for text: " + texts[i]);
}
checkLabels(pipe,
texts[i],
checkLabels(pipe,
texts[i],
answers[i],
normed != null ? normed[i] : null);
}
@@ -100,19 +100,21 @@ private static void run(String header, String [] texts, String [][] answers, Str
"It cost four million dollars",
"It cost $1m",
"It cost 50 cents",
"It cost # 1500",
"It cost £ 1500",
"It cost \u00A3 1500",
"It cost \u00A3 .50",
"It cost # .50",
"It cost .50",
"It cost $ 1500",
"It cost $1500",
"It cost $ 1,500",
"It cost $1,500",
"It cost $48.75",
"It cost $ 57.60",
"It cost $8 thousand",
"It cost $42,33"
"It cost $42,33",
// "It cost ₩1500", // TODO: Add won symbol to PTBTokenizer
};

private static final String [][] moneyAnswers = {
{ null, null, "MONEY", "MONEY" },
{ null, null, "MONEY", "MONEY" },
@@ -134,8 +136,10 @@ private static void run(String header, String [] texts, String [][] answers, Str
{ null, null, "MONEY", "MONEY" },
{ null, null, "MONEY", "MONEY" },
{ null, null, "MONEY", "MONEY", "MONEY" },
{ null, null, "MONEY", "MONEY" }
{ null, null, "MONEY", "MONEY" },
// { null, null, "MONEY", "MONEY" },
};

private static final String [][] moneyNormed = {
{ null, null, "\\$5.0", "\\$5.0" },
{ null, null, "\\$0.24", "\\$0.24" },
@@ -149,16 +153,18 @@ private static void run(String header, String [] texts, String [][] answers, Str
{ null, null, "\u00A31500.0", "\u00A31500.0" },
{ null, null, "\u00A31500.0", "\u00A31500.0" },
{ null, null, "\u00A30.5", "\u00A30.5" },
{ null, null, "\u00A30.5", "\u00A30.5" },
{ null, null, "\\$0.5", "\\$0.5" }, // TODO: Fix PTBTokenizer to really normalize it to Euro €
{ null, null, "\\$1500.0", "\\$1500.0" },
{ null, null, "\\$1500.0", "\\$1500.0" },
{ null, null, "\\$1500.0", "\\$1500.0" },
{ null, null, "\\$1500.0", "\\$1500.0" },
{ null, null, "\\$48.75", "\\$48.75" },
{ null, null, "\\$57.6", "\\$57.6" },
{ null, null, "\\$8000.0", "\\$8000.0", "\\$8000.0" },
{ null, null, "\\$4233.0", "\\$4233.0" }
{ null, null, "\\$4233.0", "\\$4233.0" },
// { null, null, "₩4233.0", "₩4233.0" },
};

public void testMoney() {
run("MONEY", moneyStrings, moneyAnswers, moneyNormed);
}
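The MONEY cases above check two token-level annotations: the coarse NER label and the normalized value. As an illustrative sketch (not part of this commit, reusing the pipeline properties and annotation keys already used in this test file), the normalized values can be read back like this:

Properties props = new Properties();
props.setProperty("annotators", "tokenize, ssplit, pos, lemma, ner");
StanfordCoreNLP pipe = new StanfordCoreNLP(props);
Annotation doc = new Annotation("It cost $8 thousand");
pipe.annotate(doc);
for (CoreLabel token : doc.get(CoreAnnotations.TokensAnnotation.class)) {
  // per moneyNormed above, "$", "8", and "thousand" should all normalize to $8000.0
  System.out.println(token.word() + " " + token.ner() + " "
      + token.get(CoreAnnotations.NormalizedNamedEntityTagAnnotation.class));
}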
@@ -185,7 +191,7 @@ public void testMoney() {
{ null, null, null, "1000.0", null },
};
public void testOrdinal() {
run("ORDINAL", ordinalStrings, ordinalAnswers, ordinalNormed);
run("ORDINAL", ordinalStrings, ordinalAnswers, ordinalNormed);
}

private static final String [] dateStrings = {
@@ -243,7 +249,7 @@ public void testOrdinal() {
{ "2008-06-06" , "2008-06-06", "2008-06-06", null, "2008-06-07" , "2008-06-07", "2008-06-07" },
};
public void testDate() {
run("DATE", dateStrings, dateAnswers, dateNormed);
run("DATE", dateStrings, dateAnswers, dateNormed);
}

private static final String [] numberStrings = {
@@ -280,9 +286,9 @@ public void testDate() {
{ "801.0", null, "123.0", null }
};
public void testNumber() {
run("NUMBER", numberStrings, numberAnswers, numberNormed);
run("NUMBER", numberStrings, numberAnswers, numberNormed);
}

private static final String [] timeStrings = {
"the time was 10:20",
"12:29 p.m.",
@@ -13,7 +13,7 @@
import java.util.Properties;


/**
/**
* @author Angel Chang
* @author John Bauer
*/
@@ -23,9 +23,9 @@ public class NERCombinerAnnotatorITest extends TestCase {
public static final String NER_7CLASS = DefaultPaths.DEFAULT_NER_MUC_MODEL;
public static final String NER_MISCCLASS = DefaultPaths.DEFAULT_NER_CONLL_MODEL;

static NERCombinerAnnotator nerAnnotator = null;
static AnnotationPipeline unthreadedPipeline = null;
static AnnotationPipeline threaded4Pipeline = null;
private static NERCombinerAnnotator nerAnnotator = null;
private static AnnotationPipeline unthreadedPipeline = null;
private static AnnotationPipeline threaded4Pipeline = null;

/**
* Creates the tagger annotator if it isn't already created
@@ -42,7 +42,7 @@ public void setUp()
props.setProperty("ner.applyNumericClassifiers", "false");
props.setProperty("ner.useSUTime", "false");
props.setProperty("ner.model", NER_3CLASS);
NERClassifierCombiner ner = NERCombinerAnnotator.createNERClassifierCombiner("ner", props);
NERClassifierCombiner ner = NERClassifierCombiner.createNERClassifierCombiner("ner", props);
NERCombinerAnnotator threaded4Annotator = new NERCombinerAnnotator(ner, false, 4, -1);

threaded4Pipeline = new AnnotationPipeline();
@@ -75,7 +75,7 @@ public void testThreadedAnnotator() {
verifyAnswers(ANSWERS, document);
}

public void verifyAnswers(String[][] expected, Annotation document) {
public static void verifyAnswers(String[][] expected, Annotation document) {
int sentenceIndex = 0;
for (CoreMap sentence : document.get(CoreAnnotations.SentencesAnnotation.class)) {
List<CoreLabel> tokens = sentence.get(CoreAnnotations.TokensAnnotation.class);
@@ -100,7 +100,7 @@ public void verifyAnswers(String[][] expected, Annotation document) {

private static Iterator<Annotation> getTestData(String inputString, boolean includeAnswer)
{
ColumnTabDocumentReaderWriter colReader = new ColumnTabDocumentReaderWriter();
ColumnTabDocumentReaderWriter<CoreMap> colReader = new ColumnTabDocumentReaderWriter<>();
if (includeAnswer) {
colReader.init("word=0,tag=1,answer=2");
} else {
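The fix above moves the factory call from NERCombinerAnnotator to NERClassifierCombiner, where createNERClassifierCombiner is declared. A minimal sketch of the corrected usage, reusing the properties and constants from this test's setUp():

Properties props = new Properties();
props.setProperty("ner.applyNumericClassifiers", "false");
props.setProperty("ner.useSUTime", "false");
props.setProperty("ner.model", NER_3CLASS);
// the static factory lives on NERClassifierCombiner, not on the annotator
NERClassifierCombiner ner = NERClassifierCombiner.createNERClassifierCombiner("ner", props);
// same constructor arguments as setUp() above: a 4-threaded annotator
NERCombinerAnnotator annotator = new NERCombinerAnnotator(ner, false, 4, -1);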
52 changes: 20 additions & 32 deletions src/edu/stanford/nlp/ie/AbstractSequenceClassifier.java
@@ -108,8 +108,7 @@ public DocumentReaderAndWriter<IN> plainTextReaderAndWriter() {
* Construct a SeqClassifierFlags object based on the passed in properties,
* and then call the other constructor.
*
* @param props
* See SeqClassifierFlags for known properties.
* @param props See SeqClassifierFlags for known properties.
*/
public AbstractSequenceClassifier(Properties props) {
this(new SeqClassifierFlags(props));
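An illustrative use of this constructor, not part of this diff: CRFClassifier is one concrete subclass, "trainFile" and "map" are standard SeqClassifierFlags properties, and the file path is hypothetical.

Properties props = new Properties();
props.setProperty("trainFile", "/path/to/train.tsv"); // hypothetical training file
props.setProperty("map", "word=0,answer=1");          // column layout for the reader
CRFClassifier<CoreLabel> crf = new CRFClassifier<>(props);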
@@ -124,14 +123,13 @@ public AbstractSequenceClassifier(Properties props) {
public AbstractSequenceClassifier(SeqClassifierFlags flags) {
this.flags = flags;

// try {
// Thang Sep13: allow for multiple feature factories.
this.featureFactories = Generics.newArrayList();
if (flags.featureFactory != null) {
FeatureFactory<IN> factory = new MetaClass(flags.featureFactory).createInstance(flags.featureFactoryArgs); // for compatibility
featureFactories.add(factory);
}
if(flags.featureFactories!=null){
if (flags.featureFactories != null) {
for (int i = 0; i < flags.featureFactories.length; i++) {
FeatureFactory<IN> indFeatureFactory = new MetaClass(flags.featureFactories[i]).
createInstance(flags.featureFactoriesArgs.get(i));
@@ -142,11 +140,7 @@ public AbstractSequenceClassifier(SeqClassifierFlags flags) {
tokenFactory = (CoreTokenFactory<IN>) new CoreLabelTokenFactory();
} else {
this.tokenFactory = new MetaClass(flags.tokenFactory).createInstance(flags.tokenFactoryArgs);
// this.tokenFactory = (CoreTokenFactory<IN>) Class.forName(flags.tokenFactory).newInstance();
}
// } catch (Exception e) {
// throw new RuntimeException(e);
// }
pad = tokenFactory.makeToken();
windowSize = flags.maxLeft + 1;
reinit();
@@ -281,7 +275,7 @@ public List<IN> classifySentence(List<? extends HasWord> sentence) {
i++;
}

// TODO get rid of objectbankwrapper
// TODO get rid of ObjectBankWrapper
ObjectBankWrapper<IN> wrapper = new ObjectBankWrapper<IN>(flags, null, knownLCWords);
wrapper.processDocument(document);

@@ -320,7 +314,7 @@ public List<IN> classifySentenceWithGlobalInformation(List<? extends HasWord> to
i++;
}

// TODO get rid of objectbankwrapper
// TODO get rid of ObjectBankWrapper
ObjectBankWrapper<IN> wrapper = new ObjectBankWrapper<IN>(flags, null, knownLCWords);
wrapper.processDocument(document);

@@ -402,7 +396,7 @@ public DFSA<String, Integer> getViterbiSearchGraph(List<IN> doc, Class<? extends
if (doc.isEmpty()) {
return new DFSA<String, Integer>(null);
}
// TODO get rid of objectbankwrapper
// TODO get rid of ObjectBankWrapper
ObjectBankWrapper<IN> obw = new ObjectBankWrapper<IN>(flags, null, knownLCWords);
doc = obw.processDocument(doc);
SequenceModel model = getSequenceModel(doc);
@@ -441,8 +435,7 @@ public List<List<IN>> classify(String str) {
* Classify the tokens in a String. Each sentence becomes a separate document.
* Doesn't override default readerAndWriter.
*
* @param str
* A String with tokens in one or more sentences of text to be
* @param str A String with tokens in one or more sentences of text to be
* classified.
* @return {@link List} of classified sentences (each a List of something that
* extends {@link CoreMap}).
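A hedged sketch of the documented behavior, where classifier stands for any concrete AbstractSequenceClassifier over CoreLabel (for example, a loaded CRFClassifier):

List<List<CoreLabel>> sentences =
    classifier.classify("John lives in Palo Alto. He works at Stanford.");
for (List<CoreLabel> sentence : sentences) {
  for (CoreLabel word : sentence) {
    // each classified token carries its label under AnswerAnnotation
    System.out.print(word.word() + "/" + word.get(CoreAnnotations.AnswerAnnotation.class) + " ");
  }
  System.out.println();
}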
@@ -563,7 +556,7 @@ public String classifyToString(String sentences, String outputFormat, boolean pr
plainTextReaderAndWriter.printAnswers(docOutput, pw);
pw.flush();
sb.append(sw.toString());
sb.append("\n");
sb.append('\n');
}
}
return sb.toString();
@@ -791,8 +784,7 @@ public void train(String[] trainFileList,
* Trains a classifier from a Collection of sequences.
* Note that the Collection can be (and usually is) an ObjectBank.
*
* @param docs
* An Objectbank or a collection of sequences of IN
* @param docs An ObjectBank or a collection of sequences of IN
*/
public void train(Collection<List<IN>> docs) {
train(docs, defaultReaderAndWriter);
@@ -802,10 +794,8 @@ public void train(Collection<List<IN>> docs) {
* Trains a classifier from a Collection of sequences.
* Note that the Collection can be (and usually is) an ObjectBank.
*
* @param docs
* An ObjectBank or a collection of sequences of IN
* @param readerAndWriter
* A DocumentReaderAndWriter to use when loading test files
* @param docs An ObjectBank or a collection of sequences of IN
* @param readerAndWriter A DocumentReaderAndWriter to use when loading test files
*/
public abstract void train(Collection<List<IN>> docs,
DocumentReaderAndWriter<IN> readerAndWriter);
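An illustrative training call combining this method with the ObjectBank helpers below; the file names are hypothetical, and makeReaderAndWriter() is assumed here as the way to obtain the default DocumentReaderAndWriter:

DocumentReaderAndWriter<CoreLabel> rw = classifier.makeReaderAndWriter();
ObjectBank<List<CoreLabel>> docs =
    classifier.makeObjectBankFromFiles(new String[]{"train1.tsv", "train2.tsv"}, rw);
classifier.train(docs, rw);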
@@ -858,9 +848,8 @@ public ObjectBank<List<IN>> makeObjectBankFromFiles(String[] trainFileList,
File f = new File(trainFile);
files.add(f);
}
// System.err.printf("trainFileList contains %d file%s.\n", files.size(),
// files.size() == 1 ? "": "s");
// TODO get rid of objectbankwrapper
// System.err.printf("trainFileList contains %d file%s in encoding %s.%n", files.size(), files.size() == 1 ? "": "s", flags.inputEncoding);
// TODO get rid of ObjectBankWrapper
// return new ObjectBank<List<IN>>(new
// ResettableReaderIteratorFactory(files), readerAndWriter);
return new ObjectBankWrapper<IN>(flags, new ObjectBank<List<IN>>(new ResettableReaderIteratorFactory(files, flags.inputEncoding),
@@ -892,7 +881,7 @@ public ObjectBank<List<IN>> makeObjectBankFromFiles(String baseDir, String fileP
// return new ObjectBank<List<IN>>(new
// ResettableReaderIteratorFactory(files, flags.inputEncoding),
// readerAndWriter);
// TODO get rid of objectbankwrapper
// TODO get rid of ObjectBankWrapper
return new ObjectBankWrapper<IN>(flags, new ObjectBank<List<IN>>(new ResettableReaderIteratorFactory(files,
flags.inputEncoding), readerAndWriter), knownLCWords);
}
@@ -905,7 +894,7 @@ public ObjectBank<List<IN>> makeObjectBankFromFiles(Collection<File> files,
// return new ObjectBank<List<IN>>(new
// ResettableReaderIteratorFactory(files, flags.inputEncoding),
// readerAndWriter);
// TODO get rid of objectbankwrapper
// TODO get rid of ObjectBankWrapper
return new ObjectBankWrapper<IN>(flags, new ObjectBank<List<IN>>(new ResettableReaderIteratorFactory(files,
flags.inputEncoding), readerAndWriter), knownLCWords);
}
@@ -929,7 +918,7 @@ public ObjectBank<List<IN>> makeObjectBankFromReader(BufferedReader in,
if (flags.announceObjectBankEntries) {
System.err.println("Reading data using " + readerAndWriter.getClass());
}
// TODO get rid of objectbankwrapper
// TODO get rid of ObjectBankWrapper
// return new ObjectBank<List<IN>>(new ResettableReaderIteratorFactory(in),
// readerAndWriter);
return new ObjectBankWrapper<IN>(flags, new ObjectBank<List<IN>>(new ResettableReaderIteratorFactory(in),
@@ -956,8 +945,7 @@ public void printProbs(String filename,
* Takes a {@link List} of documents and prints the likelihood of each
* possible label at each point.
*
* @param documents
* A {@link List} of {@link List} of something that extends
* @param documents A {@link List} of {@link List} of something that extends
* {@link CoreMap}.
*/
public void printProbsDocuments(ObjectBank<List<IN>> documents) {
@@ -1076,9 +1064,9 @@ public void classifyAndWriteAnswers(Collection<List<IN>> documents,

Timing timer = new Timing();

Counter<String> entityTP = new ClassicCounter<String>();
Counter<String> entityFP = new ClassicCounter<String>();
Counter<String> entityFN = new ClassicCounter<String>();
Counter<String> entityTP = new ClassicCounter<>();
Counter<String> entityFP = new ClassicCounter<>();
Counter<String> entityFN = new ClassicCounter<>();
boolean resultsCounted = outputScores;
int numWords = 0;
int numDocs = 0;
@@ -1150,7 +1138,7 @@ public ThreadsafeProcessor<List<IN>, List<IN>> newInstance() {
*
* @param testFile The name of the file to test on.
* @param k How many best to print
* @param readerAndWriter
* @param readerAndWriter Class to be used for printing answers
*/
public void classifyAndWriteAnswersKBest(String testFile, int k,
DocumentReaderAndWriter<IN> readerAndWriter)