Skip to content

Commit

Permalink
add regexner
Browse files Browse the repository at this point in the history
  • Loading branch information
J38 authored and Stanford NLP committed Jun 15, 2017
1 parent 8c98e2d commit e5d3ca7
Show file tree
Hide file tree
Showing 24 changed files with 74,985 additions and 71,979 deletions.

This file was deleted.

103 changes: 9 additions & 94 deletions src/edu/stanford/nlp/ie/NumberNormalizer.java
Expand Up @@ -88,7 +88,6 @@ public static void setVerbose(boolean verbose) {
// Converts numbers in words to numeric form
// works through trillions
// One or more digits and nothing else (used with matches() to detect plain integers).
private static final Pattern digitsPattern = Pattern.compile("\\d+");
// Group 1: digits with an optional decimal point and fraction digits.
// Group 2 (optional): a lowercase magnitude word fused directly onto the number, e.g. "3.5million".
private static final Pattern digitsPatternExtended = Pattern.compile("(\\d+\\.?\\d*)(dozen|score|hundred|thousand|million|billion|trillion)?"); // this is really just second-guessing the tokenizer
// Optional sign, then either digits (optionally with ",ddd" thousands groups and an
// optional fractional part) or a bare fraction such as ".5".
private static final Pattern numPattern = Pattern.compile("[-+]?(?:\\d+(?:,\\d\\d\\d)*(?:\\.\\d*)?|\\.\\d+)");
// Two numPattern numbers joined by a hyphen; group 1 is the left number, group 2 the right.
private static final Pattern numRangePattern = Pattern.compile("(" + numPattern.pattern() + ")-(" + numPattern.pattern() + ")");
// private static final Pattern[] endUnitWordsPattern = new Pattern[endUnitWords.length];
Expand Down Expand Up @@ -202,93 +201,6 @@ public static void setVerbose(boolean verbose) {
private static final Pattern alphaPattern = Pattern.compile("([a-zA-Z]+)");
private static final Pattern wsPattern = Pattern.compile("\\s+");

/**
* Regex character class matching a single whitespace character, covering the
* ASCII control/space characters as well as the Unicode space and separator
* code points listed below. The escapes are written as {@code \\uXXXX} so that
* the regex engine, not the Java compiler, interprets them. Append a
* quantifier (for example {@code "+"}) to match runs of whitespace.
*/
private static final String whitespaceCharsRegex = "[" /* opens the character class */
+ "\\u0009" // CHARACTER TABULATION
+ "\\u000A" // LINE FEED (LF)
+ "\\u000B" // LINE TABULATION
+ "\\u000C" // FORM FEED (FF)
+ "\\u000D" // CARRIAGE RETURN (CR)
+ "\\u0020" // SPACE
+ "\\u0085" // NEXT LINE (NEL)
+ "\\u00A0" // NO-BREAK SPACE
+ "\\u1680" // OGHAM SPACE MARK
+ "\\u180E" // MONGOLIAN VOWEL SEPARATOR
+ "\\u2000" // EN QUAD
+ "\\u2001" // EM QUAD
+ "\\u2002" // EN SPACE
+ "\\u2003" // EM SPACE
+ "\\u2004" // THREE-PER-EM SPACE
+ "\\u2005" // FOUR-PER-EM SPACE
+ "\\u2006" // SIX-PER-EM SPACE
+ "\\u2007" // FIGURE SPACE
+ "\\u2008" // PUNCTUATION SPACE
+ "\\u2009" // THIN SPACE
+ "\\u200A" // HAIR SPACE
+ "\\u2028" // LINE SEPARATOR
+ "\\u2029" // PARAGRAPH SEPARATOR
+ "\\u202F" // NARROW NO-BREAK SPACE
+ "\\u205F" // MEDIUM MATHEMATICAL SPACE
+ "\\u3000" // IDEOGRAPHIC SPACE
+ "]" // closes the character class
;



/**
 * Parses a single numeric token that may have a magnitude word fused onto it,
 * such as {@code "12"}, {@code "3.5"}, {@code "2dozen"} or {@code "1.5million"}.
 *
 * @param input the token to parse; must match {@code digitsPatternExtended} in full
 * @param originalString the complete string the token came from (used only in the error message)
 * @param curIndex position of the token within the original string (used only in the error message)
 * @return a {@code Long} when the numeric part is a plain integer and the scaled
 *         value fits in a long; otherwise a {@code Double}
 * @throws NumberFormatException if {@code input} does not match the expected
 *         shape, or its integer part has too many digits for {@code Long.parseLong}
 */
private static Number parseNumberPart(String input, String originalString, int curIndex) {
  Matcher matcher = digitsPatternExtended.matcher(input);
  if (!matcher.matches()) {
    throw new NumberFormatException("Bad number put into wordToNumber. Word is: \"" + input + "\", originally part of \"" + originalString + "\", piece # " + curIndex);
  }

  String numPart = matcher.group(1);
  String magnitudePart = matcher.group(2);

  // A missing or unrecognised magnitude word leaves the multiplier at 1.
  long magnitude = 1L;
  if (magnitudePart != null) {
    // Locale.ROOT keeps the lowercasing independent of the JVM's default locale.
    switch (magnitudePart.toLowerCase(java.util.Locale.ROOT)) {
      case "dozen":
        magnitude = 12L;
        break;
      case "score":
        magnitude = 20L;
        break;
      case "hundred":
        magnitude = 100L;
        break;
      case "thousand":
        magnitude = 1000L;
        break;
      case "million":
        magnitude = 1000000L;
        break;
      case "billion":
        magnitude = 1000000000L;
        break;
      case "trillion":
        magnitude = 1000000000000L;
        break;
      default:
        // unknown magnitude! Ignore it.
        break;
    }
  }

  // Anything with a decimal point is handled in floating point.
  if (!digitsPattern.matcher(numPart).matches()) {
    return Double.parseDouble(numPart) * magnitude;
  }

  long value = Long.parseLong(numPart);
  // value is non-negative (digits only) and magnitude >= 1, so this division-based
  // check detects overflow without performing the overflowing multiplication.
  if (value > Long.MAX_VALUE / magnitude) {
    // The product does not fit in a long; return an approximate double rather
    // than a silently wrapped (possibly negative) long.
    return (double) value * magnitude;
  }
  return value * magnitude;
}


/**
* Fairly generous utility function to convert a string representing
* a number (hopefully) to a Number.
Expand Down Expand Up @@ -351,7 +263,7 @@ public static Number wordToNumber(String str) {

// get numeric value of each word piece
for (int curIndex = 0; curIndex < numWords; curIndex++) {
String curPart = fields[curIndex] == null ? "" : fields[curIndex].replaceAll(whitespaceCharsRegex + "+", "").trim();
String curPart = fields[curIndex];
Matcher m = alphaPattern.matcher(curPart);
if (m.find()) {
// Some part of the word has alpha characters
Expand All @@ -374,18 +286,21 @@ public static Number wordToNumber(String str) {
}
} else if (Character.isDigit(curPart.charAt(0))) {
if (curPart.endsWith("th") || curPart.endsWith("rd") || curPart.endsWith("nd") || curPart.endsWith("st")) {
curPart = curPart.substring(0, curPart.length()-2).trim();
curPart = curPart.substring(0, curPart.length()-2);
}
if (digitsPattern.matcher(curPart).matches()) {
curNum = Long.parseLong(curPart);
} else{
throw new NumberFormatException("Bad number put into wordToNumber. Word is: \"" + curPart + "\", originally part of \"" + originalString + "\", piece # " + curIndex);
}
curNum = parseNumberPart(curPart, originalString, curIndex);
} else {
throw new NumberFormatException("Bad number put into wordToNumber. Word is: \"" + curPart + "\", originally part of \"" + originalString + "\", piece # " + curIndex);
}
numFields[curIndex] = curNum;
} else {
// Word is all numeric
Matcher matcher = digitsPatternExtended.matcher(curPart);
if (matcher.matches()) {
numFields[curIndex] = parseNumberPart(curPart, originalString, curIndex);
if (digitsPattern.matcher(curPart).matches()) {
numFields[curIndex] = Long.parseLong(curPart);
} else if (numPattern.matcher(curPart).matches()) {
numFields[curIndex] = new BigDecimal(curPart);
} else {
Expand Down
13 changes: 7 additions & 6 deletions src/edu/stanford/nlp/ie/machinereading/GenericDataSetReader.java
@@ -1,4 +1,4 @@
package edu.stanford.nlp.ie.machinereading;
package edu.stanford.nlp.ie.machinereading;
import edu.stanford.nlp.util.logging.Redwood;

import java.io.IOException;
Expand Down Expand Up @@ -135,7 +135,8 @@ public final Annotation parse(String path) throws IOException {
//
retVal = this.read(path);
} catch (Exception ex) {
IOException iox = new IOException(ex);
IOException iox = new IOException();
iox.initCause(ex);
throw iox;
}

Expand All @@ -148,8 +149,8 @@ public final Annotation parse(String path) throws IOException {
}
return retVal;
}

private static void modifyUsingCoreNLPNER(Annotation doc) {
private void modifyUsingCoreNLPNER(Annotation doc) {
Properties ann = new Properties();
ann.setProperty("annotators", "pos, lemma, ner");
StanfordCoreNLP pipeline = new StanfordCoreNLP(ann, false);
Expand All @@ -170,7 +171,7 @@ private static void modifyUsingCoreNLPNER(Annotation doc) {
//System.out.println("new ner tag is " + entityNertag);
}
}

}
}

Expand Down Expand Up @@ -501,7 +502,7 @@ private static CoreLabel initCoreLabel(String token) {
label.setValue(token);
label.set(CoreAnnotations.TextAnnotation.class, token);
label.set(CoreAnnotations.ValueAnnotation.class, token);

return label;
}

Expand Down
8 changes: 2 additions & 6 deletions src/edu/stanford/nlp/ling/CoreAnnotations.java
Expand Up @@ -427,7 +427,7 @@ public Class<String> getType() {

/**
* Annotation for the whitespace characters appearing before this word. This
* can be filled in by an invertible tokenizer so that the original text string can be
* can be filled in by the tokenizer so that the original text string can be
* reconstructed.
*/
public static class BeforeAnnotation implements CoreAnnotation<String> {
Expand All @@ -439,12 +439,8 @@ public Class<String> getType() {

/**
* Annotation for the whitespace characters appearing after this word. This can
* be filled in by an invertible tokenizer so that the original text string can be
* be filled in by the tokenizer so that the original text string can be
* reconstructed.
*
* Note: When running a tokenizer token-by-token, in general this field will only
* be filled in after the next token is read, so you need to be reading this field
* one behind. Be careful about this.
*/
public static class AfterAnnotation implements CoreAnnotation<String> {
@Override
Expand Down

0 comments on commit e5d3ca7

Please sign in to comment.