Skip to content

Commit

Permalink
Skip NBSP when reading characters, just like other whitespace characters
Browse files Browse the repository at this point in the history
Adds a test that spaces and NBSP get the right character offsets in the segmenter annotator
  • Loading branch information
AngledLuffa committed Jul 2, 2022
1 parent 7c84960 commit 71283cd
Show file tree
Hide file tree
Showing 3 changed files with 18 additions and 3 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,18 @@ public void testPipeline() {
new int[]{0, 1, 3, 5, 7, 8},
new int[]{1, 3, 5, 7, 8, 9});

// test that it does something reasonable with spaces
testOne("我在 加州 工作 ",
new String[]{"我", "在", "加州", "工作"},
new int[]{0, 1, 3, 6},
new int[]{1, 2, 5, 8});

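// test that it does something reasonable with NBSP (U+00A0):
// the separators in the string below are meant to be non-breaking spaces, not ASCII spaces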
testOne("我在 加州 工作 ",
new String[]{"我", "在", "加州", "工作"},
new int[]{0, 1, 3, 6},
new int[]{1, 2, 5, 8});

// All of the tools should now produce () instead of -LRB- -RRB-
testOne("你马上回来(北京)吗?",
new String[]{"你", "马上", "回来", "(", "北京", ")", "吗", "?"},
Expand Down
3 changes: 3 additions & 0 deletions src/edu/stanford/nlp/ie/AbstractSequenceClassifier.java
Original file line number Diff line number Diff line change
Expand Up @@ -689,6 +689,9 @@ public List<String> segmentString(String sentence, DocumentReaderAndWriter<IN> r
if (segmented.length() == 0) {
return Collections.emptyList();
} else {
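// \\p{Zs} matches Unicode space separators such as NBSP, which
// \\s does not match by default (although \\p{Zs} does not cover tab or newline).
// We split on \\s only, on the assumption that the upstream
// segmentation has already handled unusual whitespace such as NBSP.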
return Arrays.asList(segmented.split("\\s"));
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -160,7 +160,7 @@ public List<CoreLabel> apply(String line) {
// TODO: can supplementary code points (surrogate pairs, i.e. two chars) be whitespace / ISO control?
int codePoint = Character.codePointAt(line, index);
CoreLabel wi = new CoreLabel();
if ( ! Character.isWhitespace(codePoint) && ! Character.isISOControl(codePoint)) {
if (!Character.isWhitespace(codePoint) && !Character.isISOControl(codePoint) && !Character.isSpaceChar(codePoint)) {
boolean surrogate = Character.isSupplementaryCodePoint(codePoint);
String wordString;
if (surrogate) {
Expand All @@ -171,7 +171,7 @@ public List<CoreLabel> apply(String line) {
wi.set(CoreAnnotations.CharAnnotation.class, intern(wordString));

// non-breaking space is skipped as well
while (Character.isWhitespace(origLine.charAt(origIndex)) || Character.isISOControl(origLine.charAt(origIndex)) || (origLine.charAt(origIndex) == '\u00A0')) {
while (Character.isWhitespace(origLine.charAt(origIndex)) || Character.isISOControl(origLine.charAt(origIndex)) || Character.isSpaceChar(origLine.charAt(origIndex))) {
origIndex++;
}

Expand All @@ -197,7 +197,7 @@ public List<CoreLabel> apply(String line) {
wi.set(CoreAnnotations.AnswerAnnotation.class, "1");
wi.set(CoreAnnotations.SpaceBeforeAnnotation.class, "1");
wi.set(CoreAnnotations.GoldAnswerAnnotation.class, "1");
} else if (Character.isWhitespace(line.charAt(index - 1)) || Character.isISOControl(line.charAt(index - 1))) {
} else if (Character.isWhitespace(line.charAt(index - 1)) || Character.isISOControl(line.charAt(index - 1)) || Character.isSpaceChar(line.charAt(index - 1))) {
wi.set(CoreAnnotations.AnswerAnnotation.class, "1");
wi.set(CoreAnnotations.SpaceBeforeAnnotation.class, "1");
wi.set(CoreAnnotations.GoldAnswerAnnotation.class, "1");
Expand Down

0 comments on commit 71283cd

Please sign in to comment.