Skip NBSP when reading characters, just like other whitespace characters

Adds a test that spaces and NBSP get the right character offsets in the segmenter annotator
stanfordnlp · Jul 2, 2022 · 71283cd · 71283cd
1 parent 7c84960
commit 71283cd
Show file tree

Hide file tree

Showing 3 changed files with 18 additions and 3 deletions.
diff --git a/itest/src/edu/stanford/nlp/pipeline/ChineseSegmenterAnnotatorITest.java b/itest/src/edu/stanford/nlp/pipeline/ChineseSegmenterAnnotatorITest.java
@@ -38,6 +38,18 @@ public void testPipeline() {
         new int[]{0, 1, 3, 5, 7, 8},
         new int[]{1, 3, 5, 7, 8, 9});
 
+    // test that it does something reasonable with spaces
+    testOne("我在 加州 工作 ",
+            new String[]{"我", "在", "加州", "工作"},
+            new int[]{0, 1, 3, 6},
+            new int[]{1, 2, 5, 8});
+
+    // test that it does something reasonable with NBSP
+    testOne("我在 加州 工作 ",
+            new String[]{"我", "在", "加州", "工作"},
+            new int[]{0, 1, 3, 6},
+            new int[]{1, 2, 5, 8});
+
     // All of the tools should now produce () instead of -LRB- -RRB-
     testOne("你马上回来(北京)吗？",
             new String[]{"你", "马上", "回来", "(", "北京", ")", "吗", "？"},

diff --git a/src/edu/stanford/nlp/ie/AbstractSequenceClassifier.java b/src/edu/stanford/nlp/ie/AbstractSequenceClassifier.java
@@ -689,6 +689,9 @@ public List<String> segmentString(String sentence, DocumentReaderAndWriter<IN> r
     if (segmented.length() == 0) {
       return Collections.emptyList();
     } else {
+      // \\s only matches ASCII whitespace; \\p{Zs} would instead match Unicode
+      // space separators such as NBSP, but hopefully the upstream
+      // segmentation has already handled such unusual whitespace
       return Arrays.asList(segmented.split("\\s"));
     }
   }

diff --git a/src/edu/stanford/nlp/wordseg/Sighan2005DocumentReaderAndWriter.java b/src/edu/stanford/nlp/wordseg/Sighan2005DocumentReaderAndWriter.java
@@ -160,7 +160,7 @@ public List<CoreLabel> apply(String line) {
         // TODO: can double chars be whitespace / isocontrol?
         int codePoint = Character.codePointAt(line, index);
         CoreLabel wi = new CoreLabel();
-        if ( ! Character.isWhitespace(codePoint) && ! Character.isISOControl(codePoint)) {
+        if (!Character.isWhitespace(codePoint) && !Character.isISOControl(codePoint) && !Character.isSpaceChar(codePoint)) {
           boolean surrogate = Character.isSupplementaryCodePoint(codePoint);
           String wordString;
           if (surrogate) {
@@ -171,7 +171,7 @@ public List<CoreLabel> apply(String line) {
           wi.set(CoreAnnotations.CharAnnotation.class, intern(wordString));
 
           // non-breaking space is skipped as well
-          while (Character.isWhitespace(origLine.charAt(origIndex)) || Character.isISOControl(origLine.charAt(origIndex)) || (origLine.charAt(origIndex) == '\u00A0')) {
+          while (Character.isWhitespace(origLine.charAt(origIndex)) || Character.isISOControl(origLine.charAt(origIndex)) || Character.isSpaceChar(origLine.charAt(origIndex))) {
             origIndex++;
           }
 
@@ -197,7 +197,7 @@ public List<CoreLabel> apply(String line) {
             wi.set(CoreAnnotations.AnswerAnnotation.class, "1");
             wi.set(CoreAnnotations.SpaceBeforeAnnotation.class, "1");
             wi.set(CoreAnnotations.GoldAnswerAnnotation.class, "1");
-          } else if (Character.isWhitespace(line.charAt(index - 1)) || Character.isISOControl(line.charAt(index - 1))) {
+          } else if (Character.isWhitespace(line.charAt(index - 1)) || Character.isISOControl(line.charAt(index - 1)) || Character.isSpaceChar(line.charAt(index - 1))) {
             wi.set(CoreAnnotations.AnswerAnnotation.class, "1");
             wi.set(CoreAnnotations.SpaceBeforeAnnotation.class, "1");
             wi.set(CoreAnnotations.GoldAnswerAnnotation.class, "1");