From 0d9e9c829bfa75bb661cccea03fc682a0f955f0d Mon Sep 17 00:00:00 2001 From: John Bauer Date: Wed, 13 Jul 2022 18:08:25 -0700 Subject: [PATCH] Trim words - doing this instead of splitting on all whitespace gives us a chance of getting VI right --- .../nlp/sequences/ColumnDocumentReaderAndWriter.java | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/src/edu/stanford/nlp/sequences/ColumnDocumentReaderAndWriter.java b/src/edu/stanford/nlp/sequences/ColumnDocumentReaderAndWriter.java index cb538fac2f..a496020152 100644 --- a/src/edu/stanford/nlp/sequences/ColumnDocumentReaderAndWriter.java +++ b/src/edu/stanford/nlp/sequences/ColumnDocumentReaderAndWriter.java @@ -13,8 +13,9 @@ import edu.stanford.nlp.ling.CoreAnnotations; import edu.stanford.nlp.objectbank.DelimitRegExIterator; import edu.stanford.nlp.objectbank.IteratorFromReaderFactory; -import java.util.function.Function; +import edu.stanford.nlp.util.ArrayUtils; import edu.stanford.nlp.util.StringUtils; +import java.util.function.Function; /** @@ -34,6 +35,7 @@ public class ColumnDocumentReaderAndWriter implements DocumentReaderAndWriter> factory; // public void init(SeqClassifierFlags flags) { @@ -51,6 +53,7 @@ public void init(SeqClassifierFlags flags) { public void init(String map) { // this.flags = null; this.map = CoreLabel.parseStringKeys(StringUtils.mapStringToArray(map)); + this.wordColumn = ArrayUtils.indexOf(this.map, CoreAnnotations.TextAnnotation.class); factory = DelimitRegExIterator.getFactory("\n(?:\\s*\n)+", new ColumnDocParser()); } @@ -87,6 +90,13 @@ public List apply(String doc) { if (info.length == 1) { info = whitePattern.split(line); } + // Trimming later rather than splitting on all whitespace + // gives us the possibility of tokens with whitespace in them + // although obviously not at the start or end... + // doesn't slow the classifier down too much + if (wordColumn >= 0) { + info[wordColumn] = info[wordColumn].trim(); + } CoreLabel wi; try { wi = new CoreLabel(map, info);