From 0d9e9c829bfa75bb661cccea03fc682a0f955f0d Mon Sep 17 00:00:00 2001
From: John Bauer <horatio@gmail.com>
Date: Wed, 13 Jul 2022 18:08:25 -0700
Subject: [PATCH] Trim words - doing this instead of splitting on all
 whitespace gives us a chance of getting VI right

---
 .../nlp/sequences/ColumnDocumentReaderAndWriter.java | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)
diff --git a/src/edu/stanford/nlp/sequences/ColumnDocumentReaderAndWriter.java b/src/edu/stanford/nlp/sequences/ColumnDocumentReaderAndWriter.java
index cb538fac2f..a496020152 100644
--- a/src/edu/stanford/nlp/sequences/ColumnDocumentReaderAndWriter.java
+++ b/src/edu/stanford/nlp/sequences/ColumnDocumentReaderAndWriter.java
@@ -13,8 +13,9 @@
 import edu.stanford.nlp.ling.CoreAnnotations;
 import edu.stanford.nlp.objectbank.DelimitRegExIterator;
 import edu.stanford.nlp.objectbank.IteratorFromReaderFactory;
-import java.util.function.Function;
+import edu.stanford.nlp.util.ArrayUtils;
 import edu.stanford.nlp.util.StringUtils;
+import java.util.function.Function;
 
 
 /**
@@ -34,6 +35,7 @@ public class ColumnDocumentReaderAndWriter implements DocumentReaderAndWriter<Co
   //map can be something like "word=0,tag=1,answer=2"
   @SuppressWarnings("rawtypes")
   private Class[] map; // = null;
+  private int wordColumn = -1;
   private IteratorFromReaderFactory<List<CoreLabel>> factory;
 
 //  public void init(SeqClassifierFlags flags) {
@@ -51,6 +53,7 @@ public void init(SeqClassifierFlags flags) {
   public void init(String map) {
     // this.flags = null;
     this.map = CoreLabel.parseStringKeys(StringUtils.mapStringToArray(map));
+    this.wordColumn = ArrayUtils.indexOf(this.map, CoreAnnotations.TextAnnotation.class); // position of TextAnnotation in the map; that column is trimmed later only when this is >= 0
     factory = DelimitRegExIterator.getFactory("\n(?:\\s*\n)+", new ColumnDocParser());
   }
 
@@ -87,6 +90,13 @@ public List<CoreLabel> apply(String doc) {
         if (info.length == 1) {
           info = whitePattern.split(line);
         }
+        // Trim the word column here instead of splitting on all whitespace:
+        // this allows tokens with whitespace inside them (though not at the
+        // start or end) and doesn't slow the classifier down too much.
+        // NOTE(review): assumes info.length > wordColumn -- confirm for short lines.
+        if (wordColumn >= 0) {
+          info[wordColumn] = info[wordColumn].trim();
+        }
         CoreLabel wi;
         try {
           wi = new CoreLabel(map, info);