Skip to content

Commit

Permalink
Trim words - doing this instead of splitting on all whitespace gives …
Browse files Browse the repository at this point in the history
…us a chance of getting VI right
  • Loading branch information
AngledLuffa committed Jul 14, 2022
1 parent 6193934 commit 0d9e9c8
Showing 1 changed file with 11 additions and 1 deletion.
Expand Up @@ -13,8 +13,9 @@
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.objectbank.DelimitRegExIterator;
import edu.stanford.nlp.objectbank.IteratorFromReaderFactory;
import java.util.function.Function;
import edu.stanford.nlp.util.ArrayUtils;
import edu.stanford.nlp.util.StringUtils;
import java.util.function.Function;


/**
Expand All @@ -34,6 +35,7 @@ public class ColumnDocumentReaderAndWriter implements DocumentReaderAndWriter<Co
//map can be something like "word=0,tag=1,answer=2"
@SuppressWarnings("rawtypes")
private Class[] map; // = null;
private int wordColumn = -1;
private IteratorFromReaderFactory<List<CoreLabel>> factory;

// public void init(SeqClassifierFlags flags) {
Expand All @@ -51,6 +53,7 @@ public void init(SeqClassifierFlags flags) {
/**
 * Configures this reader/writer from a column specification string.
 *
 * <p>The specification (for example {@code "word=0,tag=1,answer=2"}) is
 * converted into the array of annotation key classes stored in
 * {@code this.map}.  The position of {@link CoreAnnotations.TextAnnotation}
 * within that array is remembered in {@code wordColumn} so the word column
 * can be treated specially when lines are parsed (presumably negative when
 * no text column is configured; see {@code ArrayUtils.indexOf}).
 *
 * <p>Finally the document iterator factory is created.  Its delimiter
 * regex matches a newline followed by one or more lines that contain only
 * whitespace, and each delimited chunk is handed to a new
 * {@code ColumnDocParser}.
 *
 * @param map the column specification string
 */
public void init(String map) {
  final String[] columnKeys = StringUtils.mapStringToArray(map);
  this.map = CoreLabel.parseStringKeys(columnKeys);

  // Remember which column (if any) carries the token text.
  this.wordColumn = ArrayUtils.indexOf(this.map, CoreAnnotations.TextAnnotation.class);

  // Chunks of input are separated by a newline plus at least one whitespace-only line.
  this.factory = DelimitRegExIterator.getFactory("\n(?:\\s*\n)+", new ColumnDocParser());
}

Expand Down Expand Up @@ -87,6 +90,13 @@ public List<CoreLabel> apply(String doc) {
if (info.length == 1) {
info = whitePattern.split(line);
}
// Trimming the word column afterwards, rather than always splitting
// on all whitespace, allows tokens that contain internal whitespace
// (though obviously not at the start or end of the token).
// The extra trim doesn't slow the classifier down too much.
if (wordColumn >= 0) {
info[wordColumn] = info[wordColumn].trim();
}
CoreLabel wi;
try {
wi = new CoreLabel(map, info);
Expand Down

0 comments on commit 0d9e9c8

Please sign in to comment.