Skip to content

Commit

Permalink
redo handling of token indexes and tokenizing newlines
Browse files Browse the repository at this point in the history
  • Loading branch information
J38 authored and Stanford NLP committed Feb 18, 2018
1 parent 3a30f2b commit d8c0c99
Show file tree
Hide file tree
Showing 6 changed files with 82 additions and 107 deletions.
66 changes: 3 additions & 63 deletions src/edu/stanford/nlp/ling/CoreLabel.java
Expand Up @@ -29,7 +29,7 @@
* @author dramage
* @author rafferty
*/
public class CoreLabel extends ArrayCoreMap implements AbstractCoreLabel, HasCategory /* , HasContext */ {
public class CoreLabel extends ArrayCoreMap implements AbstractCoreLabel, HasCategory, HasContext {

private static final long serialVersionUID = 2L;

Expand Down Expand Up @@ -136,20 +136,6 @@ public CoreLabel(String[] keys, String[] values) {
initFromStrings(keys, values);
}

/**
 * This constructor uses preparsed Class keys.
 * It's mainly useful for reading from a file.
 *
 * @param keys Array of annotation Class keys
 * @param values Array of values (as String), parallel to {@code keys}
 */
@SuppressWarnings("rawtypes")
public CoreLabel(Class[] keys, String[] values) {
  super(keys.length);
  initFromStrings(keys, values);
}

/** This is provided as a simple way to make a CoreLabel for a word from a String.
* It's often useful in fixup or test code. It sets all three of the Text, OriginalText,
* and Value annotations to the given value.
Expand Down Expand Up @@ -177,7 +163,7 @@ public interface GenericAnnotation<T> extends CoreAnnotation<T> { }
public static final Map<Class<? extends GenericAnnotation>, String> genericValues = Generics.newHashMap();


@SuppressWarnings({"unchecked", "rawtypes"})
@SuppressWarnings("unchecked")
private void initFromStrings(String[] keys, String[] values) {
if (keys.length != values.length) {
throw new UnsupportedOperationException("Argument array lengths differ: " +
Expand Down Expand Up @@ -241,52 +227,6 @@ private void initFromStrings(String[] keys, String[] values) {
}
}

/**
 * Translates an array of String key names into the corresponding
 * annotation Class keys, using AnnotationLookup for the mapping.
 *
 * @param keys Array of String key names
 * @return Array of annotation Class keys, parallel to {@code keys}
 * @throws UnsupportedOperationException if any key name is unknown
 */
@SuppressWarnings("rawtypes")
public static Class[] parseStringKeys(String[] keys) {
  Class[] coreKeyClasses = new Class[keys.length];
  int i = 0;
  for (String keyName : keys) {
    Class coreKeyClass = AnnotationLookup.toCoreKey(keyName);
    // a null lookup result means the name doesn't correspond to any known key
    if (coreKeyClass == null) {
      throw new UnsupportedOperationException("Unknown key " + keyName);
    }
    coreKeyClasses[i++] = coreKeyClass;
  }
  return coreKeyClasses;
}

/**
 * Initializes this label from parallel arrays of annotation Class keys and
 * String values, converting each value to the key's declared value type.
 * Supported value types are String, Integer, Double, and Long.
 *
 * @param keys Array of annotation Class keys
 * @param values Array of values as Strings, parallel to {@code keys}
 * @throws UnsupportedOperationException if the arrays differ in length or a
 *     value cannot be converted to its key's declared value type
 */
@SuppressWarnings({"unchecked", "rawtypes"})
private void initFromStrings(Class[] keys, String[] values) {
  if (keys.length != values.length) {
    throw new UnsupportedOperationException("Argument array lengths differ: " +
        Arrays.toString(keys) + " vs. " + Arrays.toString(values));
  }
  for (int i = 0; i < keys.length; i++) {
    Class coreKeyClass = keys[i];
    String value = values[i];
    try {
      Class<?> valueClass = AnnotationLookup.getValueType(coreKeyClass);
      // convert the raw String to the declared value type before storing
      Object parsedValue;
      if (valueClass == String.class) {
        parsedValue = value;
      } else if (valueClass == Integer.class) {
        parsedValue = Integer.parseInt(value);
      } else if (valueClass == Double.class) {
        parsedValue = Double.parseDouble(value);
      } else if (valueClass == Long.class) {
        parsedValue = Long.parseLong(value);
      } else {
        throw new RuntimeException("Can't handle " + valueClass);
      }
      this.set(coreKeyClass, parsedValue);
    } catch (Exception e) {
      // wrap conversion failures with the key/value context, preserving the cause
      throw new UnsupportedOperationException("CORE: CoreLabel.initFromStrings: "
          + "Bad type for " + coreKeyClass.getSimpleName()
          + ". Value was: " + value
          + "; expected "+AnnotationLookup.getValueType(coreKeyClass), e);
    }
  }
}

private static class CoreLabelFactory implements LabelFactory {

Expand Down Expand Up @@ -789,6 +729,6 @@ public String toString(OutputFormat format) {
}

private static final Comparator<Class<?>> asClassComparator =
Comparator.comparing(Class::getName);
(o1, o2) -> o1.getName().compareTo(o2.getName());

}
20 changes: 14 additions & 6 deletions src/edu/stanford/nlp/pipeline/CleanXmlAnnotator.java
Expand Up @@ -351,6 +351,18 @@ private static void addAnnotationPatterns(CollectionValuedMap<Class, Pair<Patter
}
}

/**
 * Helper method to set the TokenBeginAnnotation and TokenEndAnnotation of
 * every token, numbering tokens consecutively from 0 in list order.
 *
 * @param tokensList the tokens to renumber
 */
public void setTokenBeginTokenEnd(List<CoreLabel> tokensList) {
  for (int i = 0; i < tokensList.size(); i++) {
    CoreLabel token = tokensList.get(i);
    // token i spans the half-open index range [i, i+1)
    token.set(CoreAnnotations.TokenBeginAnnotation.class, i);
    token.set(CoreAnnotations.TokenEndAnnotation.class, i + 1);
  }
}

@Override
public void annotate(Annotation annotation) {
if (annotation.containsKey(CoreAnnotations.TokensAnnotation.class)) {
Expand All @@ -359,14 +371,10 @@ public void annotate(Annotation annotation) {
List<CoreLabel> newTokens = process(annotation, tokens);
// We assume that if someone is using this annotator, they don't
// want the old tokens any more and get rid of them
// redo the token indexes if xml tokens have been removed
setTokenBeginTokenEnd(newTokens);
annotation.set(CoreAnnotations.TokensAnnotation.class, newTokens);
if (DEBUG) { log.info("CleanXML: ending tokens: " + annotation.get(CoreAnnotations.TokensAnnotation.class)); }
// update token index annotation
int tokenIndex = 0;
for (CoreLabel token : newTokens) {
token.set(CoreAnnotations.TokenIndexAnnotation.class, tokenIndex);
tokenIndex++;
}
}
}

Expand Down
20 changes: 12 additions & 8 deletions src/edu/stanford/nlp/pipeline/TokenizerAnnotator.java
Expand Up @@ -294,10 +294,14 @@ public Tokenizer<CoreLabel> getTokenizer(Reader r) {
return factory.getTokenizer(r);
}

public void setTokenIndex(List<CoreLabel> tokens) {
/**
* Helper method to set the TokenBeginAnnotation and TokenEndAnnotation of every token.
*/
public void setTokenBeginTokenEnd(List<CoreLabel> tokensList) {
int tokenIndex = 0;
for (CoreLabel token : tokens) {
token.set(CoreAnnotations.TokenIndexAnnotation.class, tokenIndex);
for (CoreLabel token : tokensList) {
token.set(CoreAnnotations.TokenBeginAnnotation.class, tokenIndex);
token.set(CoreAnnotations.TokenEndAnnotation.class, tokenIndex+1);
tokenIndex++;
}
}
Expand All @@ -315,7 +319,8 @@ public void annotate(Annotation annotation) {
// for Arabic and Chinese use a segmenter instead
if (useSegmenter) {
segmenterAnnotator.annotate(annotation);
setTokenIndex(annotation.get(CoreAnnotations.TokensAnnotation.class));
// set indexes into document wide tokens list
setTokenBeginTokenEnd(annotation.get(CoreAnnotations.TokensAnnotation.class));
return;
}

Expand All @@ -338,8 +343,8 @@ public void annotate(Annotation annotation) {
token.set(CoreAnnotations.IsNewlineAnnotation.class, false);
}

setTokenIndex(tokens);

// set indexes into document wide token list
setTokenBeginTokenEnd(tokens);
annotation.set(CoreAnnotations.TokensAnnotation.class, tokens);
if (VERBOSE) {
log.info("done.");
Expand Down Expand Up @@ -370,8 +375,7 @@ public Set<Class<? extends CoreAnnotation>> requirementsSatisfied() {
CoreAnnotations.IndexAnnotation.class,
CoreAnnotations.OriginalTextAnnotation.class,
CoreAnnotations.ValueAnnotation.class,
CoreAnnotations.IsNewlineAnnotation.class,
CoreAnnotations.TokenIndexAnnotation.class
CoreAnnotations.IsNewlineAnnotation.class
));
}

Expand Down
52 changes: 39 additions & 13 deletions src/edu/stanford/nlp/pipeline/WordsToSentencesAnnotator.java
Expand Up @@ -195,7 +195,6 @@ public void annotate(Annotation annotation) {

String docID = annotation.get(CoreAnnotations.DocIDAnnotation.class);
// assemble the sentence annotations
// set the initial token offset to the first non-newline token index
int lineNumber = 0;
// section annotations to mark sentences with
CoreMap sectionAnnotations = null;
Expand Down Expand Up @@ -226,11 +225,6 @@ public void annotate(Annotation annotation) {
sentence.set(CoreAnnotations.CharacterOffsetBeginAnnotation.class, begin);
sentence.set(CoreAnnotations.CharacterOffsetEndAnnotation.class, end);
sentence.set(CoreAnnotations.TokensAnnotation.class, sentenceTokens);
// get index of first token of sentence and 1 + index of last token of sentence
int tokenBeginIndex = sentenceTokens.get(0).get(CoreAnnotations.TokenIndexAnnotation.class);
sentence.set(CoreAnnotations.TokenBeginAnnotation.class, tokenBeginIndex);
int tokenEndIndex = sentenceTokens.get(last).get(CoreAnnotations.TokenIndexAnnotation.class);
sentence.set(CoreAnnotations.TokenEndAnnotation.class, tokenEndIndex+1);
sentence.set(CoreAnnotations.SentenceIndexAnnotation.class, sentences.size());

if (countLineNumbers) {
Expand Down Expand Up @@ -310,13 +304,45 @@ public void annotate(Annotation annotation) {
// add the sentence to the list
sentences.add(sentence);
}
// the condition below is possible if sentenceBoundaryToDiscard is initialized!
/*
if (tokenOffset != tokens.size()) {
throw new RuntimeException(String.format(
"expected %d tokens, found %d", tokens.size(), tokenOffset));

// after sentence splitting, remove newline tokens, set token and
// sentence indexes, and update before and after text appropriately
// at end of this annotator, it should be as though newline tokens
// were never used
// reset token indexes
List<CoreLabel> finalTokens = new ArrayList<CoreLabel>();
int tokenIndex = 0;
CoreLabel prevToken = null;
for (CoreLabel currToken : annotation.get(CoreAnnotations.TokensAnnotation.class)) {
if (!currToken.isNewline()) {
finalTokens.add(currToken);
currToken.set(CoreAnnotations.TokenBeginAnnotation.class, tokenIndex);
currToken.set(CoreAnnotations.TokenEndAnnotation.class, tokenIndex + 1);
tokenIndex++;
// fix before text for this token
if (prevToken != null && prevToken.isNewline()) {
String currTokenBeforeText = currToken.get(CoreAnnotations.BeforeAnnotation.class);
String prevTokenText = prevToken.get(CoreAnnotations.OriginalTextAnnotation.class);
currToken.set(CoreAnnotations.BeforeAnnotation.class, prevTokenText+currTokenBeforeText);
}
prevToken = currToken;
} else {
String newlineText = currToken.get(CoreAnnotations.OriginalTextAnnotation.class);
// fix after text for last token
String prevTokenAfterText = prevToken.get(CoreAnnotations.AfterAnnotation.class);
prevToken.set(CoreAnnotations.AfterAnnotation.class, prevTokenAfterText+newlineText);
prevToken = currToken;
}
*/
}
// set sentence token begin and token end values
for (CoreMap sentence : sentences) {
List<CoreLabel> sentenceTokens = sentence.get(CoreAnnotations.TokensAnnotation.class);
int sentenceTokenBegin = sentenceTokens.get(0).get(CoreAnnotations.TokenBeginAnnotation.class);
int sentenceTokenEnd = sentenceTokens.get(sentenceTokens.size()-1).get(
CoreAnnotations.TokenEndAnnotation.class);
sentence.set(CoreAnnotations.TokenBeginAnnotation.class, sentenceTokenBegin);
sentence.set(CoreAnnotations.TokenEndAnnotation.class, sentenceTokenEnd);
}

// add the sentences annotations to the document
annotation.set(CoreAnnotations.SentencesAnnotation.class, sentences);
Expand All @@ -330,7 +356,7 @@ public Set<Class<? extends CoreAnnotation>> requires() {
CoreAnnotations.TokensAnnotation.class,
CoreAnnotations.CharacterOffsetBeginAnnotation.class,
CoreAnnotations.CharacterOffsetEndAnnotation.class,
CoreAnnotations.TokenIndexAnnotation.class
CoreAnnotations.IsNewlineAnnotation.class
)));
}

Expand Down
10 changes: 5 additions & 5 deletions src/edu/stanford/nlp/sequences/CoNLLDocumentReaderAndWriter.java
Expand Up @@ -91,16 +91,16 @@ private static Iterator<String> splitIntoDocs(Reader r) {
Collection<String> docs = new ArrayList<>();
ObjectBank<String> ob = ObjectBank.getLineIterator(r);
StringBuilder current = new StringBuilder();
Matcher matcher = docPattern.matcher("");
for (String line : ob) {
if (matcher.reset(line).lookingAt()) {
if (docPattern.matcher(line).lookingAt()) {
// Start new doc, store old one if non-empty
if (current.length() > 0) {
docs.add(current.toString());
current.setLength(0);
current = new StringBuilder();
}
}
current.append(line).append('\n');
current.append(line);
current.append('\n');
}
if (current.length() > 0) {
docs.add(current.toString());
Expand Down Expand Up @@ -160,7 +160,7 @@ private CoreLabel makeCoreLabel(String line) {
wi.setWord(bits[1]);
} else {
wi.setWord(bits[0]);
}
}
wi.set(CoreAnnotations.LemmaAnnotation.class, bits[1]);
wi.setTag(bits[2]);
wi.set(CoreAnnotations.ChunkAnnotation.class, bits[3]);
Expand Down
21 changes: 9 additions & 12 deletions src/edu/stanford/nlp/sequences/ColumnDocumentReaderAndWriter.java
Expand Up @@ -31,8 +31,7 @@ public class ColumnDocumentReaderAndWriter implements DocumentReaderAndWriter<Co

// private SeqClassifierFlags flags; // = null;
//map can be something like "word=0,tag=1,answer=2"
@SuppressWarnings("rawtypes")
private Class[] map; // = null;
private String[] map; // = null;
private IteratorFromReaderFactory<List<CoreLabel>> factory;

// public void init(SeqClassifierFlags flags) {
Expand All @@ -43,13 +42,14 @@ public class ColumnDocumentReaderAndWriter implements DocumentReaderAndWriter<Co

@Override
public void init(SeqClassifierFlags flags) {
init(flags.map);
this.map = StringUtils.mapStringToArray(flags.map);
factory = DelimitRegExIterator.getFactory("\n(?:\\s*\n)+", new ColumnDocParser());
}


public void init(String map) {
// this.flags = null;
this.map = CoreLabel.parseStringKeys(StringUtils.mapStringToArray(map));
// this.flags = null;
this.map = StringUtils.mapStringToArray(map);
factory = DelimitRegExIterator.getFactory("\n(?:\\s*\n)+", new ColumnDocParser());
}

Expand All @@ -66,7 +66,7 @@ private class ColumnDocParser implements Serializable, Function<String,List<Core
private static final long serialVersionUID = -6266332661459630572L;
private final Pattern whitePattern = Pattern.compile("\\s+"); // should this really only do a tab?

private int lineCount; // = 0;
private int lineCount = 0;

@Override
public List<CoreLabel> apply(String doc) {
Expand All @@ -81,11 +81,8 @@ public List<CoreLabel> apply(String doc) {
if (line.trim().isEmpty()) {
continue;
}
// Optimistic splitting on tabs first. If that doesn't work, use any whitespace (slower, because of regexps).
String[] info = line.split("\t");
if (info.length == 1) {
info = whitePattern.split(line);
}
String[] info = whitePattern.split(line);
// todo: We could speed things up here by having one time only having converted map into an array of CoreLabel keys (Class<? extends CoreAnnotation<?>>) and then instantiating them. Need new constructor.
CoreLabel wi;
try {
wi = new CoreLabel(map, info);
Expand All @@ -110,7 +107,7 @@ public void printAnswers(List<CoreLabel> doc, PrintWriter out) {
for (CoreLabel wi : doc) {
String answer = wi.get(CoreAnnotations.AnswerAnnotation.class);
String goldAnswer = wi.get(CoreAnnotations.GoldAnswerAnnotation.class);
out.println(wi.word() + '\t' + goldAnswer + '\t' + answer);
out.println(wi.word() + "\t" + goldAnswer + "\t" + answer);
}
out.println();
}
Expand Down

0 comments on commit d8c0c99

Please sign in to comment.