Skip to content

Commit

Permalink
redo handling of token indexes and tokenizing newlines
Browse files Browse the repository at this point in the history
  • Loading branch information
J38 authored and Stanford NLP committed Feb 18, 2018
1 parent 3a30f2b commit d8c0c99
Show file tree
Hide file tree
Showing 6 changed files with 82 additions and 107 deletions.
66 changes: 3 additions & 63 deletions src/edu/stanford/nlp/ling/CoreLabel.java
Expand Up @@ -29,7 +29,7 @@
* @author dramage
* @author rafferty
*/
public class CoreLabel extends ArrayCoreMap implements AbstractCoreLabel, HasCategory /* , HasContext */ {
public class CoreLabel extends ArrayCoreMap implements AbstractCoreLabel, HasCategory, HasContext {

private static final long serialVersionUID = 2L;

Expand Down Expand Up @@ -136,20 +136,6 @@ public CoreLabel(String[] keys, String[] values) {
initFromStrings(keys, values);
}

/**
 * This constructor uses preparsed Class keys.
 * It's mainly useful for reading from a file.
 *
 * @param keys Array of annotation Class keys
 * @param values Array of values (as String), parallel to {@code keys}
 */
@SuppressWarnings("rawtypes")
public CoreLabel(Class[] keys, String[] values) {
  super(keys.length);
  initFromStrings(keys, values);
}

/** This is provided as a simple way to make a CoreLabel for a word from a String.
* It's often useful in fixup or test code. It sets all three of the Text, OriginalText,
* and Value annotations to the given value.
Expand Down Expand Up @@ -177,7 +163,7 @@ public interface GenericAnnotation<T> extends CoreAnnotation<T> { }
public static final Map<Class<? extends GenericAnnotation>, String> genericValues = Generics.newHashMap();


@SuppressWarnings({"unchecked", "rawtypes"})
@SuppressWarnings("unchecked")
private void initFromStrings(String[] keys, String[] values) {
if (keys.length != values.length) {
throw new UnsupportedOperationException("Argument array lengths differ: " +
Expand Down Expand Up @@ -241,52 +227,6 @@ private void initFromStrings(String[] keys, String[] values) {
}
}

/**
 * Translates an array of String key names into the corresponding
 * annotation Class keys, using AnnotationLookup for the mapping.
 *
 * @param keys Array of String key names
 * @return Array of annotation Class keys, parallel to {@code keys}
 * @throws UnsupportedOperationException if any key name is unknown
 */
@SuppressWarnings("rawtypes")
public static Class[] parseStringKeys(String[] keys) {
  Class[] coreKeyClasses = new Class[keys.length];
  int i = 0;
  for (String keyName : keys) {
    Class coreKeyClass = AnnotationLookup.toCoreKey(keyName);
    // a null lookup result means the name doesn't correspond to any known key
    if (coreKeyClass == null) {
      throw new UnsupportedOperationException("Unknown key " + keyName);
    }
    coreKeyClasses[i++] = coreKeyClass;
  }
  return coreKeyClasses;
}

/**
 * Initializes this label from parallel arrays of annotation Class keys and
 * String values, converting each value to the key's declared value type.
 * Supported value types are String, Integer, Double, and Long.
 *
 * @param keys Array of annotation Class keys
 * @param values Array of values as Strings, parallel to {@code keys}
 * @throws UnsupportedOperationException if the arrays differ in length or a
 *     value cannot be converted to its key's declared value type
 */
@SuppressWarnings({"unchecked", "rawtypes"})
private void initFromStrings(Class[] keys, String[] values) {
  if (keys.length != values.length) {
    throw new UnsupportedOperationException("Argument array lengths differ: " +
        Arrays.toString(keys) + " vs. " + Arrays.toString(values));
  }
  for (int i = 0; i < keys.length; i++) {
    Class coreKeyClass = keys[i];
    String value = values[i];
    try {
      Class<?> valueClass = AnnotationLookup.getValueType(coreKeyClass);
      // convert the raw String to the declared value type before storing
      Object parsedValue;
      if (valueClass == String.class) {
        parsedValue = value;
      } else if (valueClass == Integer.class) {
        parsedValue = Integer.parseInt(value);
      } else if (valueClass == Double.class) {
        parsedValue = Double.parseDouble(value);
      } else if (valueClass == Long.class) {
        parsedValue = Long.parseLong(value);
      } else {
        throw new RuntimeException("Can't handle " + valueClass);
      }
      this.set(coreKeyClass, parsedValue);
    } catch (Exception e) {
      // wrap conversion failures with the key/value context, preserving the cause
      throw new UnsupportedOperationException("CORE: CoreLabel.initFromStrings: "
          + "Bad type for " + coreKeyClass.getSimpleName()
          + ". Value was: " + value
          + "; expected "+AnnotationLookup.getValueType(coreKeyClass), e);
    }
  }
}

private static class CoreLabelFactory implements LabelFactory {

Expand Down Expand Up @@ -789,6 +729,6 @@ public String toString(OutputFormat format) {
}

private static final Comparator<Class<?>> asClassComparator =
Comparator.comparing(Class::getName);
(o1, o2) -> o1.getName().compareTo(o2.getName());

}
20 changes: 14 additions & 6 deletions src/edu/stanford/nlp/pipeline/CleanXmlAnnotator.java
Expand Up @@ -351,6 +351,18 @@ private static void addAnnotationPatterns(CollectionValuedMap<Class, Pair<Patter
}
}

/**
 * Helper method to set the TokenBeginAnnotation and TokenEndAnnotation of
 * every token, numbering tokens consecutively from 0 in list order.
 *
 * @param tokensList the tokens to renumber
 */
public void setTokenBeginTokenEnd(List<CoreLabel> tokensList) {
  for (int i = 0; i < tokensList.size(); i++) {
    CoreLabel token = tokensList.get(i);
    // token i spans the half-open index range [i, i+1)
    token.set(CoreAnnotations.TokenBeginAnnotation.class, i);
    token.set(CoreAnnotations.TokenEndAnnotation.class, i + 1);
  }
}

@Override
public void annotate(Annotation annotation) {
if (annotation.containsKey(CoreAnnotations.TokensAnnotation.class)) {
Expand All @@ -359,14 +371,10 @@ public void annotate(Annotation annotation) {
List<CoreLabel> newTokens = process(annotation, tokens);
// We assume that if someone is using this annotator, they don't
// want the old tokens any more and get rid of them
// redo the token indexes if xml tokens have been removed
setTokenBeginTokenEnd(newTokens);
annotation.set(CoreAnnotations.TokensAnnotation.class, newTokens);
if (DEBUG) { log.info("CleanXML: ending tokens: " + annotation.get(CoreAnnotations.TokensAnnotation.class)); }
// update token index annotation
int tokenIndex = 0;
for (CoreLabel token : newTokens) {
token.set(CoreAnnotations.TokenIndexAnnotation.class, tokenIndex);
tokenIndex++;
}
}
}

Expand Down
20 changes: 12 additions & 8 deletions src/edu/stanford/nlp/pipeline/TokenizerAnnotator.java
Expand Up @@ -294,10 +294,14 @@ public Tokenizer<CoreLabel> getTokenizer(Reader r) {
return factory.getTokenizer(r);
}

public void setTokenIndex(List<CoreLabel> tokens) {
/**
* Helper method to set the TokenBeginAnnotation and TokenEndAnnotation of every token.
*/
public void setTokenBeginTokenEnd(List<CoreLabel> tokensList) {
int tokenIndex = 0;
for (CoreLabel token : tokens) {
token.set(CoreAnnotations.TokenIndexAnnotation.class, tokenIndex);
for (CoreLabel token : tokensList) {
token.set(CoreAnnotations.TokenBeginAnnotation.class, tokenIndex);
token.set(CoreAnnotations.TokenEndAnnotation.class, tokenIndex+1);
tokenIndex++;
}
}
Expand All @@ -315,7 +319,8 @@ public void annotate(Annotation annotation) {
// for Arabic and Chinese use a segmenter instead
if (useSegmenter) {
segmenterAnnotator.annotate(annotation);
setTokenIndex(annotation.get(CoreAnnotations.TokensAnnotation.class));
// set indexes into document wide tokens list
setTokenBeginTokenEnd(annotation.get(CoreAnnotations.TokensAnnotation.class));
return;
}

Expand All @@ -338,8 +343,8 @@ public void annotate(Annotation annotation) {
token.set(CoreAnnotations.IsNewlineAnnotation.class, false);
}

setTokenIndex(tokens);

// set indexes into document wide token list
setTokenBeginTokenEnd(tokens);
annotation.set(CoreAnnotations.TokensAnnotation.class, tokens);
if (VERBOSE) {
log.info("done.");
Expand Down Expand Up @@ -370,8 +375,7 @@ public Set<Class<? extends CoreAnnotation>> requirementsSatisfied() {
CoreAnnotations.IndexAnnotation.class,
CoreAnnotations.OriginalTextAnnotation.class,
CoreAnnotations.ValueAnnotation.class,
CoreAnnotations.IsNewlineAnnotation.class,
CoreAnnotations.TokenIndexAnnotation.class
CoreAnnotations.IsNewlineAnnotation.class
));
}

Expand Down
52 changes: 39 additions & 13 deletions src/edu/stanford/nlp/pipeline/WordsToSentencesAnnotator.java
Expand Up @@ -195,7 +195,6 @@ public void annotate(Annotation annotation) {

String docID = annotation.get(CoreAnnotations.DocIDAnnotation.class);
// assemble the sentence annotations
// set the initial token offset to the first non-newline token index
int lineNumber = 0;
// section annotations to mark sentences with
CoreMap sectionAnnotations = null;
Expand Down Expand Up @@ -226,11 +225,6 @@ public void annotate(Annotation annotation) {
sentence.set(CoreAnnotations.CharacterOffsetBeginAnnotation.class, begin);
sentence.set(CoreAnnotations.CharacterOffsetEndAnnotation.class, end);
sentence.set(CoreAnnotations.TokensAnnotation.class, sentenceTokens);
// get index of first token of sentence and 1 + index of last token of sentence
int tokenBeginIndex = sentenceTokens.get(0).get(CoreAnnotations.TokenIndexAnnotation.class);
sentence.set(CoreAnnotations.TokenBeginAnnotation.class, tokenBeginIndex);
int tokenEndIndex = sentenceTokens.get(last).get(CoreAnnotations.TokenIndexAnnotation.class);
sentence.set(CoreAnnotations.TokenEndAnnotation.class, tokenEndIndex+1);
sentence.set(CoreAnnotations.SentenceIndexAnnotation.class, sentences.size());

if (countLineNumbers) {
Expand Down Expand Up @@ -310,13 +304,45 @@ public void annotate(Annotation annotation) {
// add the sentence to the list
sentences.add(sentence);
}
// the condition below is possible if sentenceBoundaryToDiscard is initialized!
/*
if (tokenOffset != tokens.size()) {
throw new RuntimeException(String.format(
"expected %d tokens, found %d", tokens.size(), tokenOffset));

// after sentence splitting, remove newline tokens, set token and
// sentence indexes, and update before and after text appropriately
// at end of this annotator, it should be as though newline tokens
// were never used
// reset token indexes
List<CoreLabel> finalTokens = new ArrayList<CoreLabel>();
int tokenIndex = 0;
CoreLabel prevToken = null;
for (CoreLabel currToken : annotation.get(CoreAnnotations.TokensAnnotation.class)) {
if (!currToken.isNewline()) {
finalTokens.add(currToken);
currToken.set(CoreAnnotations.TokenBeginAnnotation.class, tokenIndex);
currToken.set(CoreAnnotations.TokenEndAnnotation.class, tokenIndex + 1);
tokenIndex++;
// fix before text for this token
if (prevToken != null && prevToken.isNewline()) {
String currTokenBeforeText = currToken.get(CoreAnnotations.BeforeAnnotation.class);
String prevTokenText = prevToken.get(CoreAnnotations.OriginalTextAnnotation.class);
currToken.set(CoreAnnotations.BeforeAnnotation.class, prevTokenText+currTokenBeforeText);
}
prevToken = currToken;
} else {
String newlineText = currToken.get(CoreAnnotations.OriginalTextAnnotation.class);
// fix after text for last token
String prevTokenAfterText = prevToken.get(CoreAnnotations.AfterAnnotation.class);
prevToken.set(CoreAnnotations.AfterAnnotation.class, prevTokenAfterText+newlineText);
prevToken = currToken;
}
*/
}
// set sentence token begin and token end values
for (CoreMap sentence : sentences) {
List<CoreLabel> sentenceTokens = sentence.get(CoreAnnotations.TokensAnnotation.class);
int sentenceTokenBegin = sentenceTokens.get(0).get(CoreAnnotations.TokenBeginAnnotation.class);
int sentenceTokenEnd = sentenceTokens.get(sentenceTokens.size()-1).get(
CoreAnnotations.TokenEndAnnotation.class);
sentence.set(CoreAnnotations.TokenBeginAnnotation.class, sentenceTokenBegin);
sentence.set(CoreAnnotations.TokenEndAnnotation.class, sentenceTokenEnd);
}

// add the sentences annotations to the document
annotation.set(CoreAnnotations.SentencesAnnotation.class, sentences);
Expand All @@ -330,7 +356,7 @@ public Set<Class<? extends CoreAnnotation>> requires() {
CoreAnnotations.TokensAnnotation.class,
CoreAnnotations.CharacterOffsetBeginAnnotation.class,
CoreAnnotations.CharacterOffsetEndAnnotation.class,
CoreAnnotations.TokenIndexAnnotation.class
CoreAnnotations.IsNewlineAnnotation.class
)));
}

Expand Down
10 changes: 5 additions & 5 deletions src/edu/stanford/nlp/sequences/CoNLLDocumentReaderAndWriter.java
Expand Up @@ -91,16 +91,16 @@ private static Iterator<String> splitIntoDocs(Reader r) {
Collection<String> docs = new ArrayList<>();
ObjectBank<String> ob = ObjectBank.getLineIterator(r);
StringBuilder current = new StringBuilder();
Matcher matcher = docPattern.matcher("");
for (String line : ob) {
if (matcher.reset(line).lookingAt()) {
if (docPattern.matcher(line).lookingAt()) {
// Start new doc, store old one if non-empty
if (current.length() > 0) {
docs.add(current.toString());
current.setLength(0);
current = new StringBuilder();
}
}
current.append(line).append('\n');
current.append(line);
current.append('\n');
}
if (current.length() > 0) {
docs.add(current.toString());
Expand Down Expand Up @@ -160,7 +160,7 @@ private CoreLabel makeCoreLabel(String line) {
wi.setWord(bits[1]);
} else {
wi.setWord(bits[0]);
}
}
wi.set(CoreAnnotations.LemmaAnnotation.class, bits[1]);
wi.setTag(bits[2]);
wi.set(CoreAnnotations.ChunkAnnotation.class, bits[3]);
Expand Down
21 changes: 9 additions & 12 deletions src/edu/stanford/nlp/sequences/ColumnDocumentReaderAndWriter.java
Expand Up @@ -31,8 +31,7 @@ public class ColumnDocumentReaderAndWriter implements DocumentReaderAndWriter<Co

// private SeqClassifierFlags flags; // = null;
//map can be something like "word=0,tag=1,answer=2"
@SuppressWarnings("rawtypes")
private Class[] map; // = null;
private String[] map; // = null;
private IteratorFromReaderFactory<List<CoreLabel>> factory;

// public void init(SeqClassifierFlags flags) {
Expand All @@ -43,13 +42,14 @@ public class ColumnDocumentReaderAndWriter implements DocumentReaderAndWriter<Co

@Override
public void init(SeqClassifierFlags flags) {
init(flags.map);
this.map = StringUtils.mapStringToArray(flags.map);
factory = DelimitRegExIterator.getFactory("\n(?:\\s*\n)+", new ColumnDocParser());
}


public void init(String map) {
// this.flags = null;
this.map = CoreLabel.parseStringKeys(StringUtils.mapStringToArray(map));
// this.flags = null;
this.map = StringUtils.mapStringToArray(map);
factory = DelimitRegExIterator.getFactory("\n(?:\\s*\n)+", new ColumnDocParser());
}

Expand All @@ -66,7 +66,7 @@ private class ColumnDocParser implements Serializable, Function<String,List<Core
private static final long serialVersionUID = -6266332661459630572L;
private final Pattern whitePattern = Pattern.compile("\\s+"); // should this really only do a tab?

private int lineCount; // = 0;
private int lineCount = 0;

@Override
public List<CoreLabel> apply(String doc) {
Expand All @@ -81,11 +81,8 @@ public List<CoreLabel> apply(String doc) {
if (line.trim().isEmpty()) {
continue;
}
// Optimistic splitting on tabs first. If that doesn't work, use any whitespace (slower, because of regexps).
String[] info = line.split("\t");
if (info.length == 1) {
info = whitePattern.split(line);
}
String[] info = whitePattern.split(line);
// todo: We could speed things up here by having one time only having converted map into an array of CoreLabel keys (Class<? extends CoreAnnotation<?>>) and then instantiating them. Need new constructor.
CoreLabel wi;
try {
wi = new CoreLabel(map, info);
Expand All @@ -110,7 +107,7 @@ public void printAnswers(List<CoreLabel> doc, PrintWriter out) {
for (CoreLabel wi : doc) {
String answer = wi.get(CoreAnnotations.AnswerAnnotation.class);
String goldAnswer = wi.get(CoreAnnotations.GoldAnswerAnnotation.class);
out.println(wi.word() + '\t' + goldAnswer + '\t' + answer);
out.println(wi.word() + "\t" + goldAnswer + "\t" + answer);
}
out.println();
}
Expand Down

0 comments on commit d8c0c99

Please sign in to comment.