Update javadoc, remove warnings, fix code indentation, make ChineseSegmenterAnnotator less noisy when it runs.
stanfordnlp · Apr 10, 2015 · c5596dd · c5596dd
1 parent b9259d9
commit c5596dd
Show file tree

Hide file tree

Showing 4 changed files with 151 additions and 167 deletions.
diff --git a/src/edu/stanford/nlp/pipeline/Annotator.java b/src/edu/stanford/nlp/pipeline/Annotator.java
@@ -6,17 +6,17 @@
 import java.util.Set;
 
 /**
- * This is an interface for adding annotations to a fully annotated
+ * This is an interface for adding annotations to a partially annotated
- * Annotation.  In some ways, it is just a glorified Function, except
+ * Annotation.  In some ways, it is just a glorified function, except
- * that it explicitly operates on Annotation objects.  Annotators
+ * that it explicitly operates in-place on Annotation objects.  Annotators
  * should be given to an AnnotationPipeline in order to make
  * annotation pipelines (the whole motivation of this package), and
  * therefore implementers of this interface should be designed to play
  * well with other Annotators and in their javadocs they should
  * explicitly state what annotations they are assuming already exist
- * in the annotation (like parse, POS tag, etc), what field they are
+ * in the annotation (like parse, POS tag, etc), what keys they are
- * expecting them under (Annotation.WORDS_KEY, Annotation.PARSE_KEY,
+ * expecting them under (see, for instance, the ones in CoreAnnotations),
- * etc) and what annotations they will add (or modify) and the keys
+ * and what annotations they will add (or modify) and the keys
  * for them as well.  If you would like to look at the code for a
  * relatively simple Annotator, I recommend NERAnnotator.  For a lot
  * of code you could just add the implements directly, but I recommend
@@ -40,7 +40,7 @@ public interface Annotator {
   /**
    * Given an Annotation, perform a task on this Annotation.
    */
-  public void annotate(Annotation annotation) ;
+  void annotate(Annotation annotation);
 
   /**
    * The Requirement is a general way of describing the pre and post
@@ -55,14 +55,14 @@ public interface Annotator {
    * <br>
    * We do nothing to override the equals or hashCode methods.  This
    * means that two Requirements are equal iff they are the same
-   * object.  We do not want to use <code>name</code> to decide
+   * object.  We do not want to use {@code name} to decide
    * equality because a subclass that uses more information, such as
    * the particular kind of tsurgeon used in a hypothetical
    * TsurgeonAnnotator, cannot use a stricter equals() than the
    * superclass.  It is hard to get stricter than ==.
    */
-  public class Requirement {
+  class Requirement {
-    public final String name;
+    private final String name;
     public Requirement(String name) {
       this.name = name;
     }
@@ -76,57 +76,57 @@ public String toString() {
    * Returns a set of requirements for which tasks this annotator can
    * provide.  For example, the POS annotator will return "pos".
    */
-  public Set<Requirement> requirementsSatisfied();
+  Set<Requirement> requirementsSatisfied();
 
   /**
    * Returns the set of tasks which this annotator requires in order
    * to perform.  For example, the POS annotator will return
    * "tokenize", "ssplit".
    */
-  public Set<Requirement> requires();
+  Set<Requirement> requires();
 
   /**
    * These are annotators which StanfordCoreNLP knows how to create.
    * Add new annotators and/or annotators from other groups here!
    */
-  public static final String STANFORD_TOKENIZE = "tokenize";
+  String STANFORD_TOKENIZE = "tokenize";
-  public static final String STANFORD_CLEAN_XML = "cleanxml";
+  String STANFORD_CLEAN_XML = "cleanxml";
-  public static final String STANFORD_SSPLIT = "ssplit";
+  String STANFORD_SSPLIT = "ssplit";
-  public static final String STANFORD_POS = "pos";
+  String STANFORD_POS = "pos";
-  public static final String STANFORD_LEMMA = "lemma";
+  String STANFORD_LEMMA = "lemma";
-  public static final String STANFORD_NER = "ner";
+  String STANFORD_NER = "ner";
-  public static final String STANFORD_REGEXNER = "regexner";
+  String STANFORD_REGEXNER = "regexner";
-  public static final String STANFORD_ENTITY_MENTIONS = "entitymentions";
+  String STANFORD_ENTITY_MENTIONS = "entitymentions";
-  public static final String STANFORD_GENDER = "gender";
+  String STANFORD_GENDER = "gender";
-  public static final String STANFORD_TRUECASE = "truecase";
+  String STANFORD_TRUECASE = "truecase";
-  public static final String STANFORD_PARSE = "parse";
+  String STANFORD_PARSE = "parse";
-  public static final String STANFORD_DETERMINISTIC_COREF = "dcoref";
+  String STANFORD_DETERMINISTIC_COREF = "dcoref";
-  public static final String STANFORD_COREF = "hcoref";
+  String STANFORD_COREF = "hcoref";
-  public static final String STANFORD_RELATION = "relation";
+  String STANFORD_RELATION = "relation";
-  public static final String STANFORD_SENTIMENT = "sentiment";
+  String STANFORD_SENTIMENT = "sentiment";
-  public static final String STANFORD_COLUMN_DATA_CLASSIFIER = "cdc";
+  String STANFORD_COLUMN_DATA_CLASSIFIER = "cdc";
-  public static final String STANFORD_DEPENDENCIES = "depparse";
+  String STANFORD_DEPENDENCIES = "depparse";
-  public static final String STANFORD_NATLOG = "natlog";
+  String STANFORD_NATLOG = "natlog";
-  public static final String STANFORD_OPENIE = "openie";
+  String STANFORD_OPENIE = "openie";
-  public static final String STANFORD_QUOTE = "quote";
+  String STANFORD_QUOTE = "quote";
 
 
-  public static final Requirement TOKENIZE_REQUIREMENT = new Requirement(STANFORD_TOKENIZE);
+  Requirement TOKENIZE_REQUIREMENT = new Requirement(STANFORD_TOKENIZE);
-  public static final Requirement CLEAN_XML_REQUIREMENT = new Requirement(STANFORD_CLEAN_XML);
+  Requirement CLEAN_XML_REQUIREMENT = new Requirement(STANFORD_CLEAN_XML);
-  public static final Requirement SSPLIT_REQUIREMENT = new Requirement(STANFORD_SSPLIT);
+  Requirement SSPLIT_REQUIREMENT = new Requirement(STANFORD_SSPLIT);
-  public static final Requirement POS_REQUIREMENT = new Requirement(STANFORD_POS);
+  Requirement POS_REQUIREMENT = new Requirement(STANFORD_POS);
-  public static final Requirement LEMMA_REQUIREMENT = new Requirement(STANFORD_LEMMA);
+  Requirement LEMMA_REQUIREMENT = new Requirement(STANFORD_LEMMA);
-  public static final Requirement NER_REQUIREMENT = new Requirement(STANFORD_NER);
+  Requirement NER_REQUIREMENT = new Requirement(STANFORD_NER);
-  public static final Requirement GENDER_REQUIREMENT = new Requirement(STANFORD_GENDER);
+  Requirement GENDER_REQUIREMENT = new Requirement(STANFORD_GENDER);
-  public static final Requirement TRUECASE_REQUIREMENT = new Requirement(STANFORD_TRUECASE);
+  Requirement TRUECASE_REQUIREMENT = new Requirement(STANFORD_TRUECASE);
-  public static final Requirement PARSE_REQUIREMENT = new Requirement(STANFORD_PARSE);
+  Requirement PARSE_REQUIREMENT = new Requirement(STANFORD_PARSE);
-  public static final Requirement DEPENDENCY_REQUIREMENT = new Requirement(STANFORD_DEPENDENCIES);
+  Requirement DEPENDENCY_REQUIREMENT = new Requirement(STANFORD_DEPENDENCIES);
-  public static final Requirement DETERMINISTIC_COREF_REQUIREMENT = new Requirement(STANFORD_DETERMINISTIC_COREF);
+  Requirement DETERMINISTIC_COREF_REQUIREMENT = new Requirement(STANFORD_DETERMINISTIC_COREF);
-  public static final Requirement COREF_REQUIREMENT = new Requirement(STANFORD_COREF);
+  Requirement COREF_REQUIREMENT = new Requirement(STANFORD_COREF);
-  public static final Requirement RELATION_EXTRACTOR_REQUIREMENT = new Requirement(STANFORD_RELATION);
+  Requirement RELATION_EXTRACTOR_REQUIREMENT = new Requirement(STANFORD_RELATION);
-  public static final Requirement NATLOG_REQUIREMENT = new Requirement(STANFORD_NATLOG);
+  Requirement NATLOG_REQUIREMENT = new Requirement(STANFORD_NATLOG);
-  public static final Requirement OPENIE_REQUIREMENT = new Requirement(STANFORD_OPENIE);
+  Requirement OPENIE_REQUIREMENT = new Requirement(STANFORD_OPENIE);
-  public static final Requirement QUOTE_REQUIREMENT = new Requirement(STANFORD_QUOTE);
+  Requirement QUOTE_REQUIREMENT = new Requirement(STANFORD_QUOTE);
 
   /**
    * These are annotators which StanfordCoreNLP does not know how to
@@ -135,30 +135,31 @@ public String toString() {
    * already included in other parts of the system, such as sutime,
    * which is already included in ner.
    */
-  public static final Requirement GUTIME_REQUIREMENT = new Requirement("gutime");
+  Requirement GUTIME_REQUIREMENT = new Requirement("gutime");
-  public static final Requirement SUTIME_REQUIREMENT = new Requirement("sutime");
+  Requirement SUTIME_REQUIREMENT = new Requirement("sutime");
-  public static final Requirement HEIDELTIME_REQUIREMENT = new Requirement("heideltime");
+  Requirement HEIDELTIME_REQUIREMENT = new Requirement("heideltime");
-  public static final Requirement STEM_REQUIREMENT = new Requirement("stem");
+  Requirement STEM_REQUIREMENT = new Requirement("stem");
-  public static final Requirement NUMBER_REQUIREMENT = new Requirement("number");
+  Requirement NUMBER_REQUIREMENT = new Requirement("number");
-  public static final Requirement TIME_WORDS_REQUIREMENT = new Requirement("timewords");
+  Requirement TIME_WORDS_REQUIREMENT = new Requirement("timewords");
-  public static final Requirement QUANTIFIABLE_ENTITY_NORMALIZATION_REQUIREMENT = new Requirement("quantifiable_entity_normalization");
+  Requirement QUANTIFIABLE_ENTITY_NORMALIZATION_REQUIREMENT = new Requirement("quantifiable_entity_normalization");
-  public static final Requirement COLUMN_DATA_CLASSIFIER = new Requirement("column_data_classifer");
+  Requirement COLUMN_DATA_CLASSIFIER = new Requirement("column_data_classifer");
 
   /**
-   * The Stanford Parser can produce this if it is specifically requested
+   * The Stanford Parser can produce this if it is specifically requested.
    */
-  public static final Requirement BINARIZED_TREES_REQUIREMENT = new Requirement("binarized_trees");
+  Requirement BINARIZED_TREES_REQUIREMENT = new Requirement("binarized_trees");
 
   /**
    * These are typical combinations of annotators which may be used as
    * requirements by other annotators.
    */
-  public static final Set<Requirement> TOKENIZE_AND_SSPLIT = Collections.unmodifiableSet(new ArraySet<Requirement>(TOKENIZE_REQUIREMENT, SSPLIT_REQUIREMENT));
+  Set<Requirement> TOKENIZE_AND_SSPLIT = Collections.unmodifiableSet(new ArraySet<>(TOKENIZE_REQUIREMENT, SSPLIT_REQUIREMENT));
-  public static final Set<Requirement> TOKENIZE_SSPLIT_POS = Collections.unmodifiableSet(new ArraySet<Requirement>(TOKENIZE_REQUIREMENT, SSPLIT_REQUIREMENT, POS_REQUIREMENT));
+  Set<Requirement> TOKENIZE_SSPLIT_POS = Collections.unmodifiableSet(new ArraySet<>(TOKENIZE_REQUIREMENT, SSPLIT_REQUIREMENT, POS_REQUIREMENT));
-  public static final Set<Requirement> TOKENIZE_SSPLIT_NER = Collections.unmodifiableSet(new ArraySet<Requirement>(TOKENIZE_REQUIREMENT, SSPLIT_REQUIREMENT, NER_REQUIREMENT));
+  Set<Requirement> TOKENIZE_SSPLIT_NER = Collections.unmodifiableSet(new ArraySet<>(TOKENIZE_REQUIREMENT, SSPLIT_REQUIREMENT, NER_REQUIREMENT));
-  public static final Set<Requirement> TOKENIZE_SSPLIT_PARSE = Collections.unmodifiableSet(new ArraySet<Requirement>(TOKENIZE_REQUIREMENT, SSPLIT_REQUIREMENT, PARSE_REQUIREMENT));
+  Set<Requirement> TOKENIZE_SSPLIT_PARSE = Collections.unmodifiableSet(new ArraySet<>(TOKENIZE_REQUIREMENT, SSPLIT_REQUIREMENT, PARSE_REQUIREMENT));
-  public static final Set<Requirement> TOKENIZE_SSPLIT_PARSE_NER = Collections.unmodifiableSet(new ArraySet<Requirement>(TOKENIZE_REQUIREMENT, SSPLIT_REQUIREMENT, PARSE_REQUIREMENT, NER_REQUIREMENT));
+  Set<Requirement> TOKENIZE_SSPLIT_PARSE_NER = Collections.unmodifiableSet(new ArraySet<>(TOKENIZE_REQUIREMENT, SSPLIT_REQUIREMENT, PARSE_REQUIREMENT, NER_REQUIREMENT));
-  public static final Set<Requirement> TOKENIZE_SSPLIT_POS_LEMMA = Collections.unmodifiableSet(new ArraySet<Requirement>(TOKENIZE_REQUIREMENT, SSPLIT_REQUIREMENT, POS_REQUIREMENT, LEMMA_REQUIREMENT));
+  Set<Requirement> TOKENIZE_SSPLIT_POS_LEMMA = Collections.unmodifiableSet(new ArraySet<>(TOKENIZE_REQUIREMENT, SSPLIT_REQUIREMENT, POS_REQUIREMENT, LEMMA_REQUIREMENT));
-  public static final Set<Requirement> PARSE_AND_TAG = Collections.unmodifiableSet(new ArraySet<Requirement>(POS_REQUIREMENT, PARSE_REQUIREMENT));
+  Set<Requirement> PARSE_AND_TAG = Collections.unmodifiableSet(new ArraySet<>(POS_REQUIREMENT, PARSE_REQUIREMENT));
-  public static final Set<Requirement> PARSE_TAG_BINARIZED_TREES = Collections.unmodifiableSet(new ArraySet<Requirement>(POS_REQUIREMENT, PARSE_REQUIREMENT, BINARIZED_TREES_REQUIREMENT));
+  Set<Requirement> PARSE_TAG_BINARIZED_TREES = Collections.unmodifiableSet(new ArraySet<>(POS_REQUIREMENT, PARSE_REQUIREMENT, BINARIZED_TREES_REQUIREMENT));
+
 }
diff --git a/src/edu/stanford/nlp/pipeline/ChineseSegmenterAnnotator.java b/src/edu/stanford/nlp/pipeline/ChineseSegmenterAnnotator.java
@@ -13,26 +13,23 @@
 import edu.stanford.nlp.ling.CoreAnnotations;
 import edu.stanford.nlp.util.CoreMap;
 import edu.stanford.nlp.util.PropertiesUtils;
-import edu.stanford.nlp.util.Timing;
 
 /**
- * This class will add Segmentation information to an
+ * This class will add segmentation information to an Annotation.
- * Annotation.  
+ * It assumes that the original document is a List of sentences under the
- * It assumes that the original String or List<String> is under the Annotation.ORIG_STRING_KEY 
+ * SentencesAnnotation.class key, and that each sentence has a
- * and also corresponding character level information is under Annotation.WORDS_KEY
+ * TextAnnotation.class key. This Annotator adds corresponding
- * and addes segmentation information to each CoreLabel,
+ * information under a CharactersAnnotation.class key prior to segmentation,
- * in the CoreLabel.CH_SEG_KEY field.
+ * and a TokensAnnotation.class key with value of a List of CoreLabel
+ * after segmentation.
  *
  * @author Pi-Chuan Chang
  */
 public class ChineseSegmenterAnnotator implements Annotator {
 
-  private AbstractSequenceClassifier<?> segmenter = null;
+  private AbstractSequenceClassifier<?> segmenter;
+  private final boolean VERBOSE;
 
-  private Timing timer = new Timing();
-  private static long millisecondsAnnotating = 0;
-  private boolean VERBOSE = false;
-
   private static final String DEFAULT_SEG_LOC =
     "/u/nlp/data/gale/segtool/stanford-seg/classifiers-2010/05202008-ctb6.processed-chris6.lex.gz";
 
@@ -49,7 +46,7 @@ public ChineseSegmenterAnnotator() {
   public ChineseSegmenterAnnotator(boolean verbose) {
     this(DEFAULT_SEG_LOC, verbose);
   }
-  
+
   public ChineseSegmenterAnnotator(String segLoc, boolean verbose) {
     this(segLoc, verbose, DEFAULT_SER_DICTIONARY, DEFAULT_SIGHAN_CORPORA_DICT);
   }
@@ -66,10 +63,11 @@ public ChineseSegmenterAnnotator(String name, Properties props) {
     String model = null;
     // Keep only the properties that apply to this annotator
     Properties modelProps = new Properties();
+    String desiredKey = name + '.';
     for (String key : props.stringPropertyNames()) {
-      if (key.startsWith(name + ".")) {
+      if (key.startsWith(desiredKey)) {
         // skip past name and the subsequent "."
-        String modelKey = key.substring(name.length() + 1);
+        String modelKey = key.substring(desiredKey.length());
         if (modelKey.equals("model")) {
           model = props.getProperty(key);
         } else {
@@ -83,22 +81,20 @@ public ChineseSegmenterAnnotator(String name, Properties props) {
     }
     loadModel(model, modelProps);
   }
-
+
+  @SuppressWarnings("unused")
   private void loadModel(String segLoc) {
-    if (VERBOSE) {    
+    // don't write very much, because the CRFClassifier already reports loading
-      timer.start();
+    if (VERBOSE) {
-      System.err.print("Loading Segmentation Model ["+segLoc+"]...");
+      System.err.print("Loading segmentation model ... ");
     }
     segmenter = CRFClassifier.getClassifierNoExceptions(segLoc);
-    if (VERBOSE) {    
-      timer.stop("done.");
-    }
   }
-  
+
   private void loadModel(String segLoc, Properties props) {
+    // don't write very much, because the CRFClassifier already reports loading
     if (VERBOSE) {
-      timer.start();
+      System.err.print("Loading Segmentation Model ... ");
-      System.err.print("Loading Segmentation Model ["+segLoc+"]...");
     }
     try {
       segmenter = CRFClassifier.getClassifier(segLoc, props);
@@ -107,15 +103,12 @@ private void loadModel(String segLoc, Properties props) {
     } catch (Exception e) {
       throw new RuntimeException(e);
     }
-    if (VERBOSE) {    
-      timer.stop("done.");
-    }
   }
-
+
+  @Override
   public void annotate(Annotation annotation) {
-    if (VERBOSE) {    
+    if (VERBOSE) {
-      timer.start();
+      System.err.print("Adding Segmentation annotation ... ");
-      System.err.print("Adding Segmentation annotation...");
     }
     List<CoreMap> sentences = annotation.get(CoreAnnotations.SentencesAnnotation.class);
     if (sentences != null) {
@@ -125,22 +118,18 @@ public void annotate(Annotation annotation) {
     } else {
       doOneSentence(annotation);
     }
-    if (VERBOSE) {    
-      millisecondsAnnotating += timer.stop("done.");
-      //System.err.println("output: "+l+"\n"); 
-    }    
   }
 
-  public void doOneSentence(CoreMap annotation) {
+  private void doOneSentence(CoreMap annotation) {
     splitCharacters(annotation);
     runSegmentation(annotation);
   }
 
-  public void splitCharacters(CoreMap annotation) {
+  private static void splitCharacters(CoreMap annotation) {
     String origText = annotation.get(CoreAnnotations.TextAnnotation.class);
-    
+
     boolean seg = true;
-    List<CoreLabel> words = new ArrayList<CoreLabel>();
+    List<CoreLabel> words = new ArrayList<>();
 
     for (int i = 0; i < origText.length(); i++) {
       CoreLabel wi = new CoreLabel();
@@ -150,7 +139,6 @@ public void splitCharacters(CoreMap annotation) {
       // if this word is a whitespace or a control character, set 'seg' to true for next word, and break
       if (Character.isWhitespace(origText.charAt(i)) || Character.isISOControl(origText.charAt(i))) {
         seg = true;
-        continue;
       } else {
         // if this word is a word, put it as a feature label and set seg to false for next word
         wi.set(CoreAnnotations.ChineseCharAnnotation.class, wordString);
@@ -167,21 +155,18 @@ public void splitCharacters(CoreMap annotation) {
     }
 
     annotation.set(ChineseCoreAnnotations.CharactersAnnotation.class, words);
-    if (VERBOSE) {
-      System.err.println("output: " + words);
-    }    
   }
 
-  public void runSegmentation(CoreMap annotation) {
+  private void runSegmentation(CoreMap annotation) {
     //0 2
     // A BC D E
     // 1 10 1 1
     // 0 12 3 4
-    // 0, 0+1 , 
+    // 0, 0+1 ,
-    
+
     String text = annotation.get(CoreAnnotations.TextAnnotation.class);
     List<CoreLabel> sentChars = annotation.get(ChineseCoreAnnotations.CharactersAnnotation.class);
-    List<CoreLabel> tokens = new ArrayList<CoreLabel>();
+    List<CoreLabel> tokens = new ArrayList<>();
     annotation.set(CoreAnnotations.TokensAnnotation.class, tokens);
 
     List<String> words = segmenter.segmentString(text);
@@ -190,12 +175,12 @@ public void runSegmentation(CoreMap annotation) {
       System.err.println("--->");
       System.err.println(words);
     }
-    
+
     int pos = 0;
     for (String w : words) {
       CoreLabel fl = sentChars.get(pos);
       fl.set(CoreAnnotations.ChineseSegAnnotation.class, "1");
-      if (w.length() == 0) {
+      if (w.isEmpty()) {
         continue;
       }
       CoreLabel token = new CoreLabel();
@@ -218,4 +203,5 @@ public Set<Requirement> requires() {
   public Set<Requirement> requirementsSatisfied() {
     return Collections.singleton(TOKENIZE_REQUIREMENT);
   }
+
 }