First attempt at Chinese Simple CoreNLP
Gabor Angeli authored and Stanford NLP committed Jun 2, 2016
1 parent b4ff3c7 commit adb5f3b
Showing 5 changed files with 198 additions and 100 deletions.
7 changes: 2 additions & 5 deletions src/edu/stanford/nlp/pipeline/StanfordCoreNLPServer.java
@@ -13,7 +13,6 @@
 import edu.stanford.nlp.semgraph.semgrex.SemgrexMatcher;
 import edu.stanford.nlp.semgraph.semgrex.SemgrexPattern;
 import edu.stanford.nlp.util.*;
-import org.hamcrest.beans.PropertyUtil;
 
 import javax.net.ssl.*;
 import java.io.*;
@@ -179,11 +178,9 @@ private Annotation getDocument(Properties props, HttpExchange httpExchange) thro
         encoding = defaultEncoding;
       }
 
-      String text = IOUtils.slurpReader(IOUtils.encodedInputStreamReader(httpExchange.getRequestBody(), encoding));
-      text = URLDecoder.decode(text, encoding).trim();
-      // TODO(chaganty): URLdecode string.
-      // Read the annotation
-      return new Annotation(text);
+      return new Annotation(
+          IOUtils.slurpReader(IOUtils.encodedInputStreamReader(httpExchange.getRequestBody(), encoding)));
       case "serialized":
         String inputSerializerName = props.getProperty("inputSerializer", ProtobufAnnotationSerializer.class.getName());
         AnnotationSerializer serializer = MetaClass.create(inputSerializerName).createInstance();
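The change above stops URL-decoding the request body: the posted text is now read verbatim in the given encoding. A minimal client-side sketch of the resulting contract, assuming a server listening on localhost:9000 (the endpoint, port, and properties parameter here are illustrative, not taken from this diff):

import java.io.OutputStream;
import java.net.HttpURLConnection;
import java.net.URL;
import java.net.URLEncoder;
import java.nio.charset.StandardCharsets;

public class RawTextClient {
  public static void main(String[] args) throws Exception {
    // Pipeline options still travel URL-encoded in the query string...
    String props = URLEncoder.encode("{\"annotators\": \"tokenize,ssplit\"}", "UTF-8");
    URL url = new URL("http://localhost:9000/?properties=" + props);
    HttpURLConnection conn = (HttpURLConnection) url.openConnection();
    conn.setRequestMethod("POST");
    conn.setDoOutput(true);
    // ...but the document itself is POSTed as raw bytes: after this commit
    // the server no longer runs URLDecoder.decode over the body.
    try (OutputStream os = conn.getOutputStream()) {
      os.write("Stanford is in California.".getBytes(StandardCharsets.UTF_8));
    }
    System.out.println("HTTP " + conn.getResponseCode());
  }
}

Percent-escapes in the document text now survive intact; the trade-off is that clients which previously form-encoded the body must stop doing so.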
66 changes: 66 additions & 0 deletions src/edu/stanford/nlp/simple/ChineseDocument.java
@@ -0,0 +1,66 @@
package edu.stanford.nlp.simple;

import edu.stanford.nlp.io.IOUtils;
import edu.stanford.nlp.io.RuntimeIOException;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.AnnotatorImplementations;
import edu.stanford.nlp.pipeline.CoreNLPProtos;
import edu.stanford.nlp.pipeline.StanfordCoreNLP;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.util.Properties;

/**
 * A document running with the Chinese models.
 *
 * @author <a href="mailto:gabor@eloquent.ai">Gabor Angeli</a>
 */
public class ChineseDocument extends Document {

  /**
   * An SLF4J Logger for this class.
   */
  private static final Logger log = LoggerFactory.getLogger(ChineseDocument.class);

  /**
   * The empty {@link java.util.Properties} object, for use with creating default annotators.
   */
  static final Properties EMPTY_PROPS = new Properties() {{
    try {
      load(IOUtils.getInputStreamFromURLOrClasspathOrFileSystem("edu/stanford/nlp/pipeline/StanfordCoreNLP-chinese.properties"));
    } catch (IOException e) {
      throw new RuntimeIOException(e);
    }
    setProperty("language", "chinese");
    setProperty("annotators", "");
  }};

  /**
   * Create a new document from the passed in text.
   * @param text The text of the document.
   */
  public ChineseDocument(String text) {
    super(ChineseDocument.EMPTY_PROPS, text);
  }

  /**
   * Convert a CoreNLP Annotation object to a Document.
   * @param ann The CoreNLP Annotation object.
   */
  @SuppressWarnings("Convert2streamapi")
  public ChineseDocument(Annotation ann) {
    super(ChineseDocument.EMPTY_PROPS, ann);
  }

  /**
   * Create a Document object from a read Protocol Buffer.
   * @see edu.stanford.nlp.simple.Document#serialize()
   * @param proto The protocol buffer representing this document.
   */
  public ChineseDocument(CoreNLPProtos.Document proto) {
    super(ChineseDocument.EMPTY_PROPS, proto);
  }

}
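A minimal usage sketch for the new class, assuming the Chinese CoreNLP models are on the classpath. The demo class and example text are illustrative, and the diff does not show whether the Chinese properties are threaded through the no-argument sentences() accessor:

import edu.stanford.nlp.simple.ChineseDocument;
import edu.stanford.nlp.simple.Sentence;

public class ChineseDocumentDemo {
  public static void main(String[] args) {
    // Two Chinese sentences; segmentation and sentence splitting run
    // lazily with the StanfordCoreNLP-chinese.properties configuration.
    ChineseDocument doc = new ChineseDocument("克林顿说，华盛顿将逐步落实对韩国的经济援助。这是第二句话。");
    for (Sentence sentence : doc.sentences()) {
      System.out.println(sentence.words()); // segmented tokens
    }
  }
}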
64 changes: 64 additions & 0 deletions src/edu/stanford/nlp/simple/ChineseSentence.java
@@ -0,0 +1,64 @@
package edu.stanford.nlp.simple;

import edu.stanford.nlp.io.IOUtils;
import edu.stanford.nlp.io.RuntimeIOException;
import edu.stanford.nlp.pipeline.CoreNLPProtos;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.util.List;
import java.util.Properties;

/**
 * A {@link Sentence}, but in Chinese.
 *
 * @author <a href="mailto:gabor@eloquent.ai">Gabor Angeli</a>
 */
public class ChineseSentence extends Sentence {

  /**
   * An SLF4J Logger for this class.
   */
  private static final Logger log = LoggerFactory.getLogger(ChineseSentence.class);

  /** A properties object for creating a document from a single sentence. Used in the constructor {@link Sentence#Sentence(String)}. */
  static Properties SINGLE_SENTENCE_DOCUMENT = new Properties() {{
    try {
      load(IOUtils.getInputStreamFromURLOrClasspathOrFileSystem("edu/stanford/nlp/pipeline/StanfordCoreNLP-chinese.properties"));
    } catch (IOException e) {
      throw new RuntimeIOException(e);
    }
    setProperty("language", "chinese");
    setProperty("annotators", "");
    setProperty("ssplit.isOneSentence", "true");
    setProperty("tokenize.class", "PTBTokenizer");
    setProperty("tokenize.language", "en");
  }};

  /** A properties object for creating a document from a single tokenized sentence. */
  private static Properties SINGLE_SENTENCE_TOKENIZED_DOCUMENT = new Properties() {{
    try {
      load(IOUtils.getInputStreamFromURLOrClasspathOrFileSystem("edu/stanford/nlp/pipeline/StanfordCoreNLP-chinese.properties"));
    } catch (IOException e) {
      throw new RuntimeIOException(e);
    }
    setProperty("language", "chinese");
    setProperty("annotators", "");
    setProperty("ssplit.isOneSentence", "true");
    setProperty("tokenize.class", "WhitespaceTokenizer");
    setProperty("tokenize.language", "en");
    setProperty("tokenize.whitespace", "true"); // redundant?
  }};

  public ChineseSentence(String text) {
    super(text, SINGLE_SENTENCE_DOCUMENT);
  }

  public ChineseSentence(List<String> tokens) {
    super(tokens, SINGLE_SENTENCE_TOKENIZED_DOCUMENT);
  }

  public ChineseSentence(CoreNLPProtos.Sentence proto) {
    super(proto, SINGLE_SENTENCE_DOCUMENT);
  }
}
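The two property bundles map to the two main constructors: raw text is segmented by the pipeline, while a pre-tokenized list is preserved via the whitespace tokenizer. A sketch under those assumptions (the demo class and example text are illustrative):

import edu.stanford.nlp.simple.ChineseSentence;

import java.util.Arrays;

public class ChineseSentenceDemo {
  public static void main(String[] args) {
    // Raw text: built with SINGLE_SENTENCE_DOCUMENT, so the pipeline segments it.
    ChineseSentence segmented = new ChineseSentence("我喜欢自然语言处理。");
    System.out.println(segmented.words());

    // Pre-tokenized: built with SINGLE_SENTENCE_TOKENIZED_DOCUMENT, so the
    // whitespace tokenizer preserves the given token boundaries.
    ChineseSentence tokenized = new ChineseSentence(Arrays.asList("我", "喜欢", "自然语言处理", "。"));
    System.out.println(tokenized.length());
  }
}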
45 changes: 38 additions & 7 deletions src/edu/stanford/nlp/simple/Document.java
@@ -36,6 +36,7 @@ public class Document {
    * The empty {@link java.util.Properties} object, for use with creating default annotators.
    */
   static final Properties EMPTY_PROPS = new Properties() {{
+    setProperty("language", "english");
     setProperty("annotators", "");
     setProperty("tokenize.class", "PTBTokenizer");
     setProperty("tokenize.language", "en");
@@ -48,6 +49,7 @@
    * @see Sentence#caseless()
    */
   static final Properties CASELESS_PROPS = new Properties() {{
+    setProperty("language", "english");
     setProperty("annotators", "");
     setProperty("tokenize.class", "PTBTokenizer");
     setProperty("tokenize.language", "en");
@@ -67,6 +69,10 @@
    * The default {@link edu.stanford.nlp.pipeline.TokenizerAnnotator} implementation
    */
   private static final Annotator defaultTokenize = AnnotatorFactories.tokenize(EMPTY_PROPS, backend).create();
+  /**
+   * The default {@link ChineseSegmenterAnnotator} implementation
+   */
+  private static final Annotator chineseSegmenter = new ChineseSegmenterAnnotator(false);
   /**
    * The default {@link edu.stanford.nlp.pipeline.WordsToSentencesAnnotator} implementation
    */
@@ -317,23 +323,31 @@ public static void useServer(String host, int port) {
   }
 
 
+  /**
+   * Create a new document from the passed in text and the given properties.
+   * @param text The text of the document.
+   */
+  public Document(Properties props, String text) {
+    StanfordCoreNLP.getDefaultAnnotatorPool(props, new AnnotatorImplementations()); // cache the annotator pool
+    this.impl = CoreNLPProtos.Document.newBuilder().setText(text);
+  }
+
+
   /**
    * Create a new document from the passed in text.
    * @param text The text of the document.
    */
   public Document(String text) {
-    StanfordCoreNLP.getDefaultAnnotatorPool(EMPTY_PROPS, new AnnotatorImplementations()); // cache the annotator pool
-    this.impl = CoreNLPProtos.Document.newBuilder().setText(text);
+    this(EMPTY_PROPS, text);
   }
 
   /**
    * Convert a CoreNLP Annotation object to a Document.
    * @param ann The CoreNLP Annotation object.
    */
   @SuppressWarnings("Convert2streamapi")
-  public Document(Annotation ann) {
-    StanfordCoreNLP.getDefaultAnnotatorPool(EMPTY_PROPS, new AnnotatorImplementations()); // cache the annotator pool
+  public Document(Properties props, Annotation ann) {
+    StanfordCoreNLP.getDefaultAnnotatorPool(props, new AnnotatorImplementations()); // cache the annotator pool
     this.impl = new ProtobufAnnotationSerializer(false).toProtoBuilder(ann);
     List<CoreMap> sentences = ann.get(CoreAnnotations.SentencesAnnotation.class);
     this.sentences = new ArrayList<>(sentences.size());
@@ -342,14 +356,20 @@ public Document(Annotation ann) {
     }
   }
 
+
+  /** @see Document#Document(Properties, Annotation) */
+  public Document(Annotation ann) {
+    this(Document.EMPTY_PROPS, ann);
+  }
+
   /**
    * Create a Document object from a read Protocol Buffer.
    * @see edu.stanford.nlp.simple.Document#serialize()
    * @param proto The protocol buffer representing this document.
    */
   @SuppressWarnings("Convert2streamapi")
-  public Document(CoreNLPProtos.Document proto) {
-    StanfordCoreNLP.getDefaultAnnotatorPool(EMPTY_PROPS, new AnnotatorImplementations()); // cache the annotator pool
+  public Document(Properties props, CoreNLPProtos.Document proto) {
+    StanfordCoreNLP.getDefaultAnnotatorPool(props, new AnnotatorImplementations()); // cache the annotator pool
     this.impl = proto.toBuilder();
     if (proto.getSentenceCount() > 0) {
       this.sentences = new ArrayList<>(proto.getSentenceCount());
@@ -360,6 +380,12 @@ public Document(CoreNLPProtos.Document proto) {
   }
 
 
+  /** @see Document#Document(Properties, CoreNLPProtos.Document) */
+  public Document(CoreNLPProtos.Document proto) {
+    this(Document.EMPTY_PROPS, proto);
+  }
+
+
   /**
    * Make this document caseless. That is, from now on, run the caseless models
    * on the document by default rather than the standard CoreNLP models.
@@ -552,7 +578,12 @@ public final String xmlMinified(Function<Sentence, Object>... functions) {
   public List<Sentence> sentences(Properties props) {
     if (sentences == null) {
       // Get annotators
-      Annotator tokenizer = (props == EMPTY_PROPS || props == SINGLE_SENTENCE_DOCUMENT) ? defaultTokenize : AnnotatorFactories.tokenize(props, backend).create();
+      Annotator tokenizer;
+      if ("chinese".equals(props.getProperty("language"))) {
+        tokenizer = chineseSegmenter;
+      } else {
+        tokenizer = (props == EMPTY_PROPS || props == SINGLE_SENTENCE_DOCUMENT) ? defaultTokenize : AnnotatorFactories.tokenize(props, backend).create();
+      }
       Annotator ssplit = (props == EMPTY_PROPS || props == SINGLE_SENTENCE_DOCUMENT) ? defaultSSplit : AnnotatorFactories.sentenceSplit(props, backend).create();
       // Annotate
       Annotation ann = new Annotation(this.impl.getText());
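The new branch keys tokenization off the "language" property, which is also why EMPTY_PROPS and CASELESS_PROPS now set language=english explicitly. A sketch that exercises the Chinese path through the public sentences(Properties) overload, passing the properties explicitly since the diff does not show them being stored on the Document instance (the demo class and text are illustrative):

import edu.stanford.nlp.simple.Document;
import edu.stanford.nlp.simple.Sentence;

import java.util.List;
import java.util.Properties;

public class LanguageDispatchDemo {
  public static void main(String[] args) {
    Properties props = new Properties();
    props.setProperty("language", "chinese"); // routes tokenization to the ChineseSegmenterAnnotator
    props.setProperty("annotators", "");
    Document doc = new Document(props, "今天天气很好。");
    // sentences(Properties) picks the tokenizer based on the language key.
    List<Sentence> sentences = doc.sentences(props);
    System.out.println(sentences.get(0).words());
  }
}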