First attempt at Chinese Simple CoreNLP
Gabor Angeli authored and Stanford NLP committed Jun 2, 2016
1 parent b4ff3c7 commit adb5f3b
Showing 5 changed files with 198 additions and 100 deletions.
7 changes: 2 additions & 5 deletions src/edu/stanford/nlp/pipeline/StanfordCoreNLPServer.java
@@ -13,7 +13,6 @@
 import edu.stanford.nlp.semgraph.semgrex.SemgrexMatcher;
 import edu.stanford.nlp.semgraph.semgrex.SemgrexPattern;
 import edu.stanford.nlp.util.*;
-import org.hamcrest.beans.PropertyUtil;
 
 import javax.net.ssl.*;
 import java.io.*;
@@ -179,11 +178,9 @@ private Annotation getDocument(Properties props, HttpExchange httpExchange) thro
         encoding = defaultEncoding;
       }
 
-      String text = IOUtils.slurpReader(IOUtils.encodedInputStreamReader(httpExchange.getRequestBody(), encoding));
-      text = URLDecoder.decode(text, encoding).trim();
-      // TODO(chaganty): URLdecode string.
-      // Read the annotation
-      return new Annotation(text);
+      return new Annotation(
+          IOUtils.slurpReader(IOUtils.encodedInputStreamReader(httpExchange.getRequestBody(), encoding)));
       case "serialized":
         String inputSerializerName = props.getProperty("inputSerializer", ProtobufAnnotationSerializer.class.getName());
         AnnotationSerializer serializer = MetaClass.create(inputSerializerName).createInstance();
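The change above stops URL-decoding the request body: the posted text is now read verbatim in the given encoding. A minimal client-side sketch of the resulting contract, assuming a server listening on localhost:9000 (the endpoint, port, and properties parameter here are illustrative, not taken from this diff):

import java.io.OutputStream;
import java.net.HttpURLConnection;
import java.net.URL;
import java.net.URLEncoder;
import java.nio.charset.StandardCharsets;

public class RawTextClient {
  public static void main(String[] args) throws Exception {
    // Pipeline options still travel URL-encoded in the query string...
    String props = URLEncoder.encode("{\"annotators\": \"tokenize,ssplit\"}", "UTF-8");
    URL url = new URL("http://localhost:9000/?properties=" + props);
    HttpURLConnection conn = (HttpURLConnection) url.openConnection();
    conn.setRequestMethod("POST");
    conn.setDoOutput(true);
    // ...but the document itself is POSTed as raw bytes: after this commit
    // the server no longer runs URLDecoder.decode over the body.
    try (OutputStream os = conn.getOutputStream()) {
      os.write("Stanford is in California.".getBytes(StandardCharsets.UTF_8));
    }
    System.out.println("HTTP " + conn.getResponseCode());
  }
}

Percent-escapes in the document text now survive intact; the trade-off is that clients which previously form-encoded the body must stop doing so.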
66 changes: 66 additions & 0 deletions src/edu/stanford/nlp/simple/ChineseDocument.java
@@ -0,0 +1,66 @@
package edu.stanford.nlp.simple;

import edu.stanford.nlp.io.IOUtils;
import edu.stanford.nlp.io.RuntimeIOException;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.AnnotatorImplementations;
import edu.stanford.nlp.pipeline.CoreNLPProtos;
import edu.stanford.nlp.pipeline.StanfordCoreNLP;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.util.Properties;

/**
 * A document running with the Chinese models.
 *
 * @author <a href="mailto:gabor@eloquent.ai">Gabor Angeli</a>
 */
public class ChineseDocument extends Document {

  /**
   * An SLF4J Logger for this class.
   */
  private static final Logger log = LoggerFactory.getLogger(ChineseDocument.class);

  /**
   * The empty {@link java.util.Properties} object, for use with creating default annotators.
   */
  static final Properties EMPTY_PROPS = new Properties() {{
    try {
      load(IOUtils.getInputStreamFromURLOrClasspathOrFileSystem("edu/stanford/nlp/pipeline/StanfordCoreNLP-chinese.properties"));
    } catch (IOException e) {
      throw new RuntimeIOException(e);
    }
    setProperty("language", "chinese");
    setProperty("annotators", "");
  }};

  /**
   * Create a new document from the passed in text.
   * @param text The text of the document.
   */
  public ChineseDocument(String text) {
    super(ChineseDocument.EMPTY_PROPS, text);
  }

  /**
   * Convert a CoreNLP Annotation object to a Document.
   * @param ann The CoreNLP Annotation object.
   */
  @SuppressWarnings("Convert2streamapi")
  public ChineseDocument(Annotation ann) {
    super(ChineseDocument.EMPTY_PROPS, ann);
  }

  /**
   * Create a Document object from a read Protocol Buffer.
   * @see edu.stanford.nlp.simple.Document#serialize()
   * @param proto The protocol buffer representing this document.
   */
  public ChineseDocument(CoreNLPProtos.Document proto) {
    super(ChineseDocument.EMPTY_PROPS, proto);
  }

}
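A minimal usage sketch for the new class, assuming the Chinese CoreNLP models are on the classpath. The demo class and example text are illustrative, and the diff does not show whether the Chinese properties are threaded through the no-argument sentences() accessor:

import edu.stanford.nlp.simple.ChineseDocument;
import edu.stanford.nlp.simple.Sentence;

public class ChineseDocumentDemo {
  public static void main(String[] args) {
    // Two Chinese sentences; segmentation and sentence splitting run
    // lazily with the StanfordCoreNLP-chinese.properties configuration.
    ChineseDocument doc = new ChineseDocument("克林顿说，华盛顿将逐步落实对韩国的经济援助。这是第二句话。");
    for (Sentence sentence : doc.sentences()) {
      System.out.println(sentence.words()); // segmented tokens
    }
  }
}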
64 changes: 64 additions & 0 deletions src/edu/stanford/nlp/simple/ChineseSentence.java
@@ -0,0 +1,64 @@
package edu.stanford.nlp.simple;

import edu.stanford.nlp.io.IOUtils;
import edu.stanford.nlp.io.RuntimeIOException;
import edu.stanford.nlp.pipeline.CoreNLPProtos;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.util.List;
import java.util.Properties;

/**
 * A {@link Sentence}, but in Chinese.
 *
 * @author <a href="mailto:gabor@eloquent.ai">Gabor Angeli</a>
 */
public class ChineseSentence extends Sentence {

  /**
   * An SLF4J Logger for this class.
   */
  private static final Logger log = LoggerFactory.getLogger(ChineseSentence.class);

  /** A properties object for creating a document from a single sentence. Used in the constructor {@link Sentence#Sentence(String)}. */
  static Properties SINGLE_SENTENCE_DOCUMENT = new Properties() {{
    try {
      load(IOUtils.getInputStreamFromURLOrClasspathOrFileSystem("edu/stanford/nlp/pipeline/StanfordCoreNLP-chinese.properties"));
    } catch (IOException e) {
      throw new RuntimeIOException(e);
    }
    setProperty("language", "chinese");
    setProperty("annotators", "");
    setProperty("ssplit.isOneSentence", "true");
    setProperty("tokenize.class", "PTBTokenizer");
    setProperty("tokenize.language", "en");
  }};

  /** A properties object for creating a document from a single tokenized sentence. */
  private static Properties SINGLE_SENTENCE_TOKENIZED_DOCUMENT = new Properties() {{
    try {
      load(IOUtils.getInputStreamFromURLOrClasspathOrFileSystem("edu/stanford/nlp/pipeline/StanfordCoreNLP-chinese.properties"));
    } catch (IOException e) {
      throw new RuntimeIOException(e);
    }
    setProperty("language", "chinese");
    setProperty("annotators", "");
    setProperty("ssplit.isOneSentence", "true");
    setProperty("tokenize.class", "WhitespaceTokenizer");
    setProperty("tokenize.language", "en");
    setProperty("tokenize.whitespace", "true"); // redundant?
  }};

  public ChineseSentence(String text) {
    super(text, SINGLE_SENTENCE_DOCUMENT);
  }

  public ChineseSentence(List<String> tokens) {
    super(tokens, SINGLE_SENTENCE_TOKENIZED_DOCUMENT);
  }

  public ChineseSentence(CoreNLPProtos.Sentence proto) {
    super(proto, SINGLE_SENTENCE_DOCUMENT);
  }
}
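The two property bundles map to the two main constructors: raw text is segmented by the pipeline, while a pre-tokenized list is preserved via the whitespace tokenizer. A sketch under those assumptions (the demo class and example text are illustrative):

import edu.stanford.nlp.simple.ChineseSentence;

import java.util.Arrays;

public class ChineseSentenceDemo {
  public static void main(String[] args) {
    // Raw text: built with SINGLE_SENTENCE_DOCUMENT, so the pipeline segments it.
    ChineseSentence segmented = new ChineseSentence("我喜欢自然语言处理。");
    System.out.println(segmented.words());

    // Pre-tokenized: built with SINGLE_SENTENCE_TOKENIZED_DOCUMENT, so the
    // whitespace tokenizer preserves the given token boundaries.
    ChineseSentence tokenized = new ChineseSentence(Arrays.asList("我", "喜欢", "自然语言处理", "。"));
    System.out.println(tokenized.length());
  }
}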
45 changes: 38 additions & 7 deletions src/edu/stanford/nlp/simple/Document.java
@@ -36,6 +36,7 @@ public class Document {
    * The empty {@link java.util.Properties} object, for use with creating default annotators.
    */
   static final Properties EMPTY_PROPS = new Properties() {{
+    setProperty("language", "english");
     setProperty("annotators", "");
     setProperty("tokenize.class", "PTBTokenizer");
     setProperty("tokenize.language", "en");
@@ -48,6 +49,7 @@
    * @see Sentence#caseless()
    */
   static final Properties CASELESS_PROPS = new Properties() {{
+    setProperty("language", "english");
     setProperty("annotators", "");
     setProperty("tokenize.class", "PTBTokenizer");
     setProperty("tokenize.language", "en");
@@ -67,6 +69,10 @@
    * The default {@link edu.stanford.nlp.pipeline.TokenizerAnnotator} implementation
    */
   private static final Annotator defaultTokenize = AnnotatorFactories.tokenize(EMPTY_PROPS, backend).create();
+  /**
+   * The default {@link ChineseSegmenterAnnotator} implementation
+   */
+  private static final Annotator chineseSegmenter = new ChineseSegmenterAnnotator(false);
   /**
    * The default {@link edu.stanford.nlp.pipeline.WordsToSentencesAnnotator} implementation
    */
@@ -317,23 +323,31 @@ public static void useServer(String host, int port) {
   }
 
 
+  /**
+   * Create a new document from the passed in text and the given properties.
+   * @param text The text of the document.
+   */
+  public Document(Properties props, String text) {
+    StanfordCoreNLP.getDefaultAnnotatorPool(props, new AnnotatorImplementations()); // cache the annotator pool
+    this.impl = CoreNLPProtos.Document.newBuilder().setText(text);
+  }
+
+
   /**
    * Create a new document from the passed in text.
    * @param text The text of the document.
    */
   public Document(String text) {
-    StanfordCoreNLP.getDefaultAnnotatorPool(EMPTY_PROPS, new AnnotatorImplementations()); // cache the annotator pool
-    this.impl = CoreNLPProtos.Document.newBuilder().setText(text);
+    this(EMPTY_PROPS, text);
   }
 
   /**
    * Convert a CoreNLP Annotation object to a Document.
    * @param ann The CoreNLP Annotation object.
    */
   @SuppressWarnings("Convert2streamapi")
-  public Document(Annotation ann) {
-    StanfordCoreNLP.getDefaultAnnotatorPool(EMPTY_PROPS, new AnnotatorImplementations()); // cache the annotator pool
+  public Document(Properties props, Annotation ann) {
+    StanfordCoreNLP.getDefaultAnnotatorPool(props, new AnnotatorImplementations()); // cache the annotator pool
     this.impl = new ProtobufAnnotationSerializer(false).toProtoBuilder(ann);
     List<CoreMap> sentences = ann.get(CoreAnnotations.SentencesAnnotation.class);
     this.sentences = new ArrayList<>(sentences.size());
@@ -342,14 +356,20 @@ public Document(Annotation ann) {
     }
   }
 
+
+  /** @see Document#Document(Properties, Annotation) */
+  public Document(Annotation ann) {
+    this(Document.EMPTY_PROPS, ann);
+  }
+
   /**
    * Create a Document object from a read Protocol Buffer.
    * @see edu.stanford.nlp.simple.Document#serialize()
    * @param proto The protocol buffer representing this document.
    */
   @SuppressWarnings("Convert2streamapi")
-  public Document(CoreNLPProtos.Document proto) {
-    StanfordCoreNLP.getDefaultAnnotatorPool(EMPTY_PROPS, new AnnotatorImplementations()); // cache the annotator pool
+  public Document(Properties props, CoreNLPProtos.Document proto) {
+    StanfordCoreNLP.getDefaultAnnotatorPool(props, new AnnotatorImplementations()); // cache the annotator pool
     this.impl = proto.toBuilder();
     if (proto.getSentenceCount() > 0) {
       this.sentences = new ArrayList<>(proto.getSentenceCount());
@@ -360,6 +380,12 @@ public Document(CoreNLPProtos.Document proto) {
   }
 
 
+  /** @see Document#Document(Properties, CoreNLPProtos.Document) */
+  public Document(CoreNLPProtos.Document proto) {
+    this(Document.EMPTY_PROPS, proto);
+  }
+
+
   /**
    * Make this document caseless. That is, from now on, run the caseless models
    * on the document by default rather than the standard CoreNLP models.
@@ -552,7 +578,12 @@ public final String xmlMinified(Function<Sentence, Object>... functions) {
   public List<Sentence> sentences(Properties props) {
     if (sentences == null) {
       // Get annotators
-      Annotator tokenizer = (props == EMPTY_PROPS || props == SINGLE_SENTENCE_DOCUMENT) ? defaultTokenize : AnnotatorFactories.tokenize(props, backend).create();
+      Annotator tokenizer;
+      if ("chinese".equals(props.getProperty("language"))) {
+        tokenizer = chineseSegmenter;
+      } else {
+        tokenizer = (props == EMPTY_PROPS || props == SINGLE_SENTENCE_DOCUMENT) ? defaultTokenize : AnnotatorFactories.tokenize(props, backend).create();
+      }
       Annotator ssplit = (props == EMPTY_PROPS || props == SINGLE_SENTENCE_DOCUMENT) ? defaultSSplit : AnnotatorFactories.sentenceSplit(props, backend).create();
       // Annotate
       Annotation ann = new Annotation(this.impl.getText());
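The new branch keys tokenization off the "language" property, which is also why EMPTY_PROPS and CASELESS_PROPS now set language=english explicitly. A sketch that exercises the Chinese path through the public sentences(Properties) overload, passing the properties explicitly since the diff does not show them being stored on the Document instance (the demo class and text are illustrative):

import edu.stanford.nlp.simple.Document;
import edu.stanford.nlp.simple.Sentence;

import java.util.List;
import java.util.Properties;

public class LanguageDispatchDemo {
  public static void main(String[] args) {
    Properties props = new Properties();
    props.setProperty("language", "chinese"); // routes tokenization to the ChineseSegmenterAnnotator
    props.setProperty("annotators", "");
    Document doc = new Document(props, "今天天气很好。");
    // sentences(Properties) picks the tokenizer based on the language key.
    List<Sentence> sentences = doc.sentences(props);
    System.out.println(sentences.get(0).words());
  }
}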