Skip to content

Commit

Permalink
Tweak JSON schema; save docdates to proto
Browse files Browse the repository at this point in the history
  • Loading branch information
Gabor Angeli authored and Stanford NLP committed Jul 18, 2016
1 parent 73d125d commit 261862c
Show file tree
Hide file tree
Showing 33 changed files with 80,245 additions and 80,169 deletions.
Expand Up @@ -26,10 +26,10 @@ public void setUp()
}

public void testPipeline() {
String query = "وما هي كلمتُك المفضلة للدراسة؟";
String[] expectedWords = {"و", "ما", "هي", "كلمة", "ك", "المفضلة", "ل", "الدراسة", "?"};
int[] expectedStartPositions = {0, 1, 4, 7, 12, 14, 22, 23, 29};
int[] expectedEndPositions = {1, 3, 6, 11, 13, 21, 23, 29, 30};
String query = "وما هي كلمتك المفضلة؟";
String[] expectedWords = {"و", "ما", "هي", "كلمة", "ك", "المفضلة", "?"};
int[] expectedStartPositions = {0, 1, 4, 7, 11, 13, 20};
int[] expectedEndPositions = {1, 3, 6, 11, 12, 20, 21};
Annotation annotation = new Annotation(query);
pipeline.annotate(annotation);

Expand Down
Expand Up @@ -197,7 +197,6 @@ public static void sameAsRead(Annotation doc, Annotation readDoc) {
}
}
}
} else {
assertTrue("Annotations don't match (don't know why?)", false);
}
}
Expand Down Expand Up @@ -437,6 +436,26 @@ public void testGender() {
}


/**
 * Checks that an annotation carrying a {@code DocDateAnnotation} can be written to a
 * protocol buffer and read back: the document is serialized with {@code toProto},
 * rebuilt with {@code fromProto}, and the two are compared with {@code sameAsRead}.
 */
@Test
public void testDocDate() {
  Annotation original = new Annotation("hello world");
  original.set(CoreAnnotations.DocDateAnnotation.class, "2016-05-05");

  // Serialize and immediately deserialize with the same serializer instance.
  ProtobufAnnotationSerializer protoSerializer = new ProtobufAnnotationSerializer();
  Annotation roundTripped = protoSerializer.fromProto(protoSerializer.toProto(original));

  sameAsRead(original, roundTripped);
}


/**
 * Checks that an annotation carrying a {@code CalendarAnnotation} can be written to a
 * protocol buffer and read back: the document is serialized with {@code toProto},
 * rebuilt with {@code fromProto}, and the two are compared with {@code sameAsRead}.
 */
@Test
public void testCalendar() {
  Annotation original = new Annotation("hello world");
  // The no-arg GregorianCalendar constructor captures the current time in the
  // default time zone, so the value under test differs from run to run.
  original.set(CoreAnnotations.CalendarAnnotation.class, new GregorianCalendar());

  // Serialize and immediately deserialize with the same serializer instance.
  ProtobufAnnotationSerializer protoSerializer = new ProtobufAnnotationSerializer();
  Annotation roundTripped = protoSerializer.fromProto(protoSerializer.toProto(original));

  sameAsRead(original, roundTripped);
}


@Test
public void testShiftReduce() {
testAnnotators("tokenize,ssplit,pos,parse",
Expand Down
Expand Up @@ -30,9 +30,7 @@
import edu.stanford.nlp.stats.ClassicCounter;
import edu.stanford.nlp.stats.Counter;
import edu.stanford.nlp.trees.Tree;
import edu.stanford.nlp.util.CollectionUtils;
import edu.stanford.nlp.util.Generics;
import edu.stanford.nlp.util.IntPair;
import edu.stanford.nlp.util.PropertiesUtils;
import edu.stanford.nlp.util.StringUtils;
import edu.stanford.nlp.util.concurrent.MulticoreWrapper;
Expand Down Expand Up @@ -245,44 +243,18 @@ public List<HasWord> segment(String line) {
return SentenceUtils.toWordList(segmentedString.split("\\s+"));
}

private List<CoreLabel> segmentStringToIOB(String line) {
public String segmentString(String line) {
List<CoreLabel> tokenList;
if (tf == null) {
// Whitespace tokenization.
tokenList = IOBUtils.StringToIOB(line);
} else {
List<CoreLabel> tokens = tf.getTokenizer(new StringReader(line)).tokenize();
tokenList = IOBUtils.StringToIOB(tokens, null, false, tf, line);
tokenList = IOBUtils.StringToIOB(tokens, null, false);
}
IOBUtils.labelDomain(tokenList, domain);
tokenList = classifier.classify(tokenList);
return tokenList;
}

/**
 * Segments {@code line} and returns one {@code CoreLabel} per token span reported by
 * {@code IOBUtils.TokenSpansForIOB} over the labeled sequence from
 * {@code segmentStringToIOB}.
 *
 * <p>Each returned label has its word, value and {@code TextAnnotation} set to the
 * span's text as rendered by {@code IOBUtils.IOBToString}, its
 * {@code ArabicSegAnnotation} set to {@code "1"}, its original text set to the matching
 * substring of {@code line}, and its character begin/end offsets set from the first and
 * last labels of the span.
 *
 * @param line the input text to segment
 * @return the segmented tokens, in the order the spans are produced
 */
public List<CoreLabel> segmentStringToTokenList(String line) {
  List<CoreLabel> segments = CollectionUtils.makeList();
  List<CoreLabel> iobLabels = segmentStringToIOB(line);

  for (IntPair tokenSpan : IOBUtils.TokenSpansForIOB(iobLabels)) {
    CoreLabel segment = new CoreLabel();

    String surface = IOBUtils.IOBToString(
        iobLabels, prefixMarker, suffixMarker, tokenSpan.getSource(), tokenSpan.getTarget());
    segment.setWord(surface);
    segment.setValue(surface);
    segment.set(CoreAnnotations.TextAnnotation.class, surface);
    segment.set(CoreAnnotations.ArabicSegAnnotation.class, "1");

    // The span target is used as an exclusive bound: the last label is at target - 1.
    int beginOffset = iobLabels.get(tokenSpan.getSource()).beginPosition();
    int endOffset = iobLabels.get(tokenSpan.getTarget() - 1).endPosition();
    segment.setOriginalText(line.substring(beginOffset, endOffset));
    segment.set(CoreAnnotations.CharacterOffsetBeginAnnotation.class, beginOffset);
    segment.set(CoreAnnotations.CharacterOffsetEndAnnotation.class, endOffset);

    segments.add(segment);
  }
  return segments;
}

public String segmentString(String line) {
List<CoreLabel> labeledSequence = segmentStringToIOB(line);
String segmentedString = IOBUtils.IOBToString(labeledSequence, prefixMarker, suffixMarker);
String segmentedString = IOBUtils.IOBToString(tokenList, prefixMarker, suffixMarker);
return segmentedString;
}

Expand Down

0 comments on commit 261862c

Please sign in to comment.