Tweak JSON schema; save docdates to proto

stanfordnlp · Jul 18, 2016 · 261862c · 261862c
1 parent 73d125d
commit 261862c
Show file tree

Hide file tree

Showing 33 changed files with 80,245 additions and 80,169 deletions.
diff --git a/itest/src/edu/stanford/nlp/pipeline/ArabicSegmenterAnnotatorITest.java b/itest/src/edu/stanford/nlp/pipeline/ArabicSegmenterAnnotatorITest.java
@@ -26,10 +26,10 @@ public void setUp()
   }
 
   public void testPipeline() {
-    String query = "وما هي كلمتُك المفضلة للدراسة؟";
-    String[] expectedWords = {"و", "ما", "هي", "كلمة", "ك", "المفضلة", "ل", "الدراسة", "?"};
-    int[] expectedStartPositions = {0, 1, 4, 7, 12, 14, 22, 23, 29};
-    int[] expectedEndPositions = {1, 3, 6, 11, 13, 21, 23, 29, 30};
+    String query = "وما هي كلمتك المفضلة؟";
+    String[] expectedWords = {"و", "ما", "هي", "كلمة", "ك", "المفضلة", "?"};
+    int[] expectedStartPositions = {0, 1, 4, 7, 11, 13, 20};
+    int[] expectedEndPositions = {1, 3, 6, 11, 12, 20, 21};
     Annotation annotation = new Annotation(query);
     pipeline.annotate(annotation);
 

diff --git a/itest/src/edu/stanford/nlp/pipeline/ProtobufAnnotationSerializerSlowITest.java b/itest/src/edu/stanford/nlp/pipeline/ProtobufAnnotationSerializerSlowITest.java
@@ -197,7 +197,6 @@ public static void sameAsRead(Annotation doc, Annotation readDoc) {
             }
           }
         }
-      } else {
         assertTrue("Annotations don't match (don't know why?)", false);
       }
     }
@@ -437,6 +436,26 @@ public void testGender() {
   }
 
 
+  @Test
+  public void testDocDate() {  // round-trips an Annotation that carries a DocDateAnnotation through the protobuf serializer
+    Annotation ann = new Annotation("hello world");
+    ann.set(CoreAnnotations.DocDateAnnotation.class, "2016-05-05");  // the document date is stored as a plain String here
+    ProtobufAnnotationSerializer serializer = new ProtobufAnnotationSerializer();
+    Annotation reread = serializer.fromProto(serializer.toProto(ann));  // serialize to proto, then read it straight back
+    sameAsRead(ann, reread);  // helper compares original vs. re-read copy (its body is only partly visible in this diff)
+  }
+
+
+  @Test
+  public void testCalendar() {  // round-trips an Annotation that carries a CalendarAnnotation through the protobuf serializer
+    Annotation ann = new Annotation("hello world");
+    ann.set(CoreAnnotations.CalendarAnnotation.class, new GregorianCalendar());  // no-arg constructor: value is the current time, so it differs per run
+    ProtobufAnnotationSerializer serializer = new ProtobufAnnotationSerializer();
+    Annotation reread = serializer.fromProto(serializer.toProto(ann));  // serialize to proto, then read it straight back
+    sameAsRead(ann, reread);  // helper compares original vs. re-read copy (its body is only partly visible in this diff)
+  }
+
+
   @Test
   public void testShiftReduce() {
     testAnnotators("tokenize,ssplit,pos,parse",

diff --git a/src/edu/stanford/nlp/international/arabic/process/ArabicSegmenter.java b/src/edu/stanford/nlp/international/arabic/process/ArabicSegmenter.java
@@ -30,9 +30,7 @@
 import edu.stanford.nlp.stats.ClassicCounter;
 import edu.stanford.nlp.stats.Counter;
 import edu.stanford.nlp.trees.Tree;
-import edu.stanford.nlp.util.CollectionUtils;
 import edu.stanford.nlp.util.Generics;
-import edu.stanford.nlp.util.IntPair;
 import edu.stanford.nlp.util.PropertiesUtils;
 import edu.stanford.nlp.util.StringUtils;
 import edu.stanford.nlp.util.concurrent.MulticoreWrapper;
@@ -245,44 +243,18 @@ public List<HasWord> segment(String line) {
     return SentenceUtils.toWordList(segmentedString.split("\\s+"));
   }
 
-  private List<CoreLabel> segmentStringToIOB(String line) {
+  public String segmentString(String line) {
     List<CoreLabel> tokenList;
     if (tf == null) {
       // Whitespace tokenization.
       tokenList = IOBUtils.StringToIOB(line);
     } else {
       List<CoreLabel> tokens = tf.getTokenizer(new StringReader(line)).tokenize();
-      tokenList = IOBUtils.StringToIOB(tokens, null, false, tf, line);
+      tokenList = IOBUtils.StringToIOB(tokens, null, false);
     }
     IOBUtils.labelDomain(tokenList, domain);
     tokenList = classifier.classify(tokenList);
-    return tokenList;
-  }
-
-  public List<CoreLabel> segmentStringToTokenList(String line) {
-    List<CoreLabel> tokenList = CollectionUtils.makeList();
-    List<CoreLabel> labeledSequence = segmentStringToIOB(line);
-    for (IntPair span : IOBUtils.TokenSpansForIOB(labeledSequence)) {
-      CoreLabel token = new CoreLabel();
-      String text = IOBUtils.IOBToString(labeledSequence, prefixMarker, suffixMarker,
-          span.getSource(), span.getTarget());
-      token.setWord(text);
-      token.setValue(text);
-      token.set(CoreAnnotations.TextAnnotation.class, text);
-      token.set(CoreAnnotations.ArabicSegAnnotation.class, "1");
-      int start = labeledSequence.get(span.getSource()).beginPosition();
-      int end = labeledSequence.get(span.getTarget() - 1).endPosition();
-      token.setOriginalText(line.substring(start, end));
-      token.set(CoreAnnotations.CharacterOffsetBeginAnnotation.class, start);
-      token.set(CoreAnnotations.CharacterOffsetEndAnnotation.class, end);
-      tokenList.add(token);
-    }
-    return tokenList;
-  }
-
-  public String segmentString(String line) {
-    List<CoreLabel> labeledSequence = segmentStringToIOB(line);
-    String segmentedString = IOBUtils.IOBToString(labeledSequence, prefixMarker, suffixMarker);
+    String segmentedString = IOBUtils.IOBToString(tokenList, prefixMarker, suffixMarker);
     return segmentedString;
   }