Rename word vectors class; buffer readers

stanfordnlp · Oct 16, 2015 · 809b318 · 809b318
1 parent d337483
commit 809b318
Show file tree

Hide file tree

Showing 3 changed files with 26 additions and 21 deletions.
diff --git a/...stanford/nlp/neural/WordVectorsITest.java → ...u/stanford/nlp/neural/VectorMapITest.java b/...stanford/nlp/neural/WordVectorsITest.java → ...u/stanford/nlp/neural/VectorMapITest.java
@@ -4,7 +4,6 @@
 
 import java.io.File;
 import java.io.IOException;
-import java.util.HashMap;
 
 import static org.junit.Assert.assertEquals;
 
@@ -13,16 +12,16 @@
  *
  * @author Gabor Angeli
  */
-public class WordVectorsITest {
+public class VectorMapITest {
 
   @Test
   public void testReadWord2Vec() throws IOException {
-    WordVectors vec = WordVectors.readWord2Vec("/scr/nlp/data/coref/wordvectors/en/vectors.txt.gz");
+    VectorMap vec = VectorMap.readWord2Vec("/scr/nlp/data/coref/wordvectors/en/vectors.txt.gz");
     File tmp = File.createTempFile("word2vec", ".ser.gz");
     System.err.println(tmp.getPath());
     //tmp.deleteOnExit();
     vec.serialize(tmp.getPath());
-    WordVectors reread = WordVectors.deserialize(tmp.getPath());
+    VectorMap reread = VectorMap.deserialize(tmp.getPath());
     assertEquals(vec, reread);
   }
 

diff --git a/src/edu/stanford/nlp/hcoref/data/Dictionaries.java b/src/edu/stanford/nlp/hcoref/data/Dictionaries.java
@@ -15,8 +15,7 @@
 import edu.stanford.nlp.hcoref.CorefProperties;
 import edu.stanford.nlp.io.IOUtils;
 import edu.stanford.nlp.io.RuntimeIOException;
-import edu.stanford.nlp.math.ArrayMath;
-import edu.stanford.nlp.neural.WordVectors;
+import edu.stanford.nlp.neural.VectorMap;
 import edu.stanford.nlp.pipeline.DefaultPaths;
 import edu.stanford.nlp.stats.ClassicCounter;
 import edu.stanford.nlp.stats.Counter;
@@ -203,7 +202,7 @@ private void readWordLists(Locale lang) {
 
   public int dimVector;
 
-  public WordVectors vectors = new WordVectors();
+  public VectorMap vectors = new VectorMap();
 
   public Map<String, String> strToEntity = Generics.newHashMap();
   public Counter<String> dictScore = new ClassicCounter<String>();
@@ -537,10 +536,10 @@ public void loadSemantics(Properties props) throws ClassNotFoundException, IOExc
       System.err.println("LOAD: WordVectors");
       String wordvectorFile = CorefProperties.getPathSerializedWordVectors(props);
       if(new File(wordvectorFile).exists()) {
-        vectors = WordVectors.deserialize(wordvectorFile);
+        vectors = VectorMap.deserialize(wordvectorFile);
         dimVector = vectors.entrySet().iterator().next().getValue().length;
       } else {
-        vectors = WordVectors.readWord2Vec(CorefProperties.getPathWord2Vec(props));
+        vectors = VectorMap.readWord2Vec(CorefProperties.getPathWord2Vec(props));
         if (wordvectorFile != null && !wordvectorFile.startsWith("edu")) {
           vectors.serialize(wordvectorFile);
         }
@@ -573,8 +572,8 @@ public Dictionaries(Properties props) throws ClassNotFoundException, IOException
         props.getProperty(CorefProperties.STATES_PROVINCES_PROP, DefaultPaths.DEFAULT_DCOREF_STATES_AND_PROVINCES),
         CorefProperties.getSieves(props).contains("CorefDictionaryMatch"),
         PropertiesUtils.getStringArray(props, CorefProperties.DICT_LIST_PROP,
-                                       new String[]{DefaultPaths.DEFAULT_DCOREF_DICT1, DefaultPaths.DEFAULT_DCOREF_DICT2,
-                                                    DefaultPaths.DEFAULT_DCOREF_DICT3, DefaultPaths.DEFAULT_DCOREF_DICT4}),
+            new String[]{DefaultPaths.DEFAULT_DCOREF_DICT1, DefaultPaths.DEFAULT_DCOREF_DICT2,
+                DefaultPaths.DEFAULT_DCOREF_DICT3, DefaultPaths.DEFAULT_DCOREF_DICT4}),
         props.getProperty(CorefProperties.DICT_PMI_PROP, DefaultPaths.DEFAULT_DCOREF_DICT1),
         props.getProperty(CorefProperties.SIGNATURES_PROP, DefaultPaths.DEFAULT_DCOREF_NE_SIGNATURES));
     if(CorefProperties.useSemantics(props)) {

diff --git a/src/edu/stanford/nlp/neural/WordVectors.java → src/edu/stanford/nlp/neural/VectorMap.java b/src/edu/stanford/nlp/neural/WordVectors.java → src/edu/stanford/nlp/neural/VectorMap.java
@@ -6,6 +6,7 @@
 import java.io.*;
 import java.util.HashMap;
 import java.util.Map;
+import java.util.zip.GZIPOutputStream;
 
 /**
  * A serializer for reading / writing word vectors.
@@ -14,20 +15,20 @@
  *
  * @author Gabor Angeli
  */
-public class WordVectors extends HashMap<String, float[]>{
+public class VectorMap extends HashMap<String, float[]>{
 
   /**
    * Create an empty word vector storage.
    */
-  public WordVectors() {
+  public VectorMap() {
     super(1024);
   }
 
   /**
    * Initialize word vectors from a given map.
    * @param vectors The word vectors as a simple map.
    */
-  public WordVectors(Map<String, float[]> vectors) {
+  public VectorMap(Map<String, float[]> vectors) {
     super(vectors);
   }
 
@@ -39,8 +40,14 @@ public WordVectors(Map<String, float[]> vectors) {
    * @throws IOException Thrown if the file could not be written to.
    */
   public void serialize(String file) throws IOException {
-    try (OutputStream output = new FileOutputStream(new File(file))) {
-      serialize(output);
+    try (OutputStream output = new BufferedOutputStream(new FileOutputStream(new File(file)))) {
+      if (file.endsWith(".gz")) {
+        try (GZIPOutputStream gzip = new GZIPOutputStream(output)) {
+          serialize(gzip);
+        }
+      } else {
+        serialize(output);
+      }
     }
   }
 
@@ -81,7 +88,7 @@ public void serialize(OutputStream out) throws IOException {
    * @return The vectors in the file.
    * @throws IOException Thrown if we could not read from the resource
    */
-  public static WordVectors deserialize(String file) throws IOException {
+  public static VectorMap deserialize(String file) throws IOException {
     try (InputStream input = IOUtils.getInputStreamFromURLOrClasspathOrFileSystem(file)) {
       return deserialize(input);
     }
@@ -94,11 +101,11 @@ public static WordVectors deserialize(String file) throws IOException {
    * @return The word vectors encoded on the stream.
    * @throws IOException Thrown if we could not read from the stream.
    */
-  public static WordVectors deserialize(InputStream in) throws IOException {
+  public static VectorMap deserialize(InputStream in) throws IOException {
     DataInputStream dataIn = new DataInputStream(in);
     int size = dataIn.readInt();
     int dim = dataIn.readInt();
-    WordVectors vectors = new WordVectors();
+    VectorMap vectors = new VectorMap();
     for (int i = 0; i < size; ++i) {
       // Read the key
       int strlen = dataIn.readInt();
@@ -125,8 +132,8 @@ public static WordVectors deserialize(InputStream in) throws IOException {
    * @param file The word2vec text file.
    * @return The word vectors in the file.
    */
-  public static WordVectors readWord2Vec(String file) {
-    WordVectors vectors = new WordVectors();
+  public static VectorMap readWord2Vec(String file) {
+    VectorMap vectors = new VectorMap();
     int dim = -1;
     for(String line : IOUtils.readLines(file)){
       String[] split = line.toLowerCase().split("\\s+");