Skip to content

Commit

Permalink
Rename word vectors class; buffer readers
Browse files (browse the repository at this point in the history)
  • Loading branch information
Gabor Angeli authored and Stanford NLP committed Oct 16, 2015
1 parent d337483 commit 809b318
Show file tree
Hide file tree
Showing 3 changed files with 26 additions and 21 deletions.
Expand Up @@ -4,7 +4,6 @@

import java.io.File;
import java.io.IOException;
import java.util.HashMap;

import static org.junit.Assert.assertEquals;

Expand All @@ -13,16 +12,16 @@
*
* @author Gabor Angeli
*/
public class WordVectorsITest {
public class VectorMapITest {

@Test
public void testReadWord2Vec() throws IOException {
WordVectors vec = WordVectors.readWord2Vec("/scr/nlp/data/coref/wordvectors/en/vectors.txt.gz");
VectorMap vec = VectorMap.readWord2Vec("/scr/nlp/data/coref/wordvectors/en/vectors.txt.gz");
File tmp = File.createTempFile("word2vec", ".ser.gz");
System.err.println(tmp.getPath());
//tmp.deleteOnExit();
vec.serialize(tmp.getPath());
WordVectors reread = WordVectors.deserialize(tmp.getPath());
VectorMap reread = VectorMap.deserialize(tmp.getPath());
assertEquals(vec, reread);
}

Expand Down
13 changes: 6 additions & 7 deletions src/edu/stanford/nlp/hcoref/data/Dictionaries.java
Expand Up @@ -15,8 +15,7 @@
import edu.stanford.nlp.hcoref.CorefProperties;
import edu.stanford.nlp.io.IOUtils;
import edu.stanford.nlp.io.RuntimeIOException;
import edu.stanford.nlp.math.ArrayMath;
import edu.stanford.nlp.neural.WordVectors;
import edu.stanford.nlp.neural.VectorMap;
import edu.stanford.nlp.pipeline.DefaultPaths;
import edu.stanford.nlp.stats.ClassicCounter;
import edu.stanford.nlp.stats.Counter;
Expand Down Expand Up @@ -203,7 +202,7 @@ private void readWordLists(Locale lang) {

public int dimVector;

public WordVectors vectors = new WordVectors();
public VectorMap vectors = new VectorMap();

public Map<String, String> strToEntity = Generics.newHashMap();
public Counter<String> dictScore = new ClassicCounter<String>();
Expand Down Expand Up @@ -537,10 +536,10 @@ public void loadSemantics(Properties props) throws ClassNotFoundException, IOExc
System.err.println("LOAD: WordVectors");
String wordvectorFile = CorefProperties.getPathSerializedWordVectors(props);
if(new File(wordvectorFile).exists()) {
vectors = WordVectors.deserialize(wordvectorFile);
vectors = VectorMap.deserialize(wordvectorFile);
dimVector = vectors.entrySet().iterator().next().getValue().length;
} else {
vectors = WordVectors.readWord2Vec(CorefProperties.getPathWord2Vec(props));
vectors = VectorMap.readWord2Vec(CorefProperties.getPathWord2Vec(props));
if (wordvectorFile != null && !wordvectorFile.startsWith("edu")) {
vectors.serialize(wordvectorFile);
}
Expand Down Expand Up @@ -573,8 +572,8 @@ public Dictionaries(Properties props) throws ClassNotFoundException, IOException
props.getProperty(CorefProperties.STATES_PROVINCES_PROP, DefaultPaths.DEFAULT_DCOREF_STATES_AND_PROVINCES),
CorefProperties.getSieves(props).contains("CorefDictionaryMatch"),
PropertiesUtils.getStringArray(props, CorefProperties.DICT_LIST_PROP,
new String[]{DefaultPaths.DEFAULT_DCOREF_DICT1, DefaultPaths.DEFAULT_DCOREF_DICT2,
DefaultPaths.DEFAULT_DCOREF_DICT3, DefaultPaths.DEFAULT_DCOREF_DICT4}),
new String[]{DefaultPaths.DEFAULT_DCOREF_DICT1, DefaultPaths.DEFAULT_DCOREF_DICT2,
DefaultPaths.DEFAULT_DCOREF_DICT3, DefaultPaths.DEFAULT_DCOREF_DICT4}),
props.getProperty(CorefProperties.DICT_PMI_PROP, DefaultPaths.DEFAULT_DCOREF_DICT1),
props.getProperty(CorefProperties.SIGNATURES_PROP, DefaultPaths.DEFAULT_DCOREF_NE_SIGNATURES));
if(CorefProperties.useSemantics(props)) {
Expand Down
Expand Up @@ -6,6 +6,7 @@
import java.io.*;
import java.util.HashMap;
import java.util.Map;
import java.util.zip.GZIPOutputStream;

/**
* A serializer for reading / writing word vectors.
Expand All @@ -14,20 +15,20 @@
*
* @author Gabor Angeli
*/
public class WordVectors extends HashMap<String, float[]>{
public class VectorMap extends HashMap<String, float[]>{

/**
* Create an empty word vector storage.
*/
public WordVectors() {
public VectorMap() {
super(1024);
}

/**
* Initialize word vectors from a given map.
* @param vectors The word vectors as a simple map.
*/
public WordVectors(Map<String, float[]> vectors) {
public VectorMap(Map<String, float[]> vectors) {
super(vectors);
}

Expand All @@ -39,8 +40,14 @@ public WordVectors(Map<String, float[]> vectors) {
* @throws IOException Thrown if the file could not be written to.
*/
public void serialize(String file) throws IOException {
try (OutputStream output = new FileOutputStream(new File(file))) {
serialize(output);
try (OutputStream output = new BufferedOutputStream(new FileOutputStream(new File(file)))) {
if (file.endsWith(".gz")) {
try (GZIPOutputStream gzip = new GZIPOutputStream(output)) {
serialize(gzip);
}
} else {
serialize(output);
}
}
}

Expand Down Expand Up @@ -81,7 +88,7 @@ public void serialize(OutputStream out) throws IOException {
* @return The vectors in the file.
* @throws IOException Thrown if we could not read from the resource.
*/
public static WordVectors deserialize(String file) throws IOException {
public static VectorMap deserialize(String file) throws IOException {
try (InputStream input = IOUtils.getInputStreamFromURLOrClasspathOrFileSystem(file)) {
return deserialize(input);
}
Expand All @@ -94,11 +101,11 @@ public static WordVectors deserialize(String file) throws IOException {
* @return The word vectors encoded on the stream.
* @throws IOException Thrown if we could not read from the stream.
*/
public static WordVectors deserialize(InputStream in) throws IOException {
public static VectorMap deserialize(InputStream in) throws IOException {
DataInputStream dataIn = new DataInputStream(in);
int size = dataIn.readInt();
int dim = dataIn.readInt();
WordVectors vectors = new WordVectors();
VectorMap vectors = new VectorMap();
for (int i = 0; i < size; ++i) {
// Read the key
int strlen = dataIn.readInt();
Expand All @@ -125,8 +132,8 @@ public static WordVectors deserialize(InputStream in) throws IOException {
* @param file The word2vec text file.
* @return The word vectors in the file.
*/
public static WordVectors readWord2Vec(String file) {
WordVectors vectors = new WordVectors();
public static VectorMap readWord2Vec(String file) {
VectorMap vectors = new VectorMap();
int dim = -1;
for(String line : IOUtils.readLines(file)){
String[] split = line.toLowerCase().split("\\s+");
Expand Down

0 comments on commit 809b318

Please sign in to comment.