Update the names of the experimental factory functions for vocab and vectors (#1029)
zhangguanheng66 committed Nov 7, 2020
1 parent 88e55d7 commit 94ec092
Showing 11 changed files with 70 additions and 72 deletions.
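For readers skimming the diff, a rough summary of the renames, based only on the changes below (a sketch of the renamed entry points, not the full module surface):

# Old experimental factory names (removed in this commit)    ->  new names
#   torchtext.experimental.vocab.vocab_from_file             ->  load_vocab_from_file
#   torchtext.experimental.vocab.vocab_from_raw_text_file    ->  build_vocab_from_text_file
#   torchtext.experimental.vectors.vectors                    ->  build_vectors
#   torchtext.experimental.vectors.vectors_from_file_object   ->  load_vectors_from_file_path (now takes a path)
from torchtext.experimental.vocab import load_vocab_from_file, build_vocab_from_text_file
from torchtext.experimental.vectors import build_vectors, load_vectors_from_file_path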
10 changes: 5 additions & 5 deletions benchmark/benchmark_experimental_vocab.py
@@ -6,8 +6,8 @@
from torchtext.experimental.datasets import AG_NEWS
from torchtext.experimental.vocab import (
vocab as VocabExperimental,
vocab_from_file,
vocab_from_raw_text_file
load_vocab_from_file,
build_vocab_from_text_file
)
from torchtext.vocab import (
Vocab,
@@ -68,11 +68,11 @@ def benchmark_experimental_vocab_construction(vocab_file_path, is_raw_text=True,
for _ in range(num_iters):
tokenizer = basic_english_normalize()
jited_tokenizer = torch.jit.script(tokenizer.to_ivalue())
vocab_from_raw_text_file(f, jited_tokenizer, num_cpus=1)
build_vocab_from_text_file(f, jited_tokenizer, num_cpus=1)
print("Construction time:", time.monotonic() - t0)
else:
for _ in range(num_iters):
vocab_from_file(f)
load_vocab_from_file(f)
print("Construction time:", time.monotonic() - t0)


@@ -121,7 +121,7 @@ def token_iterator(file_path):
print("Vocab Experimental")
t0 = time.monotonic()
f = open(vocab_file_path, 'r')
v_experimental = vocab_from_file(f)
v_experimental = load_vocab_from_file(f)
print("Construction time:", time.monotonic() - t0)
else:
print("Loading Vocab from AG News")
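A condensed usage sketch of the two renamed vocab factories, consistent with the benchmark code above; the file names are hypothetical, and basic_english_normalize is assumed to come from torchtext.experimental.transforms:

import torch
from torchtext.experimental.transforms import basic_english_normalize
from torchtext.experimental.vocab import load_vocab_from_file, build_vocab_from_text_file

with open('vocab.txt', 'r') as f:                # hypothetical file: one token per line
    v = load_vocab_from_file(f)

with open('corpus.txt', 'r') as f:               # hypothetical file: raw text
    tokenizer = basic_english_normalize()
    jit_tokenizer = torch.jit.script(tokenizer.to_ivalue())
    v = build_vocab_from_text_file(f, jit_tokenizer, num_cpus=1)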
16 changes: 8 additions & 8 deletions docs/source/experimental_vectors.rst
@@ -14,22 +14,22 @@ torchtext.experimental.vectors
:members:
:special-members:

:hidden:`vectors`
~~~~~~~~~~~~~~~~~
:hidden:`build_vectors`
~~~~~~~~~~~~~~~~~~~~~~~

.. autofunction:: vectors
.. autofunction:: build_vectors

:hidden:`vectors_from_file_object`
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
:hidden:`load_vectors_from_file_path`
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

.. autofunction:: vectors_from_file_object
.. autofunction:: load_vectors_from_file_path

:hidden:`FastText`
~~~~~~~~~~~~~~~~~~~
~~~~~~~~~~~~~~~~~~

.. autofunction:: FastText

:hidden:`GloVe`
~~~~~~~~~~~~~~~~
~~~~~~~~~~~~~~~

.. autofunction:: GloVe
14 changes: 7 additions & 7 deletions docs/source/experimental_vocab.rst
@@ -2,7 +2,7 @@
:class: hidden-section

torchtext.experimental.vocab
==============================
============================

.. automodule:: torchtext.experimental.vocab
.. currentmodule:: torchtext.experimental.vocab
@@ -19,15 +19,15 @@ torchtext.experimental.vocab

.. autofunction:: vocab

:hidden:`vocab_from_file`
~~~~~~~~~~~~~~~~~~~~~~~~~
:hidden:`load_vocab_from_file`
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

.. autofunction:: vocab_from_file
.. autofunction:: load_vocab_from_file

:hidden:`vocab_from_raw_text_file`
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
:hidden:`build_vocab_from_text_file`
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

.. autofunction:: vocab_from_raw_text_file
.. autofunction:: build_vocab_from_text_file

:hidden:`build_vocab_from_iterator`
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
4 changes: 2 additions & 2 deletions examples/data_pipeline/pipelines.py
@@ -18,7 +18,7 @@
sequential_transforms,
)
from torchtext.experimental.vectors import FastText as FastTextExperimental
from torchtext.experimental.vocab import vocab_from_file
from torchtext.experimental.vocab import load_vocab_from_file
from torchtext.vocab import FastText

import argparse
@@ -57,7 +57,7 @@ def token_iterator(vocab_file):
def build_experimental_torchtext_pipeline(hf_vocab_file):
tokenizer = basic_english_normalize()
with open(hf_vocab_file, 'r') as f:
vocab = vocab_from_file(f)
vocab = load_vocab_from_file(f)
pipeline = TextSequentialTransforms(tokenizer, vocab)
jit_pipeline = torch.jit.script(pipeline.to_ivalue())
print('jit experimental torchtext pipeline success!')
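A minimal sketch of the updated pipeline construction in build_experimental_torchtext_pipeline, assuming TextSequentialTransforms and basic_english_normalize are importable from torchtext.experimental.transforms (the vocab file name is hypothetical):

import torch
from torchtext.experimental.transforms import basic_english_normalize, TextSequentialTransforms
from torchtext.experimental.vocab import load_vocab_from_file

with open('vocab.txt', 'r') as f:                # hypothetical vocab file
    vocab = load_vocab_from_file(f)
pipeline = TextSequentialTransforms(basic_english_normalize(), vocab)
jit_pipeline = torch.jit.script(pipeline.to_ivalue())   # scriptable only after to_ivalue()
ids = jit_pipeline('of that new')                # returns token ids, e.g. [7, 18, 24] in the tests below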
23 changes: 11 additions & 12 deletions test/experimental/test_transforms_with_asset.py
@@ -10,17 +10,17 @@
TextSequentialTransforms,
)
from torchtext.experimental.vocab import (
vocab_from_file,
vocab_from_raw_text_file,
load_vocab_from_file,
build_vocab_from_text_file,
)
import shutil
import tempfile
import os
from torchtext.experimental.vectors import (
GloVe,
vectors,
build_vectors,
FastText,
vectors_from_file_object,
load_vectors_from_file_path,
)
from torchtext.utils import download_from_url

@@ -30,7 +30,7 @@ def test_vocab_transform(self):
asset_name = 'vocab_test2.txt'
asset_path = get_asset_path(asset_name)
with open(asset_path, 'r') as f:
vocab_transform = VocabTransform(vocab_from_file(f))
vocab_transform = VocabTransform(load_vocab_from_file(f))
self.assertEqual(vocab_transform(['of', 'that', 'new']),
[7, 18, 24])
jit_vocab_transform = torch.jit.script(vocab_transform.to_ivalue())
@@ -44,15 +44,15 @@ def test_errors_vectors_python(self):
with self.assertRaises(ValueError):
# Test proper error raised when passing in empty tokens and vectors and
# not passing in a user defined unk_tensor
vectors(tokens, vecs)
build_vectors(tokens, vecs)

tensorA = torch.tensor([1, 0, 0], dtype=torch.int8)
tokens = ['a']
vecs = tensorA.unsqueeze(0)

with self.assertRaises(TypeError):
# Test proper error raised when vector is not of type torch.float
vectors(tokens, vecs)
build_vectors(tokens, vecs)

with tempfile.TemporaryDirectory() as dir_name:
# Test proper error raised when incorrect filename or dim passed into GloVe
@@ -131,7 +131,7 @@ def test_vocab_from_file(self):
asset_name = 'vocab_test.txt'
asset_path = get_asset_path(asset_name)
with open(asset_path, 'r') as f:
v = vocab_from_file(f, unk_token='<new_unk>')
v = load_vocab_from_file(f, unk_token='<new_unk>')
expected_itos = ['<new_unk>', 'b', 'a', 'c']
expected_stoi = {x: index for index, x in enumerate(expected_itos)}
self.assertEqual(v.get_itos(), expected_itos)
@@ -143,7 +143,7 @@ def test_vocab_from_raw_text_file(self):
with open(asset_path, 'r') as f:
tokenizer = basic_english_normalize()
jit_tokenizer = torch.jit.script(tokenizer.to_ivalue())
v = vocab_from_raw_text_file(f, jit_tokenizer, unk_token='<new_unk>')
v = build_vocab_from_text_file(f, jit_tokenizer, unk_token='<new_unk>')
expected_itos = ['<new_unk>', "'", 'after', 'talks', '.', 'are', 'at', 'disappointed',
'fears', 'federal', 'firm', 'for', 'mogul', 'n', 'newall', 'parent',
'pension', 'representing', 'say', 'stricken', 't', 'they', 'turner',
@@ -173,16 +173,15 @@ def test_text_sequential_transform(self):
asset_name = 'vocab_test2.txt'
asset_path = get_asset_path(asset_name)
with open(asset_path, 'r') as f:
pipeline = TextSequentialTransforms(basic_english_normalize(), vocab_from_file(f))
pipeline = TextSequentialTransforms(basic_english_normalize(), load_vocab_from_file(f))
jit_pipeline = torch.jit.script(pipeline.to_ivalue())
self.assertEqual(pipeline('of that new'), [7, 18, 24])
self.assertEqual(jit_pipeline('of that new'), [7, 18, 24])

def test_vectors_from_file(self):
asset_name = 'vectors_test.csv'
asset_path = get_asset_path(asset_name)
f = open(asset_path, 'r')
vectors_obj = vectors_from_file_object(f)
vectors_obj = load_vectors_from_file_path(asset_path)

expected_tensorA = torch.tensor([1, 0, 0], dtype=torch.float)
expected_tensorB = torch.tensor([0, 1, 0], dtype=torch.float)
20 changes: 10 additions & 10 deletions test/experimental/test_vectors.py
@@ -5,7 +5,7 @@
import unittest
from test.common.torchtext_test_case import TorchtextTestCase
from torchtext.experimental.vectors import (
vectors,
build_vectors,
)


@@ -20,7 +20,7 @@ def test_empty_vectors(self):
vecs = torch.empty(0, dtype=torch.float)
unk_tensor = torch.tensor([0], dtype=torch.float)

vectors_obj = vectors(tokens, vecs, unk_tensor)
vectors_obj = build_vectors(tokens, vecs, unk_tensor)
self.assertEqual(vectors_obj['not_in_it'], unk_tensor)

def test_empty_unk(self):
@@ -29,7 +29,7 @@ def test_empty_unk(self):

tokens = ['a']
vecs = tensorA.unsqueeze(0)
vectors_obj = vectors(tokens, vecs)
vectors_obj = build_vectors(tokens, vecs)

self.assertEqual(vectors_obj['not_in_it'], expected_unk_tensor)

@@ -40,7 +40,7 @@ def test_vectors_basic(self):
unk_tensor = torch.tensor([0, 0], dtype=torch.float)
tokens = ['a', 'b']
vecs = torch.stack((tensorA, tensorB), 0)
vectors_obj = vectors(tokens, vecs, unk_tensor=unk_tensor)
vectors_obj = build_vectors(tokens, vecs, unk_tensor=unk_tensor)

self.assertEqual(vectors_obj['a'], tensorA)
self.assertEqual(vectors_obj['b'], tensorB)
@@ -53,7 +53,7 @@ def test_vectors_jit(self):
unk_tensor = torch.tensor([0, 0], dtype=torch.float)
tokens = ['a', 'b']
vecs = torch.stack((tensorA, tensorB), 0)
vectors_obj = vectors(tokens, vecs, unk_tensor=unk_tensor)
vectors_obj = build_vectors(tokens, vecs, unk_tensor=unk_tensor)
jit_vectors_obj = torch.jit.script(vectors_obj.to_ivalue())

assert not vectors_obj.is_jitable
@@ -70,7 +70,7 @@ def test_vectors_forward(self):
unk_tensor = torch.tensor([0, 0], dtype=torch.float)
tokens = ['a', 'b']
vecs = torch.stack((tensorA, tensorB), 0)
vectors_obj = vectors(tokens, vecs, unk_tensor=unk_tensor)
vectors_obj = build_vectors(tokens, vecs, unk_tensor=unk_tensor)
jit_vectors_obj = torch.jit.script(vectors_obj.to_ivalue())

tokens_to_lookup = ['a', 'b', 'c']
@@ -88,7 +88,7 @@ def test_vectors_lookup_vectors(self):
unk_tensor = torch.tensor([0, 0], dtype=torch.float)
tokens = ['a', 'b']
vecs = torch.stack((tensorA, tensorB), 0)
vectors_obj = vectors(tokens, vecs, unk_tensor=unk_tensor)
vectors_obj = build_vectors(tokens, vecs, unk_tensor=unk_tensor)

tokens_to_lookup = ['a', 'b', 'c']
expected_vectors = torch.stack((tensorA, tensorB, unk_tensor), 0)
@@ -102,7 +102,7 @@ def test_vectors_add_item(self):

tokens = ['a']
vecs = tensorA.unsqueeze(0)
vectors_obj = vectors(tokens, vecs, unk_tensor=unk_tensor)
vectors_obj = build_vectors(tokens, vecs, unk_tensor=unk_tensor)

tensorB = torch.tensor([0, 1], dtype=torch.float)
vectors_obj['b'] = tensorB
@@ -118,7 +118,7 @@ def test_vectors_load_and_save(self):

tokens = ['a', 'b']
vecs = torch.stack((tensorA, tensorB), 0)
vectors_obj = vectors(tokens, vecs)
vectors_obj = build_vectors(tokens, vecs)

tensorC = torch.tensor([1, 1], dtype=torch.float)
vectors_obj['b'] = tensorC
@@ -145,4 +145,4 @@ def test_errors_vectors_cpp(self):
# Test proper error raised when tokens have duplicates
# TODO: use self.assertRaisesRegex() to check
# the key of the duplicate token in the error message
vectors(tokens, vecs)
build_vectors(tokens, vecs)
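Pulling the pieces of the tests above together, a condensed sketch of the renamed build_vectors factory (the token and vector values are illustrative):

import torch
from torchtext.experimental.vectors import build_vectors

tokens = ['a', 'b']
vecs = torch.stack((torch.tensor([1., 0.]), torch.tensor([0., 1.])), 0)
unk_tensor = torch.zeros(2)

vectors_obj = build_vectors(tokens, vecs, unk_tensor=unk_tensor)
vectors_obj['a']                             # tensor([1., 0.])
vectors_obj['not_in_it']                     # falls back to unk_tensor
vectors_obj['c'] = torch.tensor([1., 1.])    # new tokens can be added, as in test_vectors_add_item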
2 changes: 1 addition & 1 deletion torchtext/csrc/register_bindings.cpp
@@ -68,7 +68,7 @@ PYBIND11_MODULE(_torchtext, m) {
m.def("_load_token_and_vectors_from_file",
&_load_token_and_vectors_from_file);
m.def("_load_vocab_from_file", &_load_vocab_from_file);
m.def("_load_vocab_from_raw_text_file", _load_vocab_from_raw_text_file);
m.def("_build_vocab_from_text_file", _build_vocab_from_text_file);
}

// Registers our custom classes with torch.
8 changes: 4 additions & 4 deletions torchtext/csrc/vocab.cpp
@@ -330,10 +330,10 @@ Vocab _load_vocab_from_file(const std::string &file_path,
return Vocab(std::move(tokens), std::move(stoi), unk_token, unk_index);
}

Vocab _load_vocab_from_raw_text_file(const std::string &file_path,
const std::string &unk_token,
const int64_t min_freq,
const int64_t num_cpus, py::object fn) {
Vocab _build_vocab_from_text_file(const std::string &file_path,
const std::string &unk_token,
const int64_t min_freq,
const int64_t num_cpus, py::object fn) {
std::cerr << "[INFO] Reading file " << file_path << std::endl;

torch::jit::script::Module module(*torch::jit::as_module(fn));
10 changes: 5 additions & 5 deletions torchtext/csrc/vocab.h
@@ -41,10 +41,10 @@ VocabStates _set_vocab_states(const c10::intrusive_ptr<Vocab> &self);
Vocab _load_vocab_from_file(const std::string &file_path,
const std::string &unk_token,
const int64_t min_freq, const int64_t num_cpus);
Vocab _load_vocab_from_raw_text_file(const std::string &file_path,
const std::string &unk_token,
const int64_t min_freq,
const int64_t num_cpus,
py::object tokenizer);
Vocab _build_vocab_from_text_file(const std::string &file_path,
const std::string &unk_token,
const int64_t min_freq,
const int64_t num_cpus,
py::object tokenizer);

} // namespace torchtext
14 changes: 7 additions & 7 deletions torchtext/experimental/vectors.py
@@ -17,8 +17,8 @@
__all__ = [
'FastText',
'GloVe',
'vectors_from_file_object',
'vectors',
'load_vectors_from_file_path',
'build_vectors',
'Vectors'
]

@@ -153,8 +153,8 @@ def GloVe(name="840B", dim=300, unk_tensor=None, root=".data", validate_file=Tru
return vectors_obj


def vectors_from_file_object(file_like_object, delimiter=",", unk_tensor=None, num_cpus=10):
r"""Create a Vectors object from a csv file like object.
def load_vectors_from_file_path(filepath, delimiter=",", unk_tensor=None, num_cpus=10):
r"""Create a Vectors object from a csv file path.
Note that the tensor corresponding to each vector is of type `torch.float`.
@@ -165,7 +165,7 @@ def vectors_from_file_object(file_like_object, delimiter=",", unk_tensor=None, n
token_n<delimiter>num_m num_j num_k
Args:
file_like_object (FileObject): a file like object to read data from.
filepath: a file path to read data from.
delimiter (char): a character to delimit between the token and the vector. Default value is ","
unk_tensor (Tensor): a 1d tensor representing the vector associated with an unknown token.
num_cpus (int): the number of cpus to use when loading the vectors from file. Default: 10.
@@ -177,13 +177,13 @@ def vectors_from_file_object(file_like_object, delimiter=",", unk_tensor=None, n
ValueError: if duplicate tokens are found in FastText file.
"""
vectors_obj, dup_tokens = _load_token_and_vectors_from_file(file_like_object.name, delimiter, num_cpus, unk_tensor)
vectors_obj, dup_tokens = _load_token_and_vectors_from_file(filepath, delimiter, num_cpus, unk_tensor)
if dup_tokens:
raise ValueError("Found duplicate tokens in file: {}".format(str(dup_tokens)))
return Vectors(vectors_obj)


def vectors(tokens, vectors, unk_tensor=None):
def build_vectors(tokens, vectors, unk_tensor=None):
r"""Factory method for creating a vectors object which maps tokens to vectors.
Arguments:
tokens (List[str]): a list of tokens.
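Finally, a small sketch of the new load_vectors_from_file_path entry point; the CSV path and contents are hypothetical but follow the token_n<delimiter>num_m num_j num_k layout described in the docstring above:

from torchtext.experimental.vectors import load_vectors_from_file_path

# vectors.csv (hypothetical), one token per line, comma-delimited from its vector:
#   a,1 0 0
#   b,0 1 0
vectors_obj = load_vectors_from_file_path('vectors.csv', delimiter=',')
vectors_obj['a']    # tensor([1., 0., 0.]) for the test asset vectors_test.csv above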