Update the names of the experimental factory functions for vocab and vectors (#1029)
zhangguanheng66 committed Nov 7, 2020
1 parent 88e55d7 commit 94ec092
Showing 11 changed files with 70 additions and 72 deletions.
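For readers skimming the diff, a rough summary of the renames, based only on the changes below (a sketch of the renamed entry points, not the full module surface):

# Old experimental factory names (removed in this commit)    ->  new names
#   torchtext.experimental.vocab.vocab_from_file             ->  load_vocab_from_file
#   torchtext.experimental.vocab.vocab_from_raw_text_file    ->  build_vocab_from_text_file
#   torchtext.experimental.vectors.vectors                    ->  build_vectors
#   torchtext.experimental.vectors.vectors_from_file_object   ->  load_vectors_from_file_path (now takes a path)
from torchtext.experimental.vocab import load_vocab_from_file, build_vocab_from_text_file
from torchtext.experimental.vectors import build_vectors, load_vectors_from_file_path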
10 changes: 5 additions & 5 deletions benchmark/benchmark_experimental_vocab.py
@@ -6,8 +6,8 @@
from torchtext.experimental.datasets import AG_NEWS
from torchtext.experimental.vocab import (
vocab as VocabExperimental,
vocab_from_file,
vocab_from_raw_text_file
load_vocab_from_file,
build_vocab_from_text_file
)
from torchtext.vocab import (
Vocab,
@@ -68,11 +68,11 @@ def benchmark_experimental_vocab_construction(vocab_file_path, is_raw_text=True,
for _ in range(num_iters):
tokenizer = basic_english_normalize()
jited_tokenizer = torch.jit.script(tokenizer.to_ivalue())
vocab_from_raw_text_file(f, jited_tokenizer, num_cpus=1)
build_vocab_from_text_file(f, jited_tokenizer, num_cpus=1)
print("Construction time:", time.monotonic() - t0)
else:
for _ in range(num_iters):
vocab_from_file(f)
load_vocab_from_file(f)
print("Construction time:", time.monotonic() - t0)


@@ -121,7 +121,7 @@ def token_iterator(file_path):
print("Vocab Experimental")
t0 = time.monotonic()
f = open(vocab_file_path, 'r')
v_experimental = vocab_from_file(f)
v_experimental = load_vocab_from_file(f)
print("Construction time:", time.monotonic() - t0)
else:
print("Loading Vocab from AG News")
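A condensed usage sketch of the two renamed vocab factories, consistent with the benchmark code above; the file names are hypothetical, and basic_english_normalize is assumed to come from torchtext.experimental.transforms:

import torch
from torchtext.experimental.transforms import basic_english_normalize
from torchtext.experimental.vocab import load_vocab_from_file, build_vocab_from_text_file

with open('vocab.txt', 'r') as f:                # hypothetical file: one token per line
    v = load_vocab_from_file(f)

with open('corpus.txt', 'r') as f:               # hypothetical file: raw text
    tokenizer = basic_english_normalize()
    jit_tokenizer = torch.jit.script(tokenizer.to_ivalue())
    v = build_vocab_from_text_file(f, jit_tokenizer, num_cpus=1)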
16 changes: 8 additions & 8 deletions docs/source/experimental_vectors.rst
@@ -14,22 +14,22 @@ torchtext.experimental.vectors
:members:
:special-members:

:hidden:`vectors`
~~~~~~~~~~~~~~~~~
:hidden:`build_vectors`
~~~~~~~~~~~~~~~~~~~~~~~

.. autofunction:: vectors
.. autofunction:: build_vectors

:hidden:`vectors_from_file_object`
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
:hidden:`load_vectors_from_file_path`
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

.. autofunction:: vectors_from_file_object
.. autofunction:: load_vectors_from_file_path

:hidden:`FastText`
~~~~~~~~~~~~~~~~~~~
~~~~~~~~~~~~~~~~~~

.. autofunction:: FastText

:hidden:`GloVe`
~~~~~~~~~~~~~~~~
~~~~~~~~~~~~~~~

.. autofunction:: GloVe
14 changes: 7 additions & 7 deletions docs/source/experimental_vocab.rst
@@ -2,7 +2,7 @@
:class: hidden-section

torchtext.experimental.vocab
==============================
============================

.. automodule:: torchtext.experimental.vocab
.. currentmodule:: torchtext.experimental.vocab
@@ -19,15 +19,15 @@ torchtext.experimental.vocab

.. autofunction:: vocab

:hidden:`vocab_from_file`
~~~~~~~~~~~~~~~~~~~~~~~~~
:hidden:`load_vocab_from_file`
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

.. autofunction:: vocab_from_file
.. autofunction:: load_vocab_from_file

:hidden:`vocab_from_raw_text_file`
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
:hidden:`build_vocab_from_text_file`
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

.. autofunction:: vocab_from_raw_text_file
.. autofunction:: build_vocab_from_text_file

:hidden:`build_vocab_from_iterator`
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
4 changes: 2 additions & 2 deletions examples/data_pipeline/pipelines.py
@@ -18,7 +18,7 @@
sequential_transforms,
)
from torchtext.experimental.vectors import FastText as FastTextExperimental
from torchtext.experimental.vocab import vocab_from_file
from torchtext.experimental.vocab import load_vocab_from_file
from torchtext.vocab import FastText

import argparse
@@ -57,7 +57,7 @@ def token_iterator(vocab_file):
def build_experimental_torchtext_pipeline(hf_vocab_file):
tokenizer = basic_english_normalize()
with open(hf_vocab_file, 'r') as f:
vocab = vocab_from_file(f)
vocab = load_vocab_from_file(f)
pipeline = TextSequentialTransforms(tokenizer, vocab)
jit_pipeline = torch.jit.script(pipeline.to_ivalue())
print('jit experimental torchtext pipeline success!')
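A minimal sketch of the updated pipeline construction in build_experimental_torchtext_pipeline, assuming TextSequentialTransforms and basic_english_normalize are importable from torchtext.experimental.transforms (the vocab file name is hypothetical):

import torch
from torchtext.experimental.transforms import basic_english_normalize, TextSequentialTransforms
from torchtext.experimental.vocab import load_vocab_from_file

with open('vocab.txt', 'r') as f:                # hypothetical vocab file
    vocab = load_vocab_from_file(f)
pipeline = TextSequentialTransforms(basic_english_normalize(), vocab)
jit_pipeline = torch.jit.script(pipeline.to_ivalue())   # scriptable only after to_ivalue()
ids = jit_pipeline('of that new')                # returns token ids, e.g. [7, 18, 24] in the tests below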
23 changes: 11 additions & 12 deletions test/experimental/test_transforms_with_asset.py
@@ -10,17 +10,17 @@
TextSequentialTransforms,
)
from torchtext.experimental.vocab import (
vocab_from_file,
vocab_from_raw_text_file,
load_vocab_from_file,
build_vocab_from_text_file,
)
import shutil
import tempfile
import os
from torchtext.experimental.vectors import (
GloVe,
vectors,
build_vectors,
FastText,
vectors_from_file_object,
load_vectors_from_file_path,
)
from torchtext.utils import download_from_url

@@ -30,7 +30,7 @@ def test_vocab_transform(self):
asset_name = 'vocab_test2.txt'
asset_path = get_asset_path(asset_name)
with open(asset_path, 'r') as f:
vocab_transform = VocabTransform(vocab_from_file(f))
vocab_transform = VocabTransform(load_vocab_from_file(f))
self.assertEqual(vocab_transform(['of', 'that', 'new']),
[7, 18, 24])
jit_vocab_transform = torch.jit.script(vocab_transform.to_ivalue())
@@ -44,15 +44,15 @@ def test_errors_vectors_python(self):
with self.assertRaises(ValueError):
# Test proper error raised when passing in empty tokens and vectors and
# not passing in a user defined unk_tensor
vectors(tokens, vecs)
build_vectors(tokens, vecs)

tensorA = torch.tensor([1, 0, 0], dtype=torch.int8)
tokens = ['a']
vecs = tensorA.unsqueeze(0)

with self.assertRaises(TypeError):
# Test proper error raised when vector is not of type torch.float
vectors(tokens, vecs)
build_vectors(tokens, vecs)

with tempfile.TemporaryDirectory() as dir_name:
# Test proper error raised when incorrect filename or dim passed into GloVe
@@ -131,7 +131,7 @@ def test_vocab_from_file(self):
asset_name = 'vocab_test.txt'
asset_path = get_asset_path(asset_name)
with open(asset_path, 'r') as f:
v = vocab_from_file(f, unk_token='<new_unk>')
v = load_vocab_from_file(f, unk_token='<new_unk>')
expected_itos = ['<new_unk>', 'b', 'a', 'c']
expected_stoi = {x: index for index, x in enumerate(expected_itos)}
self.assertEqual(v.get_itos(), expected_itos)
@@ -143,7 +143,7 @@ def test_vocab_from_raw_text_file(self):
with open(asset_path, 'r') as f:
tokenizer = basic_english_normalize()
jit_tokenizer = torch.jit.script(tokenizer.to_ivalue())
v = vocab_from_raw_text_file(f, jit_tokenizer, unk_token='<new_unk>')
v = build_vocab_from_text_file(f, jit_tokenizer, unk_token='<new_unk>')
expected_itos = ['<new_unk>', "'", 'after', 'talks', '.', 'are', 'at', 'disappointed',
'fears', 'federal', 'firm', 'for', 'mogul', 'n', 'newall', 'parent',
'pension', 'representing', 'say', 'stricken', 't', 'they', 'turner',
@@ -173,16 +173,15 @@ def test_text_sequential_transform(self):
asset_name = 'vocab_test2.txt'
asset_path = get_asset_path(asset_name)
with open(asset_path, 'r') as f:
pipeline = TextSequentialTransforms(basic_english_normalize(), vocab_from_file(f))
pipeline = TextSequentialTransforms(basic_english_normalize(), load_vocab_from_file(f))
jit_pipeline = torch.jit.script(pipeline.to_ivalue())
self.assertEqual(pipeline('of that new'), [7, 18, 24])
self.assertEqual(jit_pipeline('of that new'), [7, 18, 24])

def test_vectors_from_file(self):
asset_name = 'vectors_test.csv'
asset_path = get_asset_path(asset_name)
f = open(asset_path, 'r')
vectors_obj = vectors_from_file_object(f)
vectors_obj = load_vectors_from_file_path(asset_path)

expected_tensorA = torch.tensor([1, 0, 0], dtype=torch.float)
expected_tensorB = torch.tensor([0, 1, 0], dtype=torch.float)
20 changes: 10 additions & 10 deletions test/experimental/test_vectors.py
@@ -5,7 +5,7 @@
import unittest
from test.common.torchtext_test_case import TorchtextTestCase
from torchtext.experimental.vectors import (
vectors,
build_vectors,
)


@@ -20,7 +20,7 @@ def test_empty_vectors(self):
vecs = torch.empty(0, dtype=torch.float)
unk_tensor = torch.tensor([0], dtype=torch.float)

vectors_obj = vectors(tokens, vecs, unk_tensor)
vectors_obj = build_vectors(tokens, vecs, unk_tensor)
self.assertEqual(vectors_obj['not_in_it'], unk_tensor)

def test_empty_unk(self):
@@ -29,7 +29,7 @@ def test_empty_unk(self):

tokens = ['a']
vecs = tensorA.unsqueeze(0)
vectors_obj = vectors(tokens, vecs)
vectors_obj = build_vectors(tokens, vecs)

self.assertEqual(vectors_obj['not_in_it'], expected_unk_tensor)

@@ -40,7 +40,7 @@ def test_vectors_basic(self):
unk_tensor = torch.tensor([0, 0], dtype=torch.float)
tokens = ['a', 'b']
vecs = torch.stack((tensorA, tensorB), 0)
vectors_obj = vectors(tokens, vecs, unk_tensor=unk_tensor)
vectors_obj = build_vectors(tokens, vecs, unk_tensor=unk_tensor)

self.assertEqual(vectors_obj['a'], tensorA)
self.assertEqual(vectors_obj['b'], tensorB)
@@ -53,7 +53,7 @@ def test_vectors_jit(self):
unk_tensor = torch.tensor([0, 0], dtype=torch.float)
tokens = ['a', 'b']
vecs = torch.stack((tensorA, tensorB), 0)
vectors_obj = vectors(tokens, vecs, unk_tensor=unk_tensor)
vectors_obj = build_vectors(tokens, vecs, unk_tensor=unk_tensor)
jit_vectors_obj = torch.jit.script(vectors_obj.to_ivalue())

assert not vectors_obj.is_jitable
@@ -70,7 +70,7 @@ def test_vectors_forward(self):
unk_tensor = torch.tensor([0, 0], dtype=torch.float)
tokens = ['a', 'b']
vecs = torch.stack((tensorA, tensorB), 0)
vectors_obj = vectors(tokens, vecs, unk_tensor=unk_tensor)
vectors_obj = build_vectors(tokens, vecs, unk_tensor=unk_tensor)
jit_vectors_obj = torch.jit.script(vectors_obj.to_ivalue())

tokens_to_lookup = ['a', 'b', 'c']
@@ -88,7 +88,7 @@ def test_vectors_lookup_vectors(self):
unk_tensor = torch.tensor([0, 0], dtype=torch.float)
tokens = ['a', 'b']
vecs = torch.stack((tensorA, tensorB), 0)
vectors_obj = vectors(tokens, vecs, unk_tensor=unk_tensor)
vectors_obj = build_vectors(tokens, vecs, unk_tensor=unk_tensor)

tokens_to_lookup = ['a', 'b', 'c']
expected_vectors = torch.stack((tensorA, tensorB, unk_tensor), 0)
@@ -102,7 +102,7 @@ def test_vectors_add_item(self):

tokens = ['a']
vecs = tensorA.unsqueeze(0)
vectors_obj = vectors(tokens, vecs, unk_tensor=unk_tensor)
vectors_obj = build_vectors(tokens, vecs, unk_tensor=unk_tensor)

tensorB = torch.tensor([0, 1], dtype=torch.float)
vectors_obj['b'] = tensorB
@@ -118,7 +118,7 @@ def test_vectors_load_and_save(self):

tokens = ['a', 'b']
vecs = torch.stack((tensorA, tensorB), 0)
vectors_obj = vectors(tokens, vecs)
vectors_obj = build_vectors(tokens, vecs)

tensorC = torch.tensor([1, 1], dtype=torch.float)
vectors_obj['b'] = tensorC
@@ -145,4 +145,4 @@ def test_errors_vectors_cpp(self):
# Test proper error raised when tokens have duplicates
# TODO: use self.assertRaisesRegex() to check
# the key of the duplicate token in the error message
vectors(tokens, vecs)
build_vectors(tokens, vecs)
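Pulling the pieces of the tests above together, a condensed sketch of the renamed build_vectors factory (the token and vector values are illustrative):

import torch
from torchtext.experimental.vectors import build_vectors

tokens = ['a', 'b']
vecs = torch.stack((torch.tensor([1., 0.]), torch.tensor([0., 1.])), 0)
unk_tensor = torch.zeros(2)

vectors_obj = build_vectors(tokens, vecs, unk_tensor=unk_tensor)
vectors_obj['a']                             # tensor([1., 0.])
vectors_obj['not_in_it']                     # falls back to unk_tensor
vectors_obj['c'] = torch.tensor([1., 1.])    # new tokens can be added, as in test_vectors_add_item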
2 changes: 1 addition & 1 deletion torchtext/csrc/register_bindings.cpp
@@ -68,7 +68,7 @@ PYBIND11_MODULE(_torchtext, m) {
m.def("_load_token_and_vectors_from_file",
&_load_token_and_vectors_from_file);
m.def("_load_vocab_from_file", &_load_vocab_from_file);
m.def("_load_vocab_from_raw_text_file", _load_vocab_from_raw_text_file);
m.def("_build_vocab_from_text_file", _build_vocab_from_text_file);
}

// Registers our custom classes with torch.
8 changes: 4 additions & 4 deletions torchtext/csrc/vocab.cpp
@@ -330,10 +330,10 @@ Vocab _load_vocab_from_file(const std::string &file_path,
return Vocab(std::move(tokens), std::move(stoi), unk_token, unk_index);
}

Vocab _load_vocab_from_raw_text_file(const std::string &file_path,
const std::string &unk_token,
const int64_t min_freq,
const int64_t num_cpus, py::object fn) {
Vocab _build_vocab_from_text_file(const std::string &file_path,
const std::string &unk_token,
const int64_t min_freq,
const int64_t num_cpus, py::object fn) {
std::cerr << "[INFO] Reading file " << file_path << std::endl;

torch::jit::script::Module module(*torch::jit::as_module(fn));
10 changes: 5 additions & 5 deletions torchtext/csrc/vocab.h
@@ -41,10 +41,10 @@ VocabStates _set_vocab_states(const c10::intrusive_ptr<Vocab> &self);
Vocab _load_vocab_from_file(const std::string &file_path,
const std::string &unk_token,
const int64_t min_freq, const int64_t num_cpus);
Vocab _load_vocab_from_raw_text_file(const std::string &file_path,
const std::string &unk_token,
const int64_t min_freq,
const int64_t num_cpus,
py::object tokenizer);
Vocab _build_vocab_from_text_file(const std::string &file_path,
const std::string &unk_token,
const int64_t min_freq,
const int64_t num_cpus,
py::object tokenizer);

} // namespace torchtext
14 changes: 7 additions & 7 deletions torchtext/experimental/vectors.py
@@ -17,8 +17,8 @@
__all__ = [
'FastText',
'GloVe',
'vectors_from_file_object',
'vectors',
'load_vectors_from_file_path',
'build_vectors',
'Vectors'
]

@@ -153,8 +153,8 @@ def GloVe(name="840B", dim=300, unk_tensor=None, root=".data", validate_file=Tru
return vectors_obj


def vectors_from_file_object(file_like_object, delimiter=",", unk_tensor=None, num_cpus=10):
r"""Create a Vectors object from a csv file like object.
def load_vectors_from_file_path(filepath, delimiter=",", unk_tensor=None, num_cpus=10):
r"""Create a Vectors object from a csv file path.
Note that the tensor corresponding to each vector is of type `torch.float`.
@@ -165,7 +165,7 @@ def vectors_from_file_object(file_like_object, delimiter=",", unk_tensor=None, n
token_n<delimiter>num_m num_j num_k
Args:
file_like_object (FileObject): a file like object to read data from.
filepath: a file path to read data from.
delimiter (char): a character to delimit between the token and the vector. Default value is ","
unk_tensor (Tensor): a 1d tensor representing the vector associated with an unknown token.
num_cpus (int): the number of cpus to use when loading the vectors from file. Default: 10.
@@ -177,13 +177,13 @@ def vectors_from_file_object(file_like_object, delimiter=",", unk_tensor=None, n
ValueError: if duplicate tokens are found in FastText file.
"""
vectors_obj, dup_tokens = _load_token_and_vectors_from_file(file_like_object.name, delimiter, num_cpus, unk_tensor)
vectors_obj, dup_tokens = _load_token_and_vectors_from_file(filepath, delimiter, num_cpus, unk_tensor)
if dup_tokens:
raise ValueError("Found duplicate tokens in file: {}".format(str(dup_tokens)))
return Vectors(vectors_obj)


def vectors(tokens, vectors, unk_tensor=None):
def build_vectors(tokens, vectors, unk_tensor=None):
r"""Factory method for creating a vectors object which maps tokens to vectors.
Arguments:
tokens (List[str]): a list of tokens.
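Finally, a small sketch of the new load_vectors_from_file_path entry point; the CSV path and contents are hypothetical but follow the token_n<delimiter>num_m num_j num_k layout described in the docstring above:

from torchtext.experimental.vectors import load_vectors_from_file_path

# vectors.csv (hypothetical), one token per line, comma-delimited from its vector:
#   a,1 0 0
#   b,0 1 0
vectors_obj = load_vectors_from_file_path('vectors.csv', delimiter=',')
vectors_obj['a']    # tensor([1., 0., 0.]) for the test asset vectors_test.csv above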