Commit

Merge branch 'master' of github.com:pytorch/text
nzw0301 committed Sep 24, 2018
2 parents 6f0b3fa + 1b8ce74 commit 0697751
Showing 9 changed files with 17 additions and 19 deletions.
7 changes: 4 additions & 3 deletions README.rst
@@ -41,7 +41,7 @@ Alternatively, you might want to use Moses tokenizer from `NLTK <http://nltk.org
Documentation
=============

Find the documentation `here <https://torchtext.readthedocs.io/en/latest/index.html>`.
Find the documentation `here <https://torchtext.readthedocs.io/en/latest/index.html>`_.

Data
====
@@ -118,9 +118,10 @@ The datasets module currently contains:
* Sentiment analysis: SST and IMDb
* Question classification: TREC
* Entailment: SNLI, MultiNLI
* Language modeling: abstract class + WikiText-2
* Language modeling: abstract class + WikiText-2, WikiText103, PennTreebank
* Machine translation: abstract class + Multi30k, IWSLT, WMT14
* Sequence tagging (e.g. POS/NER): abstract class + UDPOS
* Sequence tagging (e.g. POS/NER): abstract class + UDPOS, CoNLL2000Chunking
* Question answering: 20 QA bAbI tasks

Others are planned or a work in progress:

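The datasets listed above (including the newly added WikiText103 and PennTreebank) all expose a splits() classmethod. As a minimal sketch, assuming the torchtext 0.3-era API and a successful download, loading one of the language-modeling corpora looks roughly like this:

from torchtext import data, datasets

# a simple whitespace-tokenized, lowercased field
TEXT = data.Field(lower=True)

# WikiText2.splits downloads the corpus into root (default '.data') and
# returns (train, valid, test) language-modeling datasets
train, valid, test = datasets.WikiText2.splits(TEXT)
TEXT.build_vocab(train)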
2 changes: 1 addition & 1 deletion torchtext/datasets/__init__.py
@@ -2,7 +2,7 @@
from .nli import SNLI, MultiNLI
from .sst import SST
from .translation import TranslationDataset, Multi30k, IWSLT, WMT14 # NOQA
from .sequence_tagging import SequenceTaggingDataset, UDPOS, CoNLL2000Chunking # NOQA
from .sequence_tagging import SequenceTaggingDataset, UDPOS, CoNLL2000Chunking # NOQA
from .trec import TREC
from .imdb import IMDB
from .babi import BABI20
2 changes: 1 addition & 1 deletion torchtext/datasets/imdb.py
@@ -56,7 +56,7 @@ def splits(cls, text_field, label_field, root='.data',

@classmethod
def iters(cls, batch_size=32, device=0, root='.data', vectors=None, **kwargs):
"""Creater iterator objects for splits of the IMDB dataset.
"""Create iterator objects for splits of the IMDB dataset.
Arguments:
batch_size: Batch_size
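The iters() classmethod whose docstring is corrected here wraps splits(): it creates the fields, downloads the data, builds the vocabulary, and returns batching iterators. A hedged sketch of typical use (in this era's API, device=-1 keeps batches on the CPU):

from torchtext import datasets

# one iterator per split; for IMDB that is (train_iter, test_iter)
train_iter, test_iter = datasets.IMDB.iters(batch_size=32, device=-1)

batch = next(iter(train_iter))
print(batch.text.size(), batch.label.size())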
2 changes: 1 addition & 1 deletion torchtext/datasets/nli.py
@@ -56,7 +56,7 @@ def splits(cls, text_field, label_field, parse_field=None,
label_field: The field that will be used for label data.
parse_field: The field that will be used for shift-reduce parser
transitions, or None to not include them.
extra_field: A dict[json_key: Tuple(field_name, Field)]
extra_fields: A dict[json_key: Tuple(field_name, Field)]
root: The root directory that the dataset's zip archive will be
expanded into.
train: The filename of the train data. Default: 'train.jsonl'.
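The renamed extra_fields argument is a dict mapping a key in the raw JSON lines to an (attribute_name, Field) tuple, letting callers keep columns beyond premise, hypothesis, and label. A hedged sketch, assuming the SNLI download succeeds and using pairID (a key present in the SNLI jsonl records) as the extra column:

from torchtext import data, datasets

TEXT = data.Field(lower=True)
LABEL = data.Field(sequential=False)
PAIR_ID = data.RawField()

# each example additionally carries a .pair_id attribute taken from the
# 'pairID' key of the jsonl records
train, dev, test = datasets.SNLI.splits(
    TEXT, LABEL, extra_fields={'pairID': ('pair_id', PAIR_ID)})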
2 changes: 1 addition & 1 deletion torchtext/datasets/sst.py
@@ -79,7 +79,7 @@ def splits(cls, text_field, label_field, root='.data',

@classmethod
def iters(cls, batch_size=32, device=0, root='.data', vectors=None, **kwargs):
"""Creater iterator objects for splits of the SST dataset.
"""Create iterator objects for splits of the SST dataset.
Arguments:
batch_size: Batch_size
15 changes: 6 additions & 9 deletions torchtext/datasets/translation.py
@@ -45,12 +45,12 @@ def splits(cls, exts, fields, path=None, root='.data',
"""Create dataset objects for splits of a TranslationDataset.
Arguments:
path (str): Common prefix of the splits' file paths, or None to use
the result of cls.download(root).
root: Root dataset storage directory. Default is '.data'.
exts: A tuple containing the extension to path for each language.
fields: A tuple containing the fields that will be used for data
in each language.
path (str): Common prefix of the splits' file paths, or None to use
the result of cls.download(root).
root: Root dataset storage directory. Default is '.data'.
train: The prefix of the train data. Default: 'train'.
validation: The prefix of the validation data. Default: 'val'.
test: The prefix of the test data. Default: 'test'.
@@ -86,11 +86,10 @@ def splits(cls, exts, fields, root='.data',
"""Create dataset objects for splits of the Multi30k dataset.
Arguments:
root: Root dataset storage directory. Default is '.data'.
exts: A tuple containing the extension to path for each language.
fields: A tuple containing the fields that will be used for data
in each language.
root: Root dataset storage directory. Default is '.data'.
train: The prefix of the train data. Default: 'train'.
validation: The prefix of the validation data. Default: 'val'.
test: The prefix of the test data. Default: 'test'.
@@ -127,11 +126,10 @@ def splits(cls, exts, fields, root='.data',
"""Create dataset objects for splits of the IWSLT dataset.
Arguments:
root: Root dataset storage directory. Default is '.data'.
exts: A tuple containing the extension to path for each language.
fields: A tuple containing the fields that will be used for data
in each language.
root: Root dataset storage directory. Default is '.data'.
train: The prefix of the train data. Default: 'train'.
validation: The prefix of the validation data. Default: 'val'.
test: The prefix of the test data. Default: 'test'.
@@ -202,12 +200,11 @@ def splits(cls, exts, fields, root='.data',
"""Create dataset objects for splits of the WMT 2014 dataset.
Arguments:
root: Root dataset storage directory. Default is '.data'.
exts: A tuple containing the extensions for each language. Must be
either ('.en', '.de') or the reverse.
fields: A tuple containing the fields that will be used for data
in each language.
root: Root dataset storage directory. Default is '.data'.
train: The prefix of the train data. Default:
'train.tok.clean.bpe.32000'.
validation: The prefix of the validation data. Default:
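The docstring reshuffles above simply move exts and fields to the top so the argument descriptions match the signature order. A hedged sketch of the corresponding call, assuming the Multi30k download succeeds:

from torchtext import data, datasets

SRC = data.Field(init_token='<sos>', eos_token='<eos>', lower=True)
TRG = data.Field(init_token='<sos>', eos_token='<eos>', lower=True)

# exts picks the per-language file extensions, fields pairs them with Fields;
# data is downloaded into root, which defaults to '.data'
train, val, test = datasets.Multi30k.splits(exts=('.de', '.en'),
                                             fields=(SRC, TRG))

SRC.build_vocab(train, min_freq=2)
TRG.build_vocab(train, min_freq=2)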
2 changes: 1 addition & 1 deletion torchtext/datasets/trec.py
@@ -62,7 +62,7 @@ def splits(cls, text_field, label_field, root='.data',

@classmethod
def iters(cls, batch_size=32, device=0, root='.data', vectors=None, **kwargs):
"""Creater iterator objects for splits of the TREC dataset.
"""Create iterator objects for splits of the TREC dataset.
Arguments:
batch_size: Batch_size
2 changes: 1 addition & 1 deletion torchtext/utils.py
@@ -51,7 +51,7 @@ def download_from_url(url, path):

def unicode_csv_reader(unicode_csv_data, **kwargs):
"""Since the standard csv library does not handle unicode in Python 2, we need a wrapper.
Borrwed and slightly modified from the Python docs:
Borrowed and slightly modified from the Python docs:
https://docs.python.org/2/library/csv.html#csv-examples"""
if six.PY2:
# csv.py doesn't do Unicode; encode temporarily as UTF-8:
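unicode_csv_reader is meant as a drop-in replacement for csv.reader that behaves the same on Python 2 and 3. A minimal sketch, assuming a UTF-8 encoded file named data.tsv (the filename is just a placeholder):

import io
from torchtext.utils import unicode_csv_reader

# on Python 3 this defers to csv.reader; on Python 2 it round-trips the
# stream through UTF-8 so that non-ASCII fields survive
with io.open('data.tsv', encoding='utf-8') as f:  # hypothetical file
    for row in unicode_csv_reader(f, delimiter='\t'):
        print(row)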
2 changes: 1 addition & 1 deletion torchtext/vocab.py
@@ -254,7 +254,7 @@ def __init__(self, name, cache=None,
name: name of the file that contains the vectors
cache: directory for cached vectors
url: url for download if vectors not found in cache
unk_init (callback): by default, initalize out-of-vocabulary word vectors
unk_init (callback): by default, initialize out-of-vocabulary word vectors
to zero vectors; can be any function that takes in a Tensor and
returns a Tensor of the same size
max_vectors (int): this can be used to limit the number of
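unk_init is called only for tokens missing from the pretrained file; it receives a freshly allocated tensor of the embedding dimensionality and must return a tensor of the same size. A hedged sketch using the GloVe subclass, which forwards keyword arguments to Vectors:

import torch
from torchtext.vocab import GloVe

# out-of-vocabulary lookups are filled from N(0, 1) instead of the default
# all-zero vectors; the '6B' file is downloaded and cached on first use
vectors = GloVe(name='6B', dim=100, unk_init=torch.Tensor.normal_)

# a made-up token falls back to unk_init and gets a random 100-d vector
print(vectors['glorptoken'])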
