Commit

Merge branch 'master' of github.com:pytorch/text
nzw0301 committed Sep 24, 2018
2 parents 6f0b3fa + 1b8ce74 commit 0697751
Showing 9 changed files with 17 additions and 19 deletions.
7 changes: 4 additions & 3 deletions README.rst
@@ -41,7 +41,7 @@ Alternatively, you might want to use Moses tokenizer from `NLTK <http://nltk.org
Documentation
=============

Find the documentation `here <https://torchtext.readthedocs.io/en/latest/index.html>`.
Find the documentation `here <https://torchtext.readthedocs.io/en/latest/index.html>`_.

Data
====
@@ -118,9 +118,10 @@ The datasets module currently contains:
* Sentiment analysis: SST and IMDb
* Question classification: TREC
* Entailment: SNLI, MultiNLI
* Language modeling: abstract class + WikiText-2
* Language modeling: abstract class + WikiText-2, WikiText103, PennTreebank
* Machine translation: abstract class + Multi30k, IWSLT, WMT14
* Sequence tagging (e.g. POS/NER): abstract class + UDPOS
* Sequence tagging (e.g. POS/NER): abstract class + UDPOS, CoNLL2000Chunking
* Question answering: 20 QA bAbI tasks

Others are planned or a work in progress:

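The datasets listed above (including the newly added WikiText103 and PennTreebank) all expose a splits() classmethod. As a minimal sketch, assuming the torchtext 0.3-era API and a successful download, loading one of the language-modeling corpora looks roughly like this:

from torchtext import data, datasets

# a simple whitespace-tokenized, lowercased field
TEXT = data.Field(lower=True)

# WikiText2.splits downloads the corpus into root (default '.data') and
# returns (train, valid, test) language-modeling datasets
train, valid, test = datasets.WikiText2.splits(TEXT)
TEXT.build_vocab(train)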
2 changes: 1 addition & 1 deletion torchtext/datasets/__init__.py
@@ -2,7 +2,7 @@
from .nli import SNLI, MultiNLI
from .sst import SST
from .translation import TranslationDataset, Multi30k, IWSLT, WMT14 # NOQA
from .sequence_tagging import SequenceTaggingDataset, UDPOS, CoNLL2000Chunking # NOQA
from .sequence_tagging import SequenceTaggingDataset, UDPOS, CoNLL2000Chunking # NOQA
from .trec import TREC
from .imdb import IMDB
from .babi import BABI20
2 changes: 1 addition & 1 deletion torchtext/datasets/imdb.py
@@ -56,7 +56,7 @@ def splits(cls, text_field, label_field, root='.data',

@classmethod
def iters(cls, batch_size=32, device=0, root='.data', vectors=None, **kwargs):
"""Creater iterator objects for splits of the IMDB dataset.
"""Create iterator objects for splits of the IMDB dataset.
Arguments:
batch_size: Batch_size
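The iters() classmethod whose docstring is corrected here wraps splits(): it creates the fields, downloads the data, builds the vocabulary, and returns batching iterators. A hedged sketch of typical use (in this era's API, device=-1 keeps batches on the CPU):

from torchtext import datasets

# one iterator per split; for IMDB that is (train_iter, test_iter)
train_iter, test_iter = datasets.IMDB.iters(batch_size=32, device=-1)

batch = next(iter(train_iter))
print(batch.text.size(), batch.label.size())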
2 changes: 1 addition & 1 deletion torchtext/datasets/nli.py
@@ -56,7 +56,7 @@ def splits(cls, text_field, label_field, parse_field=None,
label_field: The field that will be used for label data.
parse_field: The field that will be used for shift-reduce parser
transitions, or None to not include them.
extra_field: A dict[json_key: Tuple(field_name, Field)]
extra_fields: A dict[json_key: Tuple(field_name, Field)]
root: The root directory that the dataset's zip archive will be
expanded into.
train: The filename of the train data. Default: 'train.jsonl'.
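The renamed extra_fields argument is a dict mapping a key in the raw JSON lines to an (attribute_name, Field) tuple, letting callers keep columns beyond premise, hypothesis, and label. A hedged sketch, assuming the SNLI download succeeds and using pairID (a key present in the SNLI jsonl records) as the extra column:

from torchtext import data, datasets

TEXT = data.Field(lower=True)
LABEL = data.Field(sequential=False)
PAIR_ID = data.RawField()

# each example additionally carries a .pair_id attribute taken from the
# 'pairID' key of the jsonl records
train, dev, test = datasets.SNLI.splits(
    TEXT, LABEL, extra_fields={'pairID': ('pair_id', PAIR_ID)})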
2 changes: 1 addition & 1 deletion torchtext/datasets/sst.py
@@ -79,7 +79,7 @@ def splits(cls, text_field, label_field, root='.data',

@classmethod
def iters(cls, batch_size=32, device=0, root='.data', vectors=None, **kwargs):
"""Creater iterator objects for splits of the SST dataset.
"""Create iterator objects for splits of the SST dataset.
Arguments:
batch_size: Batch_size
15 changes: 6 additions & 9 deletions torchtext/datasets/translation.py
@@ -45,12 +45,12 @@ def splits(cls, exts, fields, path=None, root='.data',
"""Create dataset objects for splits of a TranslationDataset.
Arguments:
path (str): Common prefix of the splits' file paths, or None to use
the result of cls.download(root).
root: Root dataset storage directory. Default is '.data'.
exts: A tuple containing the extension to path for each language.
fields: A tuple containing the fields that will be used for data
in each language.
path (str): Common prefix of the splits' file paths, or None to use
the result of cls.download(root).
root: Root dataset storage directory. Default is '.data'.
train: The prefix of the train data. Default: 'train'.
validation: The prefix of the validation data. Default: 'val'.
test: The prefix of the test data. Default: 'test'.
@@ -86,11 +86,10 @@ def splits(cls, exts, fields, root='.data',
"""Create dataset objects for splits of the Multi30k dataset.
Arguments:
root: Root dataset storage directory. Default is '.data'.
exts: A tuple containing the extension to path for each language.
fields: A tuple containing the fields that will be used for data
in each language.
root: Root dataset storage directory. Default is '.data'.
train: The prefix of the train data. Default: 'train'.
validation: The prefix of the validation data. Default: 'val'.
test: The prefix of the test data. Default: 'test'.
@@ -127,11 +126,10 @@ def splits(cls, exts, fields, root='.data',
"""Create dataset objects for splits of the IWSLT dataset.
Arguments:
root: Root dataset storage directory. Default is '.data'.
exts: A tuple containing the extension to path for each language.
fields: A tuple containing the fields that will be used for data
in each language.
root: Root dataset storage directory. Default is '.data'.
train: The prefix of the train data. Default: 'train'.
validation: The prefix of the validation data. Default: 'val'.
test: The prefix of the test data. Default: 'test'.
@@ -202,12 +200,11 @@ def splits(cls, exts, fields, root='.data',
"""Create dataset objects for splits of the WMT 2014 dataset.
Arguments:
root: Root dataset storage directory. Default is '.data'.
exts: A tuple containing the extensions for each language. Must be
either ('.en', '.de') or the reverse.
fields: A tuple containing the fields that will be used for data
in each language.
root: Root dataset storage directory. Default is '.data'.
train: The prefix of the train data. Default:
'train.tok.clean.bpe.32000'.
validation: The prefix of the validation data. Default:
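The docstring reshuffles above simply move exts and fields to the top so the argument descriptions match the signature order. A hedged sketch of the corresponding call, assuming the Multi30k download succeeds:

from torchtext import data, datasets

SRC = data.Field(init_token='<sos>', eos_token='<eos>', lower=True)
TRG = data.Field(init_token='<sos>', eos_token='<eos>', lower=True)

# exts picks the per-language file extensions, fields pairs them with Fields;
# data is downloaded into root, which defaults to '.data'
train, val, test = datasets.Multi30k.splits(exts=('.de', '.en'),
                                             fields=(SRC, TRG))

SRC.build_vocab(train, min_freq=2)
TRG.build_vocab(train, min_freq=2)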
2 changes: 1 addition & 1 deletion torchtext/datasets/trec.py
@@ -62,7 +62,7 @@ def splits(cls, text_field, label_field, root='.data',

@classmethod
def iters(cls, batch_size=32, device=0, root='.data', vectors=None, **kwargs):
"""Creater iterator objects for splits of the TREC dataset.
"""Create iterator objects for splits of the TREC dataset.
Arguments:
batch_size: Batch_size
2 changes: 1 addition & 1 deletion torchtext/utils.py
@@ -51,7 +51,7 @@ def download_from_url(url, path):

def unicode_csv_reader(unicode_csv_data, **kwargs):
"""Since the standard csv library does not handle unicode in Python 2, we need a wrapper.
Borrwed and slightly modified from the Python docs:
Borrowed and slightly modified from the Python docs:
https://docs.python.org/2/library/csv.html#csv-examples"""
if six.PY2:
# csv.py doesn't do Unicode; encode temporarily as UTF-8:
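unicode_csv_reader is meant as a drop-in replacement for csv.reader that behaves the same on Python 2 and 3. A minimal sketch, assuming a UTF-8 encoded file named data.tsv (the filename is just a placeholder):

import io
from torchtext.utils import unicode_csv_reader

# on Python 3 this defers to csv.reader; on Python 2 it round-trips the
# stream through UTF-8 so that non-ASCII fields survive
with io.open('data.tsv', encoding='utf-8') as f:  # hypothetical file
    for row in unicode_csv_reader(f, delimiter='\t'):
        print(row)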
2 changes: 1 addition & 1 deletion torchtext/vocab.py
@@ -254,7 +254,7 @@ def __init__(self, name, cache=None,
name: name of the file that contains the vectors
cache: directory for cached vectors
url: url for download if vectors not found in cache
unk_init (callback): by default, initalize out-of-vocabulary word vectors
unk_init (callback): by default, initialize out-of-vocabulary word vectors
to zero vectors; can be any function that takes in a Tensor and
returns a Tensor of the same size
max_vectors (int): this can be used to limit the number of
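unk_init is called only for tokens missing from the pretrained file; it receives a freshly allocated tensor of the embedding dimensionality and must return a tensor of the same size. A hedged sketch using the GloVe subclass, which forwards keyword arguments to Vectors:

import torch
from torchtext.vocab import GloVe

# out-of-vocabulary lookups are filled from N(0, 1) instead of the default
# all-zero vectors; the '6B' file is downloaded and cached on first use
vectors = GloVe(name='6B', dim=100, unk_init=torch.Tensor.normal_)

# a made-up token falls back to unk_init and gets a random 100-d vector
print(vectors['glorptoken'])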
