Merge branch 'master' into new_supervised_learning_dataset

pytorch · Jul 23, 2019 · c8f1e9d · c8f1e9d
2 parents 81e5a31 + 1ebee35
commit c8f1e9d
Show file tree

Hide file tree

Showing 8 changed files with 179 additions and 3 deletions.
diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md
@@ -0,0 +1,42 @@
+## 🐛 Bug
+**Describe the bug**
+A clear and concise description of what the bug is.
+
+**To Reproduce**
+Steps to reproduce the behavior:
+1. Go to '...'
+2. Click on '....'
+3. Scroll down to '....'
+4. See error
+
+**Expected behavior**
+A clear and concise description of what you expected to happen.
+
+**Screenshots**
+If applicable, add screenshots to help explain your problem.
+
+**Environment**
+
+Please copy and paste the output from our
+[environment collection script](https://raw.githubusercontent.com/pytorch/pytorch/master/torch/utils/collect_env.py)
+(or fill out the checklist below manually).
+
+You can get the script and run it with:
+```
+wget https://raw.githubusercontent.com/pytorch/pytorch/master/torch/utils/collect_env.py
+# For security purposes, please check the contents of collect_env.py before running it.
+python collect_env.py
+python -c "import torchtext; print(\"torchtext version is \", torchtext.__version__)"
+```
+
+ - PyTorch Version (e.g., 1.0):
+ - OS (e.g., Linux):
+ - How you installed PyTorch (`conda`, `pip`, source):
+ - Build command you used (if compiling from source):
+ - Python version:
+ - CUDA/cuDNN version:
+ - GPU models and configuration:
+ - Any other relevant information:
+
+**Additional context**
+Add any other context about the problem here.
diff --git a/.github/ISSUE_TEMPLATE/documentation.md b/.github/ISSUE_TEMPLATE/documentation.md
@@ -0,0 +1,4 @@
+## 📚 Documentation
+
+**Description**
+<!-- A clear and concise description of what content in https://torchtext.readthedocs.io/en/latest/ is an issue. -->
diff --git a/.github/ISSUE_TEMPLATE/feature_request.md b/.github/ISSUE_TEMPLATE/feature_request.md
@@ -0,0 +1,18 @@
+## 🚀 Feature
+<!-- A clear and concise description of the feature proposal -->
+
+**Motivation**
+
+<!-- Please outline the motivation for the proposal. Is your feature request related to a problem? e.g., I'm always frustrated when [...]. If this is related to another GitHub issue, please link here too -->
+
+**Pitch**
+
+<!-- A clear and concise description of what you want to happen. -->
+
+**Alternatives**
+
+<!-- A clear and concise description of any alternative solutions or features you've considered, if any. -->
+
+**Additional context**
+
+<!-- Add any other context or screenshots about the feature request here. -->
diff --git a/.github/ISSUE_TEMPLATE/questions_help_support.md b/.github/ISSUE_TEMPLATE/questions_help_support.md
@@ -0,0 +1,4 @@
+## ❓ Questions and Help
+
+**Description**
+<!-- Please send questions or ask for help here. -->
diff --git a/torchtext/data/dataset.py b/torchtext/data/dataset.py
@@ -323,8 +323,22 @@ def stratify(examples, strata_field):
 
 
 def rationed_split(examples, train_ratio, test_ratio, val_ratio, rnd):
-    # Create a random permutation of examples, then split them
-    # by ratio x length slices for each of the train/test/dev? splits
+    """Create a random permutation of examples, then split them by ratios
+
+    Arguments:
+        examples: a list of data
+        train_ratio, test_ratio, val_ratio: split fractions.
+        rnd: a random shuffler
+
+    Examples:
+        >>> examples = []
+        >>> train_ratio, test_ratio, val_ratio = 0.7, 0.2, 0.1
+        >>> rnd = torchtext.data.dataset.RandomShuffler(None)
+        >>> train_examples, test_examples, valid_examples = \
+                torchtext.data.dataset.rationed_split(examples, train_ratio,
+                                                      test_ratio, val_ratio,
+                                                      rnd)
+    """
     N = len(examples)
     randperm = rnd(range(N))
     train_len = int(round(train_ratio * N))

diff --git a/torchtext/data/utils.py b/torchtext/data/utils.py
@@ -114,6 +114,29 @@ def dtype_to_attr(dtype):
     return dtype
 
 
+def generate_ngrams(token_list, ngrams):
+    """Return all n-grams of length 1 up to ``ngrams`` from a token list.
+
+    The n-grams are grouped by starting position: for every token, the
+    token itself comes first, followed by the progressively longer
+    n-grams that start at that token. Tokens inside an n-gram are joined
+    with a single space.
+
+    Arguments:
+        token_list: A list of string tokens.
+        ngrams: the maximum number of tokens in one n-gram. Values
+            below 2 yield the single tokens only.
+
+    Examples:
+        >>> token_list = ['here', 'we', 'are']
+        >>> torchtext.data.utils.generate_ngrams(token_list, 2)
+        ['here', 'here we', 'we', 'we are', 'are']
+    """
+    total = len(token_list)
+    collected = []
+    for start, token in enumerate(token_list):
+        collected.append(token)
+        # Longest n-gram starting here is capped by both `ngrams` and the
+        # end of the list.
+        last_end = min(start + ngrams, total)
+        for end in range(start + 2, last_end + 1):
+            collected.append(' '.join(token_list[start:end]))
+    return collected
+
+
 class RandomShuffler(object):
     """Use random functions while keeping track of the random state to make it
     reproducible and deterministic."""

diff --git a/torchtext/utils.py b/torchtext/utils.py
@@ -2,6 +2,8 @@
 import requests
 import csv
 from tqdm import tqdm
+import os
+import tarfile
 
 
 def reporthook(t):
@@ -25,7 +27,18 @@ def inner(b=1, bsize=1, tsize=None):
 
 
 def download_from_url(url, path):
-    """Download file, with logic (from tensor2tensor) for Google Drive"""
+    """Download file, with logic (from tensor2tensor) for Google Drive
+
+    Arguments:
+        url: the url for online Dataset
+        path: directory and filename for the downloaded dataset.
+
+    Examples:
+        >>> url = 'http://www.quest.dcs.shef.ac.uk/wmt16_files_mmt/validation.tar.gz'
+        >>> path = './validation.tar.gz'
+        >>> torchtext.utils.download_from_url(url, path)
+    """
+
     def process_response(r):
         chunk_size = 16 * 1024
         total_size = int(r.headers.get('Content-length', 0))
@@ -75,3 +88,31 @@ def unicode_csv_reader(unicode_csv_data, **kwargs):
 def utf_8_encoder(unicode_csv_data):
     for line in unicode_csv_data:
         yield line.encode('utf-8')
+
+
+def extract_archive(from_path, to_path=None, remove_finished=False):
+    """Extract a tar.gz archive.
+
+    Arguments:
+        from_path: the path of the tar.gz file to extract.
+        to_path: the directory the files are extracted into.
+            Default: the directory that contains from_path.
+        remove_finished: remove the original tar.gz file once it has been
+            extracted successfully. Default: False
+
+    Raises:
+        ValueError: if from_path does not end with ".tar.gz", or if the
+            archive contains a member (or link target) that would end up
+            outside of to_path.
+
+    Examples:
+        >>> url = 'http://www.quest.dcs.shef.ac.uk/wmt16_files_mmt/validation.tar.gz'
+        >>> from_path = './validation.tar.gz'
+        >>> to_path = './'
+        >>> torchtext.utils.download_from_url(url, from_path)
+        >>> torchtext.utils.extract_archive(from_path, to_path)
+    """
+    if to_path is None:
+        to_path = os.path.dirname(from_path)
+
+    if not from_path.endswith(".tar.gz"):
+        raise ValueError("Extraction of {} not supported".format(from_path))
+
+    dest_root = os.path.realpath(to_path)
+    # join(root, '') appends exactly one separator, also when root is '/'.
+    dest_prefix = os.path.join(dest_root, '')
+
+    def _inside_dest(relative_path):
+        resolved = os.path.realpath(os.path.join(dest_root, relative_path))
+        return resolved == dest_root or resolved.startswith(dest_prefix)
+
+    with tarfile.open(from_path, 'r:gz') as tar:
+        # Archives usually come from the network, so member names cannot be
+        # trusted: '../x', absolute paths or links pointing elsewhere would
+        # let extractall() write outside of to_path. Check everything
+        # before the first file is written.
+        for member in tar.getmembers():
+            if member.issym():
+                # Symlink targets are relative to the link's own directory.
+                link_target = os.path.join(os.path.dirname(member.name),
+                                           member.linkname)
+            elif member.islnk():
+                # Hard-link targets are relative to the archive root.
+                link_target = member.linkname
+            else:
+                link_target = None
+
+            escapes = not _inside_dest(member.name) or (
+                link_target is not None and not _inside_dest(link_target))
+            if escapes:
+                raise ValueError(
+                    "Refusing to extract {}: member {} would be placed "
+                    "outside of {}".format(from_path, member.name, to_path))
+        tar.extractall(path=to_path)
+
+    if remove_finished:
+        os.remove(from_path)
diff --git a/torchtext/vocab.py b/torchtext/vocab.py
@@ -13,6 +13,8 @@
 import tarfile
 
 from .utils import reporthook
+from collections import Counter, OrderedDict
+from itertools import chain
 
 logger = logging.getLogger(__name__)
 
@@ -216,6 +218,34 @@ def set_vectors(self, stoi, vectors, dim, unk_init=torch.Tensor.zero_):
                 self.vectors[i] = unk_init(self.vectors[i])
 
 
+def build_dictionary(dataset, field, data_name, **kwargs):
+    """Construct the Vocab object for the field from a dataset.
+
+    Arguments:
+        dataset: Dataset with the iterable data.
+        field: Field object with the information of the special tokens
+            (unk_token, pad_token, init_token, eos_token) and the
+            sequential flag.
+        data_name: The name of the data used to build vocab (e.g. 'text',
+            'label'). It must be an attribute of the dataset's examples.
+        Remaining keyword arguments: Passed to the constructor of Vocab.
+
+    Returns:
+        A Vocab built from the token counts, with the field's special
+        tokens that are not None (duplicates removed, order kept) passed
+        as specials.
+
+    Examples:
+        >>> field.vocab = build_dictionary(dataset, field, 'text')
+    """
+    counter = Counter()
+    for example in dataset:
+        tokens = getattr(example, data_name)
+        if not field.sequential:
+            tokens = [tokens]
+        try:
+            counter.update(tokens)
+        except TypeError:
+            # The items are unhashable (e.g. a list of token lists), so
+            # flatten one level and count the inner tokens. `tokens` is
+            # already the attribute value; it has no `.text` of its own.
+            counter.update(chain.from_iterable(tokens))
+    specials = list(OrderedDict.fromkeys(
+        tok for tok in [field.unk_token, field.pad_token, field.init_token,
+                        field.eos_token] if tok is not None))
+    return Vocab(counter, specials=specials, **kwargs)
+
+
 class SubwordVocab(Vocab):
 
     def __init__(self, counter, max_size=None, specials=['<pad>'],