
Commit eb5e39d: fixing and re-organizing pipelines (#1250)

parmeet committed Mar 24, 2021
1 parent be3f640
Showing 3 changed files with 47 additions and 43 deletions.
examples/data_pipeline/README.md (12 changes: 6 additions & 6 deletions)
@@ -1,6 +1,6 @@
# Data processing pipelines with torchtext

-This example shows a few data processing pipelines built from basic building blocks (like the tokenizer and vocab). The raw text data from `torchtext.experimental.datasets.raw.text_classification` are used as inputs for the performance benchmark. We also enable JIT support where possible.
+This example shows a few data processing pipelines built from basic building blocks (like the tokenizer and vocab). The raw text data from `torchtext.datasets` are used as inputs for the performance benchmark. We also enable JIT support where possible.
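For context, here is a minimal sketch of such a pipeline. It mirrors the experimental APIs this commit imports in pipelines.py (`basic_english_normalize`, `load_vocab_from_file`, `TextSequentialTransforms`); the module paths follow torchtext 0.9, and `vocab.txt` is a hypothetical one-token-per-line file:

    import torch
    from torchtext.experimental.transforms import (
        basic_english_normalize,
        TextSequentialTransforms,
    )
    from torchtext.experimental.vocab import load_vocab_from_file

    # Chain the two building blocks: raw string -> tokens -> token ids.
    tokenizer = basic_english_normalize()
    with open('vocab.txt', 'r') as f:  # hypothetical vocab file
        vocab = load_vocab_from_file(f)
    pipeline = TextSequentialTransforms(tokenizer, vocab)

    # The JIT support mentioned above: the whole pipeline scripts as one unit.
    jit_pipeline = torch.jit.script(pipeline)
    print(jit_pipeline('here is an example'))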


## SentencePiece
@@ -24,7 +24,7 @@ This pipeline example shows the application with the existing `Vocab` in torchtext

The command to run the pipeline:

-    python pipelines.py --pipeline torchtext
+    python pipelines.py --pipeline legacy_torchtext


## Experimental Torchtext
@@ -36,7 +36,7 @@ This pipeline example shows the application with the vocab text file from HuggingFace

The command to run the pipeline:

-    python pipelines.py --pipeline text_vocab
+    python pipelines.py --pipeline experimental_torchtext


## Legacy PyText
@@ -80,7 +80,7 @@ And the text and label pipeline are passed to TextClassificationPipeline. Since

The command to run the pipeline:

-    python pipelines.py --pipeline batch_torchtext
+    python pipelines.py --pipeline legacy_batch_torchtext


## Legacy FastText pretrained word vectors
@@ -92,7 +92,7 @@ This pipeline example shows the application with the pretrained word vector from

The command to run the pipeline:

-    python pipelines.py --pipeline fasttext
+    python pipelines.py --pipeline legacy_fasttext


## Experimental FastText pretrained word vectors
@@ -104,7 +104,7 @@ This pipeline example shows the application with the pretrained word vector using

The command to run the pipeline:

-    python pipelines.py --pipeline fasttext
+    python pipelines.py --pipeline experimental_fasttext

Here are the times in seconds for the pipelines above:

examples/data_pipeline/dataset.py (4 changes: 2 additions & 2 deletions)
@@ -1,12 +1,12 @@
import torch
-from torchtext.datasets import text_classification as raw
+from torchtext.datasets import DATASETS


class BatchTextClassificationData(torch.utils.data.IterableDataset):

    def __init__(self, dataset_name, batch_size=16):
        super(BatchTextClassificationData, self).__init__()
-        self._iterator = raw.DATASETS[dataset_name]()[0]  # Load train dataset only
+        self._iterator = DATASETS[dataset_name](split='train')
        self.batch_size = batch_size

    def __iter__(self):
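For reference, a sketch of the new-style dataset access used above, assuming the torchtext 0.9-era `DATASETS` registry in which each entry is a factory that accepts a `split` argument and yields `(label, text)` pairs:

    from torchtext.datasets import DATASETS

    # Request only the train split up front, instead of calling the factory
    # and indexing a (train, test) tuple as the old raw.DATASETS form did.
    train_iter = DATASETS['AG_NEWS'](split='train')
    label, text = next(iter(train_iter))
    print(label, text[:50])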
examples/data_pipeline/pipelines.py (74 changes: 39 additions & 35 deletions)
@@ -12,6 +12,7 @@
    TextSequentialTransforms,
    sentencepiece_tokenizer,
    load_sp_model,
+    PRETRAINED_SP_MODEL,
)
from torchtext.data.utils import get_tokenizer
from torchtext.experimental.functional import (
@@ -20,14 +21,17 @@
from torchtext.experimental.vectors import FastText as FastTextExperimental
from torchtext.experimental.vocab import load_vocab_from_file
from torchtext.vocab import FastText

+from torchtext.utils import download_from_url
import argparse
-from torchtext.datasets import text_classification as raw
+from torchtext.datasets import DATASETS
import time
from torch.utils.data import DataLoader


-def build_sp_pipeline(spm_file):
+def build_sp_pipeline(args):
+    spm_file = args.spm_filename
+    if spm_file in PRETRAINED_SP_MODEL:
+        spm_file = download_from_url(PRETRAINED_SP_MODEL[spm_file])
    tokenizer = sentencepiece_tokenizer(spm_file)
    vocab = PretrainedSPVocab(load_sp_model(spm_file))

@@ -38,7 +42,8 @@ def build_sp_pipeline(spm_file):
    return pipeline, pipeline, jit_pipeline
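With this change every builder takes the parsed argument namespace directly. A hedged usage sketch, run inside pipelines.py or with its definitions imported (it assumes `'text_unigram_25000'` is a key of `PRETRAINED_SP_MODEL`, as the new `--spm-filename` default further down suggests):

    from argparse import Namespace

    # Mimic the parsed command-line arguments.
    args = Namespace(spm_filename='text_unigram_25000')
    pipeline, torchbind_pipeline, jit_pipeline = build_sp_pipeline(args)
    print(pipeline('here is an example'))

Keying on `PRETRAINED_SP_MODEL` lets the same flag accept either a pretrained model name (downloaded on demand) or a path to a local SentencePiece model file.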


-def build_legacy_torchtext_vocab_pipeline(vocab_file):
+def build_legacy_torchtext_vocab_pipeline(args):
+    vocab_file = args.vocab_filename
    tokenizer = get_tokenizer("basic_english")
    from torchtext.vocab import build_vocab_from_iterator

@@ -53,17 +58,19 @@ def token_iterator(vocab_file):
    return pipeline, None, None


-def build_experimental_torchtext_pipeline(hf_vocab_file):
+def build_experimental_torchtext_pipeline(args):
+    vocab_file = args.vocab_filename
    tokenizer = basic_english_normalize()
-    with open(hf_vocab_file, 'r') as f:
+    with open(vocab_file, 'r') as f:
        vocab = load_vocab_from_file(f)
    pipeline = TextSequentialTransforms(tokenizer, vocab)
    jit_pipeline = torch.jit.script(pipeline)
    print('jit experimental torchtext pipeline success!')
    return pipeline, pipeline, jit_pipeline
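Each builder returns a triple that the driver below treats as (pybind, torchbind, jit) pipelines; builders that cannot be scripted return `None` for the latter two, and the benchmark skips those modes. A sketch of treating the three uniformly (the `lines` input is hypothetical):

    lines = ['here is an example', 'another line']
    for name, p in [('pybind', pipeline),
                    ('torchbind', torchbind_pipeline),
                    ('jit', jit_pipeline)]:
        if p is None:
            continue  # mode not supported by this pipeline
        ids = [p(line) for line in lines]
        print(name, ids[0])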


-def build_legacy_batch_torchtext_vocab_pipeline(vocab_file):
+def build_legacy_batch_torchtext_vocab_pipeline(args):
+    vocab_file = args.vocab_filename
    tokenizer = get_tokenizer("basic_english")
    from torchtext.vocab import build_vocab_from_iterator

@@ -78,7 +85,8 @@ def token_iterator(vocab_file):
    return text_pipeline, None, None


-def build_legacy_pytext_vocab_pipeline(vocab_file):
+def build_legacy_pytext_vocab_pipeline(args):
+    vocab_file = args.vocab_filename
    from pytext.data.utils import Vocabulary

    tokenizer = get_tokenizer("basic_english")
@@ -92,7 +100,8 @@ def build_legacy_pytext_vocab_pipeline(vocab_file):
    return pipeline, None, None


-def build_legacy_pytext_script_vocab_pipeline(vocab_file):
+def build_legacy_pytext_script_vocab_pipeline(args):
+    vocab_file = args.vocab_filename
    from pytext.torchscript.vocab import ScriptVocabulary

    tokenizer = basic_english_normalize()
@@ -108,7 +117,8 @@ def build_legacy_pytext_script_vocab_pipeline(vocab_file):
    return pipeline, pipeline, jit_pipeline


-def build_experimental_pytext_script_pipeline(vocab_file):
+def build_experimental_pytext_script_pipeline(args):
+    vocab_file = args.vocab_filename
    import os
    import sys
    # this is needed because we want to add 'torchtext/examples/vocab' directory to the
@@ -129,15 +139,15 @@ def build_experimental_pytext_script_pipeline(vocab_file):
    return pipeline, pipeline, jit_pipeline


-def build_legacy_fasttext_vector_pipeline():
+def build_legacy_fasttext_vector_pipeline(args):
    tokenizer = get_tokenizer("basic_english")
    vector = FastText()

    pipeline = sequential_transforms(tokenizer, vector.get_vecs_by_tokens)
    return pipeline, None, None


-def build_experimental_fasttext_vector_pipeline():
+def build_experimental_fasttext_vector_pipeline(args):
    tokenizer = basic_english_normalize()
    vector = FastTextExperimental()
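Both FastText builders map tokens straight to pretrained vectors rather than to vocab indices. A minimal sketch of the legacy path (it assumes `torchtext.vocab.FastText`, which downloads its pretrained vectors on first use, and the `get_vecs_by_tokens` method of torchtext's `Vectors`):

    from torchtext.data.utils import get_tokenizer
    from torchtext.vocab import FastText

    tokenizer = get_tokenizer('basic_english')
    vector = FastText()  # downloads pretrained vectors on first use

    # One 300-dimensional vector per token.
    vecs = vector.get_vecs_by_tokens(tokenizer('here is an example'))
    print(vecs.shape)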

@@ -166,44 +176,38 @@ def collate_fn(data_batch):


def generate_dataset(args):
-    train, test = raw.DATASETS[args.dataset]()
+    train, test = DATASETS[args.dataset]()
    return [_data for _data in train], [_data for _data in test]


+PIPELINES = {
+    'sentencepiece': build_sp_pipeline,
+    'experimental_torchtext': build_experimental_torchtext_pipeline,
+    'legacy_torchtext': build_legacy_torchtext_vocab_pipeline,
+    'experimental_fasttext': build_experimental_fasttext_vector_pipeline,
+    'legacy_fasttext': build_legacy_fasttext_vector_pipeline,
+    'experimental_pytext_script_vocab': build_experimental_pytext_script_pipeline,
+    'legacy_pytext_vocab': build_legacy_pytext_vocab_pipeline,
+    'legacy_pytext_script_vocab': build_legacy_pytext_script_vocab_pipeline,
+    'legacy_batch_torchtext': build_legacy_batch_torchtext_vocab_pipeline,
+}

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='Data processing pipelines')
    parser.add_argument('--pipeline', type=str, default='sentencepiece',
                        help='The name of the pipeline')
    parser.add_argument('--dataset', type=str, default='AG_NEWS',
                        help='Dataset for performance benchmark')
-    parser.add_argument('--spm-filename', type=str, default='m_user.model',
+    parser.add_argument('--spm-filename', type=str, default='text_unigram_25000',
                        help='The filename of the sentencepiece model')
    parser.add_argument('--vocab-filename', type=str, default='vocab.txt',
                        help='The name of the vocab file')
    args = parser.parse_args()

-    if args.pipeline == 'sentencepiece':
-        pipeline, torchbind_pipeline, jit_pipeline = build_sp_pipeline(args.spm_filename)
-    elif args.pipeline == 'experimental_torchtext':
-        pipeline, torchbind_pipeline, jit_pipeline = build_experimental_torchtext_pipeline(args.vocab_filename)
-    elif args.pipeline == 'experimental_pytext_script_vocab':
-        pipeline, torchbind_pipeline, jit_pipeline = build_experimental_pytext_script_pipeline(args.vocab_filename)
-    elif args.pipeline == 'experimental_fasttext':
-        pipeline, torchbind_pipeline, jit_pipeline = build_experimental_fasttext_vector_pipeline()
-    elif args.pipeline == 'legacy_torchtext':
-        pipeline, torchbind_pipeline, jit_pipeline = build_legacy_torchtext_vocab_pipeline(args.vocab_filename)
-    elif args.pipeline == 'legacy_pytext_vocab':
-        pipeline, torchbind_pipeline, jit_pipeline = build_legacy_pytext_vocab_pipeline(args.vocab_filename)
-    elif args.pipeline == 'legacy_pytext_script_vocab':
-        pipeline, torchbind_pipeline, jit_pipeline = build_legacy_pytext_script_vocab_pipeline(args.vocab_filename)
-    elif args.pipeline == 'legacy_fasttext':
-        pipeline, torchbind_pipeline, jit_pipeline = build_legacy_fasttext_vector_pipeline()
-    elif args.pipeline == 'legacy_batch_torchtext':
-        pipeline, torchbind_pipeline, jit_pipeline = build_legacy_batch_torchtext_vocab_pipeline(args.vocab_filename)
-    else:
-        print("pipeline is not supported. Current pipelines include sentencepiece, experimental_torchtext, " +
-              "experimental_fasttext, legacy_pytext, experimental_fasttext, legacy_torchtext, legacy_batch_torchtext")
+    if args.pipeline not in PIPELINES:
+        raise KeyError('Pipeline {} is not supported. Valid pipelines are {}'.format(args.pipeline, list(PIPELINES.keys())))

+    pipeline, torchbind_pipeline, jit_pipeline = PIPELINES[args.pipeline](args)
    if pipeline is not None:
        print("Test eager mode for pipeline with pybind", args.pipeline)
        train, test = generate_dataset(args)
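The table-driven dispatch above is the heart of the re-organization: because every builder now shares the uniform signature `build_*(args)`, adding a pipeline means one registry entry instead of a new `elif` branch, and the error message enumerates the valid names automatically. A minimal standalone sketch of the pattern, with hypothetical names:

    import argparse

    def build_upper(args):
        # A trivial 'pipeline' builder with the uniform signature.
        return lambda line: line.upper()

    PIPELINES = {'upper': build_upper}  # name -> builder

    parser = argparse.ArgumentParser()
    parser.add_argument('--pipeline', type=str, default='upper')
    args = parser.parse_args()

    if args.pipeline not in PIPELINES:
        raise KeyError('Pipeline {} is not supported. Valid pipelines are {}'.format(
            args.pipeline, list(PIPELINES.keys())))
    pipeline = PIPELINES[args.pipeline](args)
    print(pipeline('here is an example'))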
