
Commit eb5e39d: fixing and re-organizing pipelines (#1250)

parmeet committed Mar 24, 2021
1 parent be3f640
Showing 3 changed files with 47 additions and 43 deletions.
examples/data_pipeline/README.md (12 changes: 6 additions & 6 deletions)
@@ -1,6 +1,6 @@
# Data processing pipelines with torchtext

-This example shows a few data processing pipelines built from basic building blocks (like the tokenizer and vocab). The raw text data from `torchtext.experimental.datasets.raw.text_classification` are used as inputs for the performance benchmark. We also enable JIT support where possible.
+This example shows a few data processing pipelines built from basic building blocks (like the tokenizer and vocab). The raw text data from `torchtext.datasets` are used as inputs for the performance benchmark. We also enable JIT support where possible.
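For context, here is a minimal sketch of such a pipeline. It mirrors the experimental APIs this commit imports in pipelines.py (`basic_english_normalize`, `load_vocab_from_file`, `TextSequentialTransforms`); the module paths follow torchtext 0.9, and `vocab.txt` is a hypothetical one-token-per-line file:

    import torch
    from torchtext.experimental.transforms import (
        basic_english_normalize,
        TextSequentialTransforms,
    )
    from torchtext.experimental.vocab import load_vocab_from_file

    # Chain the two building blocks: raw string -> tokens -> token ids.
    tokenizer = basic_english_normalize()
    with open('vocab.txt', 'r') as f:  # hypothetical vocab file
        vocab = load_vocab_from_file(f)
    pipeline = TextSequentialTransforms(tokenizer, vocab)

    # The JIT support mentioned above: the whole pipeline scripts as one unit.
    jit_pipeline = torch.jit.script(pipeline)
    print(jit_pipeline('here is an example'))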


## SentencePiece
@@ -24,7 +24,7 @@ This pipeline example shows the application with the existing `Vocab` in torchtext

The command to run the pipeline:

-    python pipelines.py --pipeline torchtext
+    python pipelines.py --pipeline legacy_torchtext


## Experimental Torchtext
@@ -36,7 +36,7 @@ This pipeline example shows the application with the vocab text file from HuggingFace

The command to run the pipeline:

-    python pipelines.py --pipeline text_vocab
+    python pipelines.py --pipeline experimental_torchtext


## Legacy PyText
@@ -80,7 +80,7 @@ And the text and label pipeline are passed to TextClassificationPipeline. Since

The command to run the pipeline:

-    python pipelines.py --pipeline batch_torchtext
+    python pipelines.py --pipeline legacy_batch_torchtext


## Legacy FastText pretrained word vectors
@@ -92,7 +92,7 @@ This pipeline example shows the application with the pretrained word vector from

The command to run the pipeline:

-    python pipelines.py --pipeline fasttext
+    python pipelines.py --pipeline legacy_fasttext


## Experimental FastText pretrained word vectors
@@ -104,7 +104,7 @@ This pipeline example shows the application with the pretrained word vector using

The command to run the pipeline:

-    python pipelines.py --pipeline fasttext
+    python pipelines.py --pipeline experimental_fasttext

Here are the times in seconds for the pipelines above:

examples/data_pipeline/dataset.py (4 changes: 2 additions & 2 deletions)
@@ -1,12 +1,12 @@
import torch
-from torchtext.datasets import text_classification as raw
+from torchtext.datasets import DATASETS


class BatchTextClassificationData(torch.utils.data.IterableDataset):

    def __init__(self, dataset_name, batch_size=16):
        super(BatchTextClassificationData, self).__init__()
-        self._iterator = raw.DATASETS[dataset_name]()[0]  # Load train dataset only
+        self._iterator = DATASETS[dataset_name](split='train')
        self.batch_size = batch_size

    def __iter__(self):
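For reference, a sketch of the new-style dataset access used above, assuming the torchtext 0.9-era `DATASETS` registry in which each entry is a factory that accepts a `split` argument and yields `(label, text)` pairs:

    from torchtext.datasets import DATASETS

    # Request only the train split up front, instead of calling the factory
    # and indexing a (train, test) tuple as the old raw.DATASETS form did.
    train_iter = DATASETS['AG_NEWS'](split='train')
    label, text = next(iter(train_iter))
    print(label, text[:50])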
examples/data_pipeline/pipelines.py (74 changes: 39 additions & 35 deletions)
@@ -12,6 +12,7 @@
    TextSequentialTransforms,
    sentencepiece_tokenizer,
    load_sp_model,
+    PRETRAINED_SP_MODEL,
)
from torchtext.data.utils import get_tokenizer
from torchtext.experimental.functional import (
@@ -20,14 +21,17 @@
from torchtext.experimental.vectors import FastText as FastTextExperimental
from torchtext.experimental.vocab import load_vocab_from_file
from torchtext.vocab import FastText

+from torchtext.utils import download_from_url
import argparse
-from torchtext.datasets import text_classification as raw
+from torchtext.datasets import DATASETS
import time
from torch.utils.data import DataLoader


-def build_sp_pipeline(spm_file):
+def build_sp_pipeline(args):
+    spm_file = args.spm_filename
+    if spm_file in PRETRAINED_SP_MODEL:
+        spm_file = download_from_url(PRETRAINED_SP_MODEL[spm_file])
    tokenizer = sentencepiece_tokenizer(spm_file)
    vocab = PretrainedSPVocab(load_sp_model(spm_file))

@@ -38,7 +42,8 @@ def build_sp_pipeline(spm_file):
    return pipeline, pipeline, jit_pipeline
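With this change every builder takes the parsed argument namespace directly. A hedged usage sketch, run inside pipelines.py or with its definitions imported (it assumes `'text_unigram_25000'` is a key of `PRETRAINED_SP_MODEL`, as the new `--spm-filename` default further down suggests):

    from argparse import Namespace

    # Mimic the parsed command-line arguments.
    args = Namespace(spm_filename='text_unigram_25000')
    pipeline, torchbind_pipeline, jit_pipeline = build_sp_pipeline(args)
    print(pipeline('here is an example'))

Keying on `PRETRAINED_SP_MODEL` lets the same flag accept either a pretrained model name (downloaded on demand) or a path to a local SentencePiece model file.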


-def build_legacy_torchtext_vocab_pipeline(vocab_file):
+def build_legacy_torchtext_vocab_pipeline(args):
+    vocab_file = args.vocab_filename
    tokenizer = get_tokenizer("basic_english")
    from torchtext.vocab import build_vocab_from_iterator

@@ -53,17 +58,19 @@ def token_iterator(vocab_file):
    return pipeline, None, None


-def build_experimental_torchtext_pipeline(hf_vocab_file):
+def build_experimental_torchtext_pipeline(args):
+    vocab_file = args.vocab_filename
    tokenizer = basic_english_normalize()
-    with open(hf_vocab_file, 'r') as f:
+    with open(vocab_file, 'r') as f:
        vocab = load_vocab_from_file(f)
    pipeline = TextSequentialTransforms(tokenizer, vocab)
    jit_pipeline = torch.jit.script(pipeline)
    print('jit experimental torchtext pipeline success!')
    return pipeline, pipeline, jit_pipeline
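Each builder returns a triple that the driver below treats as (pybind, torchbind, jit) pipelines; builders that cannot be scripted return `None` for the latter two, and the benchmark skips those modes. A sketch of treating the three uniformly (the `lines` input is hypothetical):

    lines = ['here is an example', 'another line']
    for name, p in [('pybind', pipeline),
                    ('torchbind', torchbind_pipeline),
                    ('jit', jit_pipeline)]:
        if p is None:
            continue  # mode not supported by this pipeline
        ids = [p(line) for line in lines]
        print(name, ids[0])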


-def build_legacy_batch_torchtext_vocab_pipeline(vocab_file):
+def build_legacy_batch_torchtext_vocab_pipeline(args):
+    vocab_file = args.vocab_filename
    tokenizer = get_tokenizer("basic_english")
    from torchtext.vocab import build_vocab_from_iterator

@@ -78,7 +85,8 @@ def token_iterator(vocab_file):
    return text_pipeline, None, None


-def build_legacy_pytext_vocab_pipeline(vocab_file):
+def build_legacy_pytext_vocab_pipeline(args):
+    vocab_file = args.vocab_filename
    from pytext.data.utils import Vocabulary

    tokenizer = get_tokenizer("basic_english")
@@ -92,7 +100,8 @@ def build_legacy_pytext_vocab_pipeline(vocab_file):
    return pipeline, None, None


-def build_legacy_pytext_script_vocab_pipeline(vocab_file):
+def build_legacy_pytext_script_vocab_pipeline(args):
+    vocab_file = args.vocab_filename
    from pytext.torchscript.vocab import ScriptVocabulary

    tokenizer = basic_english_normalize()
@@ -108,7 +117,8 @@ def build_legacy_pytext_script_vocab_pipeline(vocab_file):
    return pipeline, pipeline, jit_pipeline


-def build_experimental_pytext_script_pipeline(vocab_file):
+def build_experimental_pytext_script_pipeline(args):
+    vocab_file = args.vocab_filename
    import os
    import sys
    # this is needed because we want to add 'torchtext/examples/vocab' directory to the
@@ -129,15 +139,15 @@ def build_experimental_pytext_script_pipeline(vocab_file):
    return pipeline, pipeline, jit_pipeline


-def build_legacy_fasttext_vector_pipeline():
+def build_legacy_fasttext_vector_pipeline(args):
    tokenizer = get_tokenizer("basic_english")
    vector = FastText()

    pipeline = sequential_transforms(tokenizer, vector.get_vecs_by_tokens)
    return pipeline, None, None


-def build_experimental_fasttext_vector_pipeline():
+def build_experimental_fasttext_vector_pipeline(args):
    tokenizer = basic_english_normalize()
    vector = FastTextExperimental()
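Both FastText builders map tokens straight to pretrained vectors rather than to vocab indices. A minimal sketch of the legacy path (it assumes `torchtext.vocab.FastText`, which downloads its pretrained vectors on first use, and the `get_vecs_by_tokens` method of torchtext's `Vectors`):

    from torchtext.data.utils import get_tokenizer
    from torchtext.vocab import FastText

    tokenizer = get_tokenizer('basic_english')
    vector = FastText()  # downloads pretrained vectors on first use

    # One 300-dimensional vector per token.
    vecs = vector.get_vecs_by_tokens(tokenizer('here is an example'))
    print(vecs.shape)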

@@ -166,44 +176,38 @@ def collate_fn(data_batch):


def generate_dataset(args):
-    train, test = raw.DATASETS[args.dataset]()
+    train, test = DATASETS[args.dataset]()
    return [_data for _data in train], [_data for _data in test]


+PIPELINES = {
+    'sentencepiece': build_sp_pipeline,
+    'experimental_torchtext': build_experimental_torchtext_pipeline,
+    'legacy_torchtext': build_legacy_torchtext_vocab_pipeline,
+    'experimental_fasttext': build_experimental_fasttext_vector_pipeline,
+    'legacy_fasttext': build_legacy_fasttext_vector_pipeline,
+    'experimental_pytext_script_vocab': build_experimental_pytext_script_pipeline,
+    'legacy_pytext_vocab': build_legacy_pytext_vocab_pipeline,
+    'legacy_pytext_script_vocab': build_legacy_pytext_script_vocab_pipeline,
+    'legacy_batch_torchtext': build_legacy_batch_torchtext_vocab_pipeline,
+}

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='Data processing pipelines')
    parser.add_argument('--pipeline', type=str, default='sentencepiece',
                        help='The name of the pipeline')
    parser.add_argument('--dataset', type=str, default='AG_NEWS',
                        help='Dataset for performance benchmark')
-    parser.add_argument('--spm-filename', type=str, default='m_user.model',
+    parser.add_argument('--spm-filename', type=str, default='text_unigram_25000',
                        help='The filename of the sentencepiece model')
    parser.add_argument('--vocab-filename', type=str, default='vocab.txt',
                        help='The name of the vocab file')
    args = parser.parse_args()

-    if args.pipeline == 'sentencepiece':
-        pipeline, torchbind_pipeline, jit_pipeline = build_sp_pipeline(args.spm_filename)
-    elif args.pipeline == 'experimental_torchtext':
-        pipeline, torchbind_pipeline, jit_pipeline = build_experimental_torchtext_pipeline(args.vocab_filename)
-    elif args.pipeline == 'experimental_pytext_script_vocab':
-        pipeline, torchbind_pipeline, jit_pipeline = build_experimental_pytext_script_pipeline(args.vocab_filename)
-    elif args.pipeline == 'experimental_fasttext':
-        pipeline, torchbind_pipeline, jit_pipeline = build_experimental_fasttext_vector_pipeline()
-    elif args.pipeline == 'legacy_torchtext':
-        pipeline, torchbind_pipeline, jit_pipeline = build_legacy_torchtext_vocab_pipeline(args.vocab_filename)
-    elif args.pipeline == 'legacy_pytext_vocab':
-        pipeline, torchbind_pipeline, jit_pipeline = build_legacy_pytext_vocab_pipeline(args.vocab_filename)
-    elif args.pipeline == 'legacy_pytext_script_vocab':
-        pipeline, torchbind_pipeline, jit_pipeline = build_legacy_pytext_script_vocab_pipeline(args.vocab_filename)
-    elif args.pipeline == 'legacy_fasttext':
-        pipeline, torchbind_pipeline, jit_pipeline = build_legacy_fasttext_vector_pipeline()
-    elif args.pipeline == 'legacy_batch_torchtext':
-        pipeline, torchbind_pipeline, jit_pipeline = build_legacy_batch_torchtext_vocab_pipeline(args.vocab_filename)
-    else:
-        print("pipeline is not supported. Current pipelines include sentencepiece, experimental_torchtext, " +
-              "experimental_fasttext, legacy_pytext, experimental_fasttext, legacy_torchtext, legacy_batch_torchtext")
+    if args.pipeline not in PIPELINES:
+        raise KeyError('Pipeline {} is not supported. Valid pipelines are {}'.format(args.pipeline, list(PIPELINES.keys())))

+    pipeline, torchbind_pipeline, jit_pipeline = PIPELINES[args.pipeline](args)
    if pipeline is not None:
        print("Test eager mode for pipeline with pybind", args.pipeline)
        train, test = generate_dataset(args)
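The table-driven dispatch above is the heart of the re-organization: because every builder now shares the uniform signature `build_*(args)`, adding a pipeline means one registry entry instead of a new `elif` branch, and the error message enumerates the valid names automatically. A minimal standalone sketch of the pattern, with hypothetical names:

    import argparse

    def build_upper(args):
        # A trivial 'pipeline' builder with the uniform signature.
        return lambda line: line.upper()

    PIPELINES = {'upper': build_upper}  # name -> builder

    parser = argparse.ArgumentParser()
    parser.add_argument('--pipeline', type=str, default='upper')
    args = parser.parse_args()

    if args.pipeline not in PIPELINES:
        raise KeyError('Pipeline {} is not supported. Valid pipelines are {}'.format(
            args.pipeline, list(PIPELINES.keys())))
    pipeline = PIPELINES[args.pipeline](args)
    print(pipeline('here is an example'))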
