Skip to content

Commit

Permalink
Add Torchdata as a requirement and remove conditional imports of Torchdata (#1961) (#1962)
Browse files Browse the repository at this point in the history

* Add Torchdata as a requirement and remove conditional imports of Torchdata

* Add torchdata dep to meta.yaml
  • Loading branch information
abhinavarora committed Oct 26, 2022
1 parent e2b27f9 commit 771b5a2
Show file tree
Hide file tree
Showing 34 changed files with 88 additions and 151 deletions.
1 change: 1 addition & 0 deletions packaging/torchtext/meta.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ requirements:
- python
- requests
- tqdm
- torchdata
{{ environ.get('CONDA_PYTORCH_CONSTRAINT') }}

build:
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -100,7 +100,7 @@ def run(self):
description="Text utilities and datasets for PyTorch",
long_description=read("README.rst"),
license="BSD",
install_requires=["tqdm", "requests", pytorch_package_dep, "numpy"],
install_requires=["tqdm", "requests", pytorch_package_dep, "numpy", "torchdata"],
python_requires=">=3.7",
classifiers=[
"Programming Language :: Python :: 3.7",
Expand Down
5 changes: 1 addition & 4 deletions torchtext/_download_hooks.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,9 @@

# This is to allow monkey-patching in fbcode
from torch.hub import load_state_dict_from_url # noqa
from torchtext._internal.module_utils import is_module_available
from torchdata.datapipes.iter import HttpReader, GDriveReader # noqa F401
from tqdm import tqdm

if is_module_available("torchdata"):
from torchdata.datapipes.iter import HttpReader, GDriveReader # noqa F401


def _stream_response(r, chunk_size=16 * 1024):
total_size = int(r.headers.get("Content-length", 0))
Expand Down
6 changes: 2 additions & 4 deletions torchtext/datasets/ag_news.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,16 +2,14 @@
from functools import partial
from typing import Union, Tuple

from torchdata.datapipes.iter import FileOpener, IterableWrapper
from torchtext._download_hooks import HttpReader
from torchtext._internal.module_utils import is_module_available
from torchtext.data.datasets_utils import (
_wrap_split_argument,
_create_dataset_directory,
)

if is_module_available("torchdata"):
from torchdata.datapipes.iter import FileOpener, IterableWrapper
from torchtext._download_hooks import HttpReader

URL = {
"train": "https://raw.githubusercontent.com/mhjabreel/CharCnn_Keras/master/data/ag_news_csv/train.csv",
"test": "https://raw.githubusercontent.com/mhjabreel/CharCnn_Keras/master/data/ag_news_csv/test.csv",
Expand Down
6 changes: 2 additions & 4 deletions torchtext/datasets/amazonreviewfull.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,16 +2,14 @@
from functools import partial
from typing import Union, Tuple

from torchdata.datapipes.iter import FileOpener, IterableWrapper
from torchtext._download_hooks import GDriveReader
from torchtext._internal.module_utils import is_module_available
from torchtext.data.datasets_utils import (
_wrap_split_argument,
_create_dataset_directory,
)

if is_module_available("torchdata"):
from torchdata.datapipes.iter import FileOpener, IterableWrapper
from torchtext._download_hooks import GDriveReader

URL = "https://drive.google.com/uc?export=download&id=0Bz8a_Dbh9QhbZVhsUnRWRDhETzA"

MD5 = "57d28bd5d930e772930baddf36641c7c"
Expand Down
6 changes: 2 additions & 4 deletions torchtext/datasets/amazonreviewpolarity.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,16 +2,14 @@
from functools import partial
from typing import Union, Tuple

from torchdata.datapipes.iter import FileOpener, IterableWrapper
from torchtext._download_hooks import GDriveReader
from torchtext._internal.module_utils import is_module_available
from torchtext.data.datasets_utils import (
_wrap_split_argument,
_create_dataset_directory,
)

if is_module_available("torchdata"):
from torchdata.datapipes.iter import FileOpener, IterableWrapper
from torchtext._download_hooks import GDriveReader

URL = "https://drive.google.com/uc?export=download&id=0Bz8a_Dbh9QhbaW12WVVZS2drcnM"

MD5 = "fe39f8b653cada45afd5792e0f0e8f9b"
Expand Down
7 changes: 2 additions & 5 deletions torchtext/datasets/cc100.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,12 @@
import os.path
from functools import partial

from torchtext._internal.module_utils import is_module_available
from torchdata.datapipes.iter import FileOpener, IterableWrapper
from torchtext._download_hooks import HttpReader
from torchtext.data.datasets_utils import (
_create_dataset_directory,
)

if is_module_available("torchdata"):
from torchdata.datapipes.iter import FileOpener, IterableWrapper
from torchtext._download_hooks import HttpReader

URL = "http://data.statmt.org/cc-100/%s.txt.xz"

VALID_CODES = {
Expand Down
14 changes: 6 additions & 8 deletions torchtext/datasets/cnndm.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,20 +3,18 @@
from functools import partial
from typing import Union, Set, Tuple

from torchdata.datapipes.iter import (
FileOpener,
IterableWrapper,
OnlineReader,
GDriveReader,
)
from torchtext._internal.module_utils import is_module_available
from torchtext.data.datasets_utils import (
_wrap_split_argument,
_create_dataset_directory,
)

if is_module_available("torchdata"):
from torchdata.datapipes.iter import (
FileOpener,
IterableWrapper,
OnlineReader,
GDriveReader,
)

DATASET_NAME = "CNNDM"

SPLIT_LIST = {
Expand Down
6 changes: 2 additions & 4 deletions torchtext/datasets/cola.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,13 +3,11 @@
from functools import partial
from typing import Union, Tuple

from torchdata.datapipes.iter import FileOpener, IterableWrapper
from torchtext._download_hooks import HttpReader
from torchtext._internal.module_utils import is_module_available
from torchtext.data.datasets_utils import _create_dataset_directory, _wrap_split_argument

if is_module_available("torchdata"):
from torchdata.datapipes.iter import FileOpener, IterableWrapper
from torchtext._download_hooks import HttpReader

URL = "https://nyu-mll.github.io/CoLA/cola_public_1.1.zip"

MD5 = "9f6d88c3558ec424cd9d66ea03589aba"
Expand Down
6 changes: 2 additions & 4 deletions torchtext/datasets/conll2000chunking.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,16 +2,14 @@
from functools import partial
from typing import Union, Tuple

from torchdata.datapipes.iter import FileOpener, IterableWrapper
from torchtext._download_hooks import HttpReader
from torchtext._internal.module_utils import is_module_available
from torchtext.data.datasets_utils import (
_wrap_split_argument,
_create_dataset_directory,
)

if is_module_available("torchdata"):
from torchdata.datapipes.iter import FileOpener, IterableWrapper
from torchtext._download_hooks import HttpReader

URL = {
"train": "https://www.clips.uantwerpen.be/conll2000/chunking/train.txt.gz",
"test": "https://www.clips.uantwerpen.be/conll2000/chunking/test.txt.gz",
Expand Down
6 changes: 2 additions & 4 deletions torchtext/datasets/dbpedia.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,16 +2,14 @@
from functools import partial
from typing import Union, Tuple

from torchdata.datapipes.iter import FileOpener, IterableWrapper
from torchtext._download_hooks import GDriveReader
from torchtext._internal.module_utils import is_module_available
from torchtext.data.datasets_utils import (
_wrap_split_argument,
_create_dataset_directory,
)

if is_module_available("torchdata"):
from torchdata.datapipes.iter import FileOpener, IterableWrapper
from torchtext._download_hooks import GDriveReader

URL = "https://drive.google.com/uc?export=download&id=0Bz8a_Dbh9QhbQ2Vic1kxMmZZQ1k"

MD5 = "dca7b1ae12b1091090db52aa7ec5ca64"
Expand Down
6 changes: 2 additions & 4 deletions torchtext/datasets/enwik9.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,11 @@
import os
from functools import partial

from torchdata.datapipes.iter import FileOpener, IterableWrapper
from torchtext._download_hooks import HttpReader
from torchtext._internal.module_utils import is_module_available
from torchtext.data.datasets_utils import _create_dataset_directory

if is_module_available("torchdata"):
from torchdata.datapipes.iter import FileOpener, IterableWrapper
from torchtext._download_hooks import HttpReader

URL = "http://mattmahoney.net/dc/enwik9.zip"

MD5 = "3e773f8a1577fda2e27f871ca17f31fd"
Expand Down
6 changes: 2 additions & 4 deletions torchtext/datasets/imdb.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,14 +3,12 @@
from pathlib import Path
from typing import Tuple, Union

from torchdata.datapipes.iter import FileOpener, IterableWrapper
from torchtext._download_hooks import HttpReader
from torchtext._internal.module_utils import is_module_available
from torchtext.data.datasets_utils import _create_dataset_directory
from torchtext.data.datasets_utils import _wrap_split_argument

if is_module_available("torchdata"):
from torchdata.datapipes.iter import FileOpener, IterableWrapper
from torchtext._download_hooks import HttpReader

URL = "http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"

MD5 = "7c2ac02c03563afcf9b574c7e56c153a"
Expand Down
6 changes: 2 additions & 4 deletions torchtext/datasets/iwslt2016.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
import os
from functools import partial

from torchdata.datapipes.iter import FileOpener, IterableWrapper
from torchtext._download_hooks import GDriveReader
from torchtext._internal.module_utils import is_module_available
from torchtext.data.datasets_utils import (
_clean_files,
Expand All @@ -9,10 +11,6 @@
_wrap_split_argument,
)

if is_module_available("torchdata"):
from torchdata.datapipes.iter import FileOpener, IterableWrapper
from torchtext._download_hooks import GDriveReader

URL = "https://drive.google.com/uc?id=1l5y6Giag9aRPwGtuZHswh3w5v3qEz8D8"

_PATH = "2016-01.tgz"
Expand Down
6 changes: 2 additions & 4 deletions torchtext/datasets/iwslt2017.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
import os
from functools import partial

from torchdata.datapipes.iter import FileOpener, IterableWrapper
from torchtext._download_hooks import GDriveReader
from torchtext._internal.module_utils import is_module_available
from torchtext.data.datasets_utils import (
_clean_files,
Expand All @@ -9,10 +11,6 @@
_wrap_split_argument,
)

if is_module_available("torchdata"):
from torchdata.datapipes.iter import FileOpener, IterableWrapper
from torchtext._download_hooks import GDriveReader

URL = "https://drive.google.com/u/0/uc?id=12ycYSzLIG253AFN35Y6qoyf9wtkOjakp"
_PATH = "2017-01-trnmted.tgz"
MD5 = "aca701032b1c4411afc4d9fa367796ba"
Expand Down
12 changes: 5 additions & 7 deletions torchtext/datasets/mnli.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,19 +3,17 @@
import os
from functools import partial

from torchdata.datapipes.iter import FileOpener, IterableWrapper

# we import HttpReader from _download_hooks so we can swap out public URLs
# with internal URLs when the dataset is used within Facebook
from torchtext._download_hooks import HttpReader
from torchtext._internal.module_utils import is_module_available
from torchtext.data.datasets_utils import (
_create_dataset_directory,
_wrap_split_argument,
)

if is_module_available("torchdata"):
from torchdata.datapipes.iter import FileOpener, IterableWrapper

# we import HttpReader from _download_hooks so we can swap out public URLs
# with internal URLs when the dataset is used within Facebook
from torchtext._download_hooks import HttpReader


URL = "https://cims.nyu.edu/~sbowman/multinli/multinli_1.0.zip"

Expand Down
4 changes: 1 addition & 3 deletions torchtext/datasets/mrpc.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,15 +3,13 @@
from functools import partial
from typing import Union, Tuple

from torchdata.datapipes.iter import FileOpener, HttpReader, IterableWrapper
from torchtext._internal.module_utils import is_module_available
from torchtext.data.datasets_utils import (
_wrap_split_argument,
_create_dataset_directory,
)

if is_module_available("torchdata"):
from torchdata.datapipes.iter import FileOpener, HttpReader, IterableWrapper


URL = {
"train": "https://dl.fbaipublicfiles.com/senteval/senteval_data/msr_paraphrase_train.txt",
Expand Down
7 changes: 3 additions & 4 deletions torchtext/datasets/multi30k.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,16 +2,15 @@
from functools import partial
from typing import Union, Tuple

from torchdata.datapipes.iter import FileOpener, IterableWrapper
from torchtext._download_hooks import GDriveReader # noqa
from torchtext._download_hooks import HttpReader
from torchtext._internal.module_utils import is_module_available
from torchtext.data.datasets_utils import (
_wrap_split_argument,
_create_dataset_directory,
)

if is_module_available("torchdata"):
from torchdata.datapipes.iter import FileOpener, IterableWrapper
from torchtext._download_hooks import HttpReader

URL = {
"train": "http://www.quest.dcs.shef.ac.uk/wmt16_files_mmt/training.tar.gz",
"valid": "http://www.quest.dcs.shef.ac.uk/wmt16_files_mmt/validation.tar.gz",
Expand Down
7 changes: 3 additions & 4 deletions torchtext/datasets/penntreebank.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,16 +2,15 @@
from functools import partial
from typing import Tuple, Union

from torchdata.datapipes.iter import FileOpener, IterableWrapper
from torchtext._download_hooks import GDriveReader # noqa
from torchtext._download_hooks import HttpReader
from torchtext._internal.module_utils import is_module_available
from torchtext.data.datasets_utils import (
_wrap_split_argument,
_create_dataset_directory,
)

if is_module_available("torchdata"):
from torchdata.datapipes.iter import FileOpener, IterableWrapper
from torchtext._download_hooks import HttpReader

URL = {
"train": "https://raw.githubusercontent.com/wojzaremba/lstm/master/data/ptb.train.txt",
"test": "https://raw.githubusercontent.com/wojzaremba/lstm/master/data/ptb.test.txt",
Expand Down
12 changes: 5 additions & 7 deletions torchtext/datasets/qnli.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,19 +3,17 @@
import os
from functools import partial

from torchdata.datapipes.iter import FileOpener, IterableWrapper

# we import HttpReader from _download_hooks so we can swap out public URLs
# with internal URLs when the dataset is used within Facebook
from torchtext._download_hooks import HttpReader
from torchtext._internal.module_utils import is_module_available
from torchtext.data.datasets_utils import (
_create_dataset_directory,
_wrap_split_argument,
)

if is_module_available("torchdata"):
from torchdata.datapipes.iter import FileOpener, IterableWrapper

# we import HttpReader from _download_hooks so we can swap out public URLs
# with internal URLs when the dataset is used within Facebook
from torchtext._download_hooks import HttpReader


URL = "https://dl.fbaipublicfiles.com/glue/data/QNLIv2.zip"

Expand Down
6 changes: 2 additions & 4 deletions torchtext/datasets/qqp.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,11 @@
import os
from functools import partial

from torchdata.datapipes.iter import FileOpener, IterableWrapper
from torchtext._download_hooks import HttpReader
from torchtext._internal.module_utils import is_module_available
from torchtext.data.datasets_utils import _create_dataset_directory

if is_module_available("torchdata"):
from torchdata.datapipes.iter import FileOpener, IterableWrapper
from torchtext._download_hooks import HttpReader

URL = "http://qim.fs.quoracdn.net/quora_duplicate_questions.tsv"

MD5 = "b6d5672bd9dc1e66ab2bb020ebeafb8d"
Expand Down
Loading

0 comments on commit 771b5a2

Please sign in to comment.