Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions test/torchaudio_unittest/datasets/utils_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,15 +54,15 @@ class TestIterator(TorchaudioTestCase):
path = get_asset_path()

def test_disckcache_iterator(self):
data = COMMONVOICE(self.path, url="tatar")
data = COMMONVOICE(self.path, version="cv-corpus-4-2019-12-10", language="tatar")
data = dataset_utils.diskcache_iterator(data)
# Save
data[0]
# Load
data[0]

def test_bg_iterator(self):
data = COMMONVOICE(self.path, url="tatar")
data = COMMONVOICE(self.path, version="cv-corpus-4-2019-12-10", language="tatar")
data = dataset_utils.bg_iterator(data, 5)
for _ in data:
pass
177 changes: 95 additions & 82 deletions torchaudio/datasets/commonvoice.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
import os
from typing import List, Dict, Tuple
import warnings
from typing import List, Dict, Tuple, Optional

import torchaudio
from torchaudio.datasets.utils import download_url, extract_archive, unicode_csv_reader
from torchaudio.datasets.utils import extract_archive, unicode_csv_reader, validate_file
from torch import Tensor
from torch.utils.data import Dataset

Expand All @@ -15,68 +16,39 @@
# validated.tsv

FOLDER_IN_ARCHIVE = "CommonVoice"
URL = "english"
VERSION = "cv-corpus-4-2019-12-10"
LANGUAGE = "english"
VERSION = "cv-corpus-5.1-2020-06-22"
TSV = "train.tsv"
_CHECKSUMS = {
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/tt.tar.gz":
None,
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/en.tar.gz":
None,
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/de.tar.gz":
None,
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/fr.tar.gz":
None,
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/cy.tar.gz":
None,
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/br.tar.gz":
None,
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/cv.tar.gz":
None,
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/tr.tar.gz":
None,
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/ky.tar.gz":
None,
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/ga-IE.tar.gz":
None,
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/kab.tar.gz":
None,
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/ca.tar.gz":
None,
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/zh-TW.tar.gz":
None,
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/sl.tar.gz":
None,
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/it.tar.gz":
None,
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/nl.tar.gz":
None,
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/cnh.tar.gz":
None,
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/eo.tar.gz":
None,
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/et.tar.gz":
None,
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/fa.tar.gz":
None,
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/eu.tar.gz":
None,
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/es.tar.gz":
None,
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/zh-CN.tar.gz":
None,
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/mn.tar.gz":
None,
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/sah.tar.gz":
None,
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/dv.tar.gz":
None,
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/rw.tar.gz":
None,
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/sv-SE.tar.gz":
None,
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/ru.tar.gz":
None
"cv-corpus-5.1-2020-06-22/tt.tar.gz": None,
"cv-corpus-5.1-2020-06-22/en.tar.gz": None,
"cv-corpus-5.1-2020-06-22/de.tar.gz": None,
"cv-corpus-5.1-2020-06-22/fr.tar.gz": None,
"cv-corpus-5.1-2020-06-22/cy.tar.gz": None,
"cv-corpus-5.1-2020-06-22/br.tar.gz": None,
"cv-corpus-5.1-2020-06-22/cv.tar.gz": None,
"cv-corpus-5.1-2020-06-22/tr.tar.gz": None,
"cv-corpus-5.1-2020-06-22/ky.tar.gz": None,
"cv-corpus-5.1-2020-06-22/ga-IE.tar.gz": None,
"cv-corpus-5.1-2020-06-22/kab.tar.gz": None,
"cv-corpus-5.1-2020-06-22/ca.tar.gz": None,
"cv-corpus-5.1-2020-06-22/zh-TW.tar.gz": None,
"cv-corpus-5.1-2020-06-22/sl.tar.gz": None,
"cv-corpus-5.1-2020-06-22/it.tar.gz": None,
"cv-corpus-5.1-2020-06-22/nl.tar.gz": None,
"cv-corpus-5.1-2020-06-22/cnh.tar.gz": None,
"cv-corpus-5.1-2020-06-22/eo.tar.gz": None,
"cv-corpus-5.1-2020-06-22/et.tar.gz": None,
"cv-corpus-5.1-2020-06-22/fa.tar.gz": None,
"cv-corpus-5.1-2020-06-22/eu.tar.gz": None,
"cv-corpus-5.1-2020-06-22/es.tar.gz": None,
"cv-corpus-5.1-2020-06-22/zh-CN.tar.gz": None,
"cv-corpus-5.1-2020-06-22/mn.tar.gz": None,
"cv-corpus-5.1-2020-06-22/sah.tar.gz": None,
"cv-corpus-5.1-2020-06-22/dv.tar.gz": None,
"cv-corpus-5.1-2020-06-22/rw.tar.gz": None,
"cv-corpus-5.1-2020-06-22/sv-SE.tar.gz": None,
"cv-corpus-5.1-2020-06-22/ru.tar.gz": None,
}


Expand All @@ -100,15 +72,18 @@ def load_commonvoice_item(line: List[str],


class COMMONVOICE(Dataset):
"""Create a Dataset for CommonVoice.
"""Create a Dataset for `CommonVoice <https://commonvoice.mozilla.org/>`_.

Args:
root (str): Path to the directory where the dataset is found or downloaded.
tsv (str, optional): The name of the tsv file used to construct the metadata.
(default: ``"train.tsv"``)
url (str, optional): The URL to download the dataset from, or the language of
the dataset to download. (default: ``"english"``).
Allowed language values are ``"tatar"``, ``"english"``, ``"german"``,
url (str, optional): Deprecated.
folder_in_archive (str, optional): The top-level directory of the dataset.
version (str): Version string. (default: ``"cv-corpus-5.1-2020-06-22"``)
language (str, optional): Language of the dataset. (default: ``"english"``)
The following values are mapped to their corresponding shortened version:
``"tatar"``, ``"english"``, ``"german"``,
``"french"``, ``"welsh"``, ``"breton"``, ``"chuvash"``, ``"turkish"``, ``"kyrgyz"``,
``"irish"``, ``"kabyle"``, ``"catalan"``, ``"taiwanese"``, ``"slovenian"``,
``"italian"``, ``"dutch"``, ``"hakha chin"``, ``"esperanto"``, ``"estonian"``,
Expand All @@ -117,11 +92,8 @@ class COMMONVOICE(Dataset):
``"russian"``, ``"indonesian"``, ``"arabic"``, ``"tamil"``, ``"interlingua"``,
``"latvian"``, ``"japanese"``, ``"votic"``, ``"abkhaz"``, ``"cantonese"`` and
``"romansh sursilvan"``.
folder_in_archive (str, optional): The top-level directory of the dataset.
version (str): Version string. (default: ``"cv-corpus-4-2019-12-10"``)
For the other allowed values, please check out https://commonvoice.mozilla.org/en/datasets.
download (bool, optional):
Whether to download the dataset if it is not found at root path. (default: ``False``).
download (bool, optional): Deprecated.
"""

_ext_txt = ".txt"
Expand All @@ -131,10 +103,30 @@ class COMMONVOICE(Dataset):
def __init__(self,
root: str,
tsv: str = TSV,
url: str = URL,
url: Optional[str] = None,
folder_in_archive: str = FOLDER_IN_ARCHIVE,
version: str = VERSION,
download: bool = False) -> None:
language: str = LANGUAGE,
download: Optional[bool] = False) -> None:

if download is True:
raise RuntimeError(
"The dataset is no longer publicly accessible. You need to "
"download the archives externally and place them in the root "
"directory."
)
elif download is False:
warnings.warn(
"The use of the download flag is deprecated, since the dataset "
"is no longer directly accessible.", RuntimeWarning
)

if url is not None:
warnings.warn(
"The use of the url flag is deprecated, since the dataset "
"is no longer publicly accessible. To specify the language of the dataset, "
"please use the language parameter instead.", RuntimeWarning
)

languages = {
"tatar": "tt",
Expand Down Expand Up @@ -179,12 +171,22 @@ def __init__(self,
"romansh sursilvan": "rm-sursilv"
}

if url in languages:
if language in languages:
ext_archive = ".tar.gz"
language = languages[url]

base_url = "https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com"
url = os.path.join(base_url, version, language + ext_archive)
language = languages[language]
url = os.path.join(version, language + ext_archive)
else:
raise ValueError(
'Allowed language values are ``"tatar"``, ``"english"``, ``"german"``,'
'``"french"``, ``"welsh"``, ``"breton"``, ``"chuvash"``, ``"turkish"``, ``"kyrgyz"``,'
'``"irish"``, ``"kabyle"``, ``"catalan"``, ``"taiwanese"``, ``"slovenian"``,'
'``"italian"``, ``"dutch"``, ``"hakha chin"``, ``"esperanto"``, ``"estonian"``,'
'``"persian"``, ``"portuguese"``, ``"basque"``, ``"spanish"``, ``"chinese"``,'
'``"mongolian"``, ``"sakha"``, ``"dhivehi"``, ``"kinyarwanda"``, ``"swedish"``,'
'``"russian"``, ``"indonesian"``, ``"arabic"``, ``"tamil"``, ``"interlingua"``,'
'``"latvian"``, ``"japanese"``, ``"votic"``, ``"abkhaz"``, ``"cantonese"`` and'
'``"romansh sursilvan"``.'
)

basename = os.path.basename(url)
archive = os.path.join(root, basename)
Expand All @@ -194,12 +196,23 @@ def __init__(self,

self._path = os.path.join(root, folder_in_archive)

if download:
if not os.path.isdir(self._path):
if not os.path.isfile(archive):
checksum = _CHECKSUMS.get(url, None)
download_url(url, root, hash_value=checksum)
if not os.path.isdir(self._path):
if os.path.isfile(archive):
checksum = _CHECKSUMS.get(url, None)
if checksum:
filepath = os.path.basename(url)
with open(filepath, "rb") as file_obj:
if not validate_file(file_obj, checksum, "sha256"):
raise RuntimeError(
f"The hash of {filepath} does not match. Delete the file manually and retry."
)
extract_archive(archive)
else:
raise RuntimeError(
"The dataset is no longer publicly accessible. You need to "
"download the archives externally and place them in the root "
"directory."
)

self._tsv = os.path.join(root, folder_in_archive, tsv)

Expand Down