diff --git a/torchaudio/datasets/commonvoice.py b/torchaudio/datasets/commonvoice.py index 963777cf38..8da2a16490 100644 --- a/torchaudio/datasets/commonvoice.py +++ b/torchaudio/datasets/commonvoice.py @@ -1,9 +1,10 @@ import os +import warnings from pathlib import Path -from typing import List, Dict, Tuple, Union +from typing import List, Dict, Tuple, Union, Optional import torchaudio -from torchaudio.datasets.utils import download_url, extract_archive, unicode_csv_reader +from torchaudio.datasets.utils import unicode_csv_reader from torch import Tensor from torch.utils.data import Dataset @@ -16,68 +17,50 @@ # validated.tsv FOLDER_IN_ARCHIVE = "CommonVoice" -URL = "english" VERSION = "cv-corpus-4-2019-12-10" TSV = "train.tsv" -_CHECKSUMS = { - "https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/tt.tar.gz": - None, - "https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/en.tar.gz": - None, - "https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/de.tar.gz": - None, - "https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/fr.tar.gz": - None, - "https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/cy.tar.gz": - None, - "https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/br.tar.gz": - None, - "https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/cv.tar.gz": - None, - "https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/tr.tar.gz": - None, - "https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/ky.tar.gz": - None, - "https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/ga-IE.tar.gz": - None, - 
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/kab.tar.gz": - None, - "https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/ca.tar.gz": - None, - "https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/zh-TW.tar.gz": - None, - "https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/sl.tar.gz": - None, - "https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/it.tar.gz": - None, - "https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/nl.tar.gz": - None, - "https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/cnh.tar.gz": - None, - "https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/eo.tar.gz": - None, - "https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/et.tar.gz": - None, - "https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/fa.tar.gz": - None, - "https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/eu.tar.gz": - None, - "https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/es.tar.gz": - None, - "https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/zh-CN.tar.gz": - None, - "https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/mn.tar.gz": - None, - "https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/sah.tar.gz": - None, - "https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/dv.tar.gz": - None, - 
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/rw.tar.gz": - None, - "https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/sv-SE.tar.gz": - None, - "https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/ru.tar.gz": - None + +_LANG_CODE = { + "tatar": "tt", + "english": "en", + "german": "de", + "french": "fr", + "welsh": "cy", + "breton": "br", + "chuvash": "cv", + "turkish": "tr", + "kyrgyz": "ky", + "irish": "ga-IE", + "kabyle": "kab", + "catalan": "ca", + "taiwanese": "zh-TW", + "slovenian": "sl", + "italian": "it", + "dutch": "nl", + "hakha chin": "cnh", + "esperanto": "eo", + "estonian": "et", + "persian": "fa", + "portuguese": "pt", + "basque": "eu", + "spanish": "es", + "chinese": "zh-CN", + "mongolian": "mn", + "sakha": "sah", + "dhivehi": "dv", + "kinyarwanda": "rw", + "swedish": "sv-SE", + "russian": "ru", + "indonesian": "id", + "arabic": "ar", + "tamil": "ta", + "interlingua": "ia", + "latvian": "lv", + "japanese": "ja", + "votic": "vot", + "abkhaz": "ab", + "cantonese": "zh-HK", + "romansh sursilvan": "rm-sursilv" } @@ -107,8 +90,15 @@ class COMMONVOICE(Dataset): root (str or Path): Path to the directory where the dataset is found or downloaded. tsv (str, optional): The name of the tsv file used to construct the metadata. (default: ``"train.tsv"``) - url (str, optional): The URL to download the dataset from, or the language of - the dataset to download. (default: ``"english"``). + url (str, optional): Language of dataset. Deprecated. Please use ``language``. + folder_in_archive (str, optional): The top-level directory of the dataset. + version (str): Version string. (default: ``"cv-corpus-4-2019-12-10"``) + For the other allowed values, Please checkout https://commonvoice.mozilla.org/en/datasets. + download (bool, optional): + Deprecated. 
CommonVoice requires user agreement on the usage terms and torchaudio no longer + provides download functionality. Providing ``True`` results in an error. + language (str, optional): + The language of the dataset to load. (default: ``"english"``). Allowed language values are ``"tatar"``, ``"english"``, ``"german"``, ``"french"``, ``"welsh"``, ``"breton"``, ``"chuvash"``, ``"turkish"``, ``"kyrgyz"``, ``"irish"``, ``"kabyle"``, ``"catalan"``, ``"taiwanese"``, ``"slovenian"``, @@ -118,11 +108,6 @@ class COMMONVOICE(Dataset): ``"russian"``, ``"indonesian"``, ``"arabic"``, ``"tamil"``, ``"interlingua"``, ``"latvian"``, ``"japanese"``, ``"votic"``, ``"abkhaz"``, ``"cantonese"`` and ``"romansh sursilvan"``. - folder_in_archive (str, optional): The top-level directory of the dataset. - version (str): Version string. (default: ``"cv-corpus-4-2019-12-10"``) - For the other allowed values, Please checkout https://commonvoice.mozilla.org/en/datasets. - download (bool, optional): - Whether to download the dataset if it is not found at root path. (default: ``False``). 
""" _ext_txt = ".txt" @@ -132,79 +117,46 @@ class COMMONVOICE(Dataset): def __init__(self, root: Union[str, Path], tsv: str = TSV, - url: str = URL, + url: Optional[str] = None, folder_in_archive: str = FOLDER_IN_ARCHIVE, version: str = VERSION, - download: bool = False) -> None: - - languages = { - "tatar": "tt", - "english": "en", - "german": "de", - "french": "fr", - "welsh": "cy", - "breton": "br", - "chuvash": "cv", - "turkish": "tr", - "kyrgyz": "ky", - "irish": "ga-IE", - "kabyle": "kab", - "catalan": "ca", - "taiwanese": "zh-TW", - "slovenian": "sl", - "italian": "it", - "dutch": "nl", - "hakha chin": "cnh", - "esperanto": "eo", - "estonian": "et", - "persian": "fa", - "portuguese": "pt", - "basque": "eu", - "spanish": "es", - "chinese": "zh-CN", - "mongolian": "mn", - "sakha": "sah", - "dhivehi": "dv", - "kinyarwanda": "rw", - "swedish": "sv-SE", - "russian": "ru", - "indonesian": "id", - "arabic": "ar", - "tamil": "ta", - "interlingua": "ia", - "latvian": "lv", - "japanese": "ja", - "votic": "vot", - "abkhaz": "ab", - "cantonese": "zh-HK", - "romansh sursilvan": "rm-sursilv" - } - - if url in languages: - ext_archive = ".tar.gz" - language = languages[url] - - base_url = "https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com" - url = os.path.join(base_url, version, language + ext_archive) + download: Optional[bool] = None, + language: Optional[str] = None) -> None: + if download: + raise RuntimeError( + "Common Voice dataset requires user agreement on the usage term, " + "and torchaudio no longer provides the download feature. " + "Please download the dataset manually and extract it in the root directory, " + "then provide the target language to `url` argument.") + if download is not None: # download = False, which has no impact on functionality + warnings.warn( + "`download` argument is deprecated and will be removed in 0.9.0. 
" + "Please remove the argument.") + if url is not None and language is not None: + raise ValueError( + "`url` and `language` arguments can not be provided at the same time. " + "Please use `language`." + ) + if url is not None: + warnings.warn( + "`url` argument is deprecated and will be removed in 0.9.0." + "Please use `language`.") + if url not in _LANG_CODE: + raise ValueError(f"`url` must be one of available languages: {_LANG_CODE.keys()}") + language = url + else: + language = language or 'english' + if language not in _LANG_CODE: + raise ValueError( + f"`language` must be one of available languages: {_LANG_CODE.keys()}") # Get string representation of 'root' in case Path object is passed root = os.fspath(root) - basename = os.path.basename(url) - archive = os.path.join(root, basename) - - basename = basename.rsplit(".", 2)[0] - folder_in_archive = os.path.join(folder_in_archive, version, basename) + lang_code = _LANG_CODE[language] + folder_in_archive = os.path.join(folder_in_archive, version, lang_code) self._path = os.path.join(root, folder_in_archive) - - if download: - if not os.path.isdir(self._path): - if not os.path.isfile(archive): - checksum = _CHECKSUMS.get(url, None) - download_url(url, root, hash_value=checksum) - extract_archive(archive) - self._tsv = os.path.join(root, folder_in_archive, tsv) with open(self._tsv, "r") as tsv: