diff --git a/test/torchaudio_unittest/datasets/commonvoice_test.py b/test/torchaudio_unittest/datasets/commonvoice_test.py
index 731c4e07cc..5aa7c427df 100644
--- a/test/torchaudio_unittest/datasets/commonvoice_test.py
+++ b/test/torchaudio_unittest/datasets/commonvoice_test.py
@@ -1,6 +1,6 @@
 import os
 import csv
-import random
+import tarfile
 from pathlib import Path
 
 from torchaudio.datasets import commonvoice
@@ -12,64 +12,115 @@
     normalize_wav,
 )
 
+_HEADERS = [
+    "client_ids",
+    "path",
+    "sentence",
+    "up_votes",
+    "down_votes",
+    "age",
+    "gender",
+    "accent",
+]
+
+# Note: extension is changed to wav for the sake of the test
+# Note: the first entry is missing values for `age`, `gender` and `accent`, as in the original data.
+_TRAIN_CSV_CONTENTS = [
+    [
+        "9d16c5d980247861130e0480e2719f448be73d86a496c36d01a477cbdecd8cfd1399403d7a77bf458d211a70711b2da0845c",
+        "common_voice_en_18885784.wav",
+        "He was accorded a State funeral, and was buried in Drayton and Toowoomba Cemetery.",
+        "2",
+        "0",
+        "",
+        "",
+        ""
+    ],
+    [
+        "c82eb9291328620f06025a1f8112b909099e447e485e99236cb87df008650250e79fea5ca772061fb6a370830847b9c44d20",
+        "common_voice_en_556542.wav",
+        "Once more into the breach",
+        "2",
+        "0",
+        "thirties",
+        "male",
+        "us",
+    ],
+    [
+        "f74d880c5ad4c5917f314a604d3fc4805159d255796fb9f8defca35333ecc002bdf53dc463503c12674ea840b21b4a507b7c",
+        "common_voice_en_18607573.wav",
+        "Caddy, show Miss Clare and Miss Summerson their rooms.",
+        "2",
+        "0",
+        "twenties",
+        "male",
+        "canada",
+    ],
+]
+
+
+def _make_dataset(root_dir, sample_rate=48000):
+    # The path convention commonvoice uses
+    base_dir = os.path.join(root_dir, "CommonVoice", "cv-corpus-4-2019-12-10", "en")
+    audio_dir = os.path.join(base_dir, "clips")
+    tsv_path = os.path.join(base_dir, "train.tsv")
+
+    os.makedirs(base_dir, exist_ok=True)
+    os.makedirs(audio_dir, exist_ok=True)
+
+    # A different tsv file name does not mean a different subset; the dataset is tested as a whole here
+    print(tsv_path)
+    with open(tsv_path, "w", newline='') as tsv:
+        writer = csv.writer(tsv, delimiter='\t')
+        writer.writerow(_HEADERS)
+        for content in _TRAIN_CSV_CONTENTS:
+            writer.writerow(content)
+
+    # Generate audio files
+    expected = []
+    for i, content in enumerate(_TRAIN_CSV_CONTENTS):
+        audio_path = os.path.join(audio_dir, content[1])
+        data = get_whitenoise(
+            sample_rate=sample_rate, duration=1, n_channels=1, seed=i, dtype='float32')
+        save_wav(audio_path, data, sample_rate)
+        print(audio_path)
+        expected.append((normalize_wav(data), sample_rate, dict(zip(_HEADERS, content))))
+    return expected
+
+
+def _make_tarfile(output_filename, source_dir):
+    with tarfile.open(output_filename, "w:gz") as tar:
+        tar.add(source_dir, arcname=os.path.basename(source_dir))
+
 
 class TestCommonVoice(TempDirMixin, TorchaudioTestCase):
     backend = 'default'
-    root_dir = None
-    data = []
-    _headers = [u"client_ids", u"path", u"sentence", u"up_votes", u"down_votes", u"age", u"gender", u"accent"]
-    # Note: extension is changed to wav for the sake of test
-    # Note: the first content is missing values for `age`, `gender` and `accent` as in the original data.
-    _train_csv_contents = [
-        ["9d16c5d980247861130e0480e2719f448be73d86a496c36d01a477cbdecd8cfd1399403d7a77bf458d211a70711b2da0845c",
-         "common_voice_en_18885784.wav",
-         "He was accorded a State funeral, and was buried in Drayton and Toowoomba Cemetery.", "2", "0", "", "", ""],
-        ["c82eb9291328620f06025a1f8112b909099e447e485e99236cb87df008650250e79fea5ca772061fb6a370830847b9c44d20",
-         "common_voice_en_556542.wav", "Once more into the breach", "2", "0", "thirties", "male", "us"],
-        ["f74d880c5ad4c5917f314a604d3fc4805159d255796fb9f8defca35333ecc002bdf53dc463503c12674ea840b21b4a507b7c",
-         "common_voice_en_18607573.wav",
-         "Caddy, show Miss Clare and Miss Summerson their rooms.", "2", "0", "twenties", "male", "canada"],
-    ]
-    _folder_audio = "clips"
-    sample_rate = 48000
+    root_dir = ""
+    expected = []
 
     @classmethod
     def setUpClass(cls):
-        cls.root_dir = cls.get_base_temp_dir()
-        # The path convention commonvoice uses
-        base_dir = os.path.join(cls.root_dir, commonvoice.FOLDER_IN_ARCHIVE, commonvoice.VERSION, "en")
-        os.makedirs(base_dir, exist_ok=True)
-
-        # Tsv file name difference does not mean different subset, testing as a whole dataset here
-        tsv_filename = os.path.join(base_dir, commonvoice.TSV)
-        with open(tsv_filename, "w", newline='') as tsv:
-            writer = csv.writer(tsv, delimiter='\t')
-            writer.writerow(cls._headers)
-            for i, content in enumerate(cls._train_csv_contents):
-                audio_filename = audio_filename = content[1]
-                writer.writerow(content)
-
-                # Generate and store audio
-                audio_base_path = os.path.join(base_dir, cls._folder_audio)
-                os.makedirs(audio_base_path, exist_ok=True)
-                audio_path = os.path.join(audio_base_path, audio_filename)
-                data = get_whitenoise(sample_rate=cls.sample_rate, duration=1, n_channels=1, seed=i, dtype='float32')
-                save_wav(audio_path, data, cls.sample_rate)
-
-                # Append data entry
-                cls.data.append((normalize_wav(data), cls.sample_rate, dict(zip(cls._headers, content))))
+        root_dir = cls.get_base_temp_dir()
+        tmp_dir = os.path.join(root_dir, 'tmp')
+        expected = _make_dataset(tmp_dir)
+        source_dir = os.path.join(tmp_dir, 'CommonVoice')
+        arch_path = os.path.join(root_dir, 'en.tar.gz')
+        _make_tarfile(arch_path, source_dir)
+
+        cls.root_dir = root_dir
+        cls.expected = expected
 
     def _test_commonvoice(self, dataset):
         n_ite = 0
         for i, (waveform, sample_rate, dictionary) in enumerate(dataset):
-            expected_dictionary = self.data[i][2]
-            expected_data = self.data[i][0]
+            expected_dictionary = self.expected[i][2]
+            expected_data = self.expected[i][0]
             self.assertEqual(expected_data, waveform, atol=5e-5, rtol=1e-8)
-            assert sample_rate == TestCommonVoice.sample_rate
+            assert sample_rate == 48000
             assert dictionary == expected_dictionary
             n_ite += 1
-        assert n_ite == len(self.data)
+        assert n_ite == len(self.expected)
 
     def test_commonvoice_str(self):
         dataset = commonvoice.COMMONVOICE(self.root_dir)
diff --git a/test/torchaudio_unittest/datasets/utils_test.py b/test/torchaudio_unittest/datasets/utils_test.py
index a9e13a1e38..f75ae2319c 100644
--- a/test/torchaudio_unittest/datasets/utils_test.py
+++ b/test/torchaudio_unittest/datasets/utils_test.py
@@ -54,7 +54,7 @@ class TestIterator(TorchaudioTestCase):
     path = get_asset_path()
 
     def test_disckcache_iterator(self):
-        data = COMMONVOICE(self.path, version="cv-corpus-4-2019-12-10", language="tatar")
+        data = COMMONVOICE(self.path, url="tatar")
         data = dataset_utils.diskcache_iterator(data)
         # Save data[0]
@@ -62,7 +62,7 @@ def test_disckcache_iterator(self):
         data[0]
 
     def test_bg_iterator(self):
-        data = COMMONVOICE(self.path, version="cv-corpus-4-2019-12-10", language="tatar")
+        data = COMMONVOICE(self.path, url="tatar")
         data = dataset_utils.bg_iterator(data, 5)
         for _ in data:
             pass
diff --git a/torchaudio/datasets/commonvoice.py b/torchaudio/datasets/commonvoice.py
index 5899e3a4ac..ffeb9b908d 100644
--- a/torchaudio/datasets/commonvoice.py
+++ b/torchaudio/datasets/commonvoice.py
@@ -1,10 +1,10 @@
 import os
 import warnings
 from pathlib import Path
-from typing import List, Dict, Tuple, Optional, Union
+from typing import List, Dict, Tuple, Union, Optional
 
 import torchaudio
-from torchaudio.datasets.utils import extract_archive, unicode_csv_reader, validate_file
+from torchaudio.datasets.utils import extract_archive, unicode_csv_reader
 from torch import Tensor
 from torch.utils.data import Dataset
 
@@ -17,39 +17,51 @@
 # validated.tsv
 
 FOLDER_IN_ARCHIVE = "CommonVoice"
-LANGUAGE = "english"
-VERSION = "cv-corpus-5.1-2020-06-22"
+URL = "english"
+VERSION = "cv-corpus-4-2019-12-10"
 TSV = "train.tsv"
-_CHECKSUMS = {
-    "cv-corpus-5.1-2020-06-22/tt.tar.gz": None,
-    "cv-corpus-5.1-2020-06-22/en.tar.gz": None,
-    "cv-corpus-5.1-2020-06-22/de.tar.gz": None,
-    "cv-corpus-5.1-2020-06-22/fr.tar.gz": None,
-    "cv-corpus-5.1-2020-06-22/cy.tar.gz": None,
-    "cv-corpus-5.1-2020-06-22/br.tar.gz": None,
-    "cv-corpus-5.1-2020-06-22/cv.tar.gz": None,
-    "cv-corpus-5.1-2020-06-22/tr.tar.gz": None,
-    "cv-corpus-5.1-2020-06-22/ky.tar.gz": None,
-    "cv-corpus-5.1-2020-06-22/ga-IE.tar.gz": None,
-    "cv-corpus-5.1-2020-06-22/kab.tar.gz": None,
-    "cv-corpus-5.1-2020-06-22/ca.tar.gz": None,
-    "cv-corpus-5.1-2020-06-22/zh-TW.tar.gz": None,
-    "cv-corpus-5.1-2020-06-22/sl.tar.gz": None,
-    "cv-corpus-5.1-2020-06-22/it.tar.gz": None,
-    "cv-corpus-5.1-2020-06-22/nl.tar.gz": None,
-    "cv-corpus-5.1-2020-06-22/cnh.tar.gz": None,
-    "cv-corpus-5.1-2020-06-22/eo.tar.gz": None,
-    "cv-corpus-5.1-2020-06-22/et.tar.gz": None,
-    "cv-corpus-5.1-2020-06-22/fa.tar.gz": None,
-    "cv-corpus-5.1-2020-06-22/eu.tar.gz": None,
-    "cv-corpus-5.1-2020-06-22/es.tar.gz": None,
-    "cv-corpus-5.1-2020-06-22/zh-CN.tar.gz": None,
-    "cv-corpus-5.1-2020-06-22/mn.tar.gz": None,
-    "cv-corpus-5.1-2020-06-22/sah.tar.gz": None,
-    "cv-corpus-5.1-2020-06-22/dv.tar.gz": None,
-    "cv-corpus-5.1-2020-06-22/rw.tar.gz": None,
-    "cv-corpus-5.1-2020-06-22/sv-SE.tar.gz": None,
-    "cv-corpus-5.1-2020-06-22/ru.tar.gz": None,
+
+_LANG_CODE = {
+    "tatar": "tt",
+    "english": "en",
+    "german": "de",
+    "french": "fr",
+    "welsh": "cy",
+    "breton": "br",
+    "chuvash": "cv",
+    "turkish": "tr",
+    "kyrgyz": "ky",
+    "irish": "ga-IE",
+    "kabyle": "kab",
+    "catalan": "ca",
+    "taiwanese": "zh-TW",
+    "slovenian": "sl",
+    "italian": "it",
+    "dutch": "nl",
+    "hakha chin": "cnh",
+    "esperanto": "eo",
+    "estonian": "et",
+    "persian": "fa",
+    "portuguese": "pt",
+    "basque": "eu",
+    "spanish": "es",
+    "chinese": "zh-CN",
+    "mongolian": "mn",
+    "sakha": "sah",
+    "dhivehi": "dv",
+    "kinyarwanda": "rw",
+    "swedish": "sv-SE",
+    "russian": "ru",
+    "indonesian": "id",
+    "arabic": "ar",
+    "tamil": "ta",
+    "interlingua": "ia",
+    "latvian": "lv",
+    "japanese": "ja",
+    "votic": "vot",
+    "abkhaz": "ab",
+    "cantonese": "zh-HK",
+    "romansh sursilvan": "rm-sursilv"
 }
 
@@ -73,17 +85,22 @@ def load_commonvoice_item(line: List[str],
 
 
 class COMMONVOICE(Dataset):
-    """Create a Dataset for `CommonVoice <https://commonvoice.mozilla.org/>`_.
+    """Create a Dataset for CommonVoice.
 
     Args:
         root (str or Path): Path to the directory where the dataset is found or downloaded.
        tsv (str, optional): The name of the tsv file used to construct the metadata.
            (default: ``"train.tsv"``)
-        url (str, optional): Deprecated.
+        url (str, optional): The URL to download the dataset from, or the language of
+            the dataset to download. Refer to the ``language`` argument for the allowed
+            values. (default: ``"english"``).
         folder_in_archive (str, optional): The top-level directory of the dataset.
-        version (str): Version string. (default: ``"cv-corpus-5.1-2020-06-22"``)
-        language (str, optional): Language of the dataset. (default: None)
-            The following values are mapped to their corresponding shortened version:
+        version (str): Version string. (default: ``"cv-corpus-4-2019-12-10"``)
+            For other allowed values, please check out https://commonvoice.mozilla.org/en/datasets.
+        download (bool, optional):
+            Whether to download the dataset if it is not found at the root path. (default: ``False``).
+        language (str, optional):
+            The language of the dataset.
             Allowed language values are ``"tatar"``, ``"english"``, ``"german"``,
             ``"french"``, ``"welsh"``, ``"breton"``, ``"chuvash"``, ``"turkish"``, ``"kyrgyz"``,
             ``"irish"``, ``"kabyle"``, ``"catalan"``, ``"taiwanese"``, ``"slovenian"``,
@@ -92,9 +109,7 @@ class COMMONVOICE(Dataset):
             ``"mongolian"``, ``"sakha"``, ``"dhivehi"``, ``"kinyarwanda"``, ``"swedish"``,
             ``"russian"``, ``"indonesian"``, ``"arabic"``, ``"tamil"``, ``"interlingua"``,
             ``"latvian"``, ``"japanese"``, ``"votic"``, ``"abkhaz"``, ``"cantonese"`` and
-            ``"romansh sursilvan"``.
-            For the other allowed values, Please checkout https://commonvoice.mozilla.org/en/datasets.
-        download (bool, optional): Deprecated.
+            ``"romansh sursilvan"``. (default: ``"english"``)
     """
 
     _ext_txt = ".txt"
@@ -107,116 +122,54 @@ def __init__(self,
                  url: Optional[str] = None,
                  folder_in_archive: str = FOLDER_IN_ARCHIVE,
                  version: str = VERSION,
-                 language: str = LANGUAGE,
-                 download: Optional[bool] = False) -> None:
+                 download: Optional[bool] = None,
+                 language: Optional[str] = None) -> None:
 
-        if download is True:
+        if download:
             raise RuntimeError(
-                "The dataset is no longer publicly accessible. You need to "
-                "download the archives externally and place them in the root "
-                "directory."
-            )
-        elif download is False:
+                "The Common Voice dataset requires user agreement to the usage terms, "
+                "and torchaudio no longer provides the download feature. "
+                "Please download the dataset manually and place it in the root directory, "
+                "then pass the target language to the `language` argument.")
+
+        if download is not None:  # User provided download=False
             warnings.warn(
-                "The use of the download flag is deprecated, since the dataset "
-                "is no longer directly accessible.", RuntimeWarning
+                "The `download` argument is deprecated and will be removed in the 0.9.0 release. "
+                "Please remove the argument.")
+
+        if url is not None and language is not None:
+            raise ValueError(
+                "The `url` and `language` arguments cannot both be provided at the same time. "
+                "Please use `language`."
             )
 
         if url is not None:
             warnings.warn(
-                "The use of the url flag is deprecated, since the dataset "
-                "is no longer publicly accessible. To specify the language of the dataset, "
-                "please use the language parameter instead.", RuntimeWarning
-            )
-
-            languages = {
-                "tatar": "tt",
-                "english": "en",
-                "german": "de",
-                "french": "fr",
-                "welsh": "cy",
-                "breton": "br",
-                "chuvash": "cv",
-                "turkish": "tr",
-                "kyrgyz": "ky",
-                "irish": "ga-IE",
-                "kabyle": "kab",
-                "catalan": "ca",
-                "taiwanese": "zh-TW",
-                "slovenian": "sl",
-                "italian": "it",
-                "dutch": "nl",
-                "hakha chin": "cnh",
-                "esperanto": "eo",
-                "estonian": "et",
-                "persian": "fa",
-                "portuguese": "pt",
-                "basque": "eu",
-                "spanish": "es",
-                "chinese": "zh-CN",
-                "mongolian": "mn",
-                "sakha": "sah",
-                "dhivehi": "dv",
-                "kinyarwanda": "rw",
-                "swedish": "sv-SE",
-                "russian": "ru",
-                "indonesian": "id",
-                "arabic": "ar",
-                "tamil": "ta",
-                "interlingua": "ia",
-                "latvian": "lv",
-                "japanese": "ja",
-                "votic": "vot",
-                "abkhaz": "ab",
-                "cantonese": "zh-HK",
-                "romansh sursilvan": "rm-sursilv"
-            }
-
-            if language in languages:
-                ext_archive = ".tar.gz"
-                language = languages[language]
-                url = os.path.join(version, language + ext_archive)
-            else:
-                raise ValueError(
-                    'Allowed language values are ``"tatar"``, ``"english"``, ``"german"``,'
-                    '``"french"``, ``"welsh"``, ``"breton"``, ``"chuvash"``, ``"turkish"``, ``"kyrgyz"``,'
-                    '``"irish"``, ``"kabyle"``, ``"catalan"``, ``"taiwanese"``, ``"slovenian"``,'
-                    '``"italian"``, ``"dutch"``, ``"hakha chin"``, ``"esperanto"``, ``"estonian"``,'
-                    '``"persian"``, ``"portuguese"``, ``"basque"``, ``"spanish"``, ``"chinese"``,'
-                    '``"mongolian"``, ``"sakha"``, ``"dhivehi"``, ``"kinyarwanda"``, ``"swedish"``,'
-                    '``"russian"``, ``"indonesian"``, ``"arabic"``, ``"tamil"``, ``"interlingua"``,'
-                    '``"latvian"``, ``"japanese"``, ``"votic"``, ``"abkhaz"``, ``"cantonese"`` and'
-                    '``"romansh sursilvan"``.'
+                "The `url` argument is deprecated and will be removed in the 0.9.0 release. "
+                "Please use `language`."
             )
+            if url not in _LANG_CODE:
+                raise ValueError(f"`url` must be one of the available languages: {_LANG_CODE.keys()}")
+            language = url
+        elif language is None:
+            language = 'english'
+        elif language not in _LANG_CODE:
+            raise ValueError(f"`language` must be one of the available languages: {_LANG_CODE.keys()}")
 
         # Get string representation of 'root' in case Path object is passed
         root = os.fspath(root)
 
-        basename = os.path.basename(url)
-        archive = os.path.join(root, basename)
-
-        basename = basename.rsplit(".", 2)[0]
-        folder_in_archive = os.path.join(folder_in_archive, version, basename)
+        lang_code = _LANG_CODE[language]
+        archive_name = f"{lang_code}.tar.gz"
+        archive = os.path.join(root, archive_name)
+        folder_in_archive = os.path.join(folder_in_archive, version, lang_code)
 
         self._path = os.path.join(root, folder_in_archive)
 
         if not os.path.isdir(self._path):
-            if os.path.isfile(archive):
-                checksum = _CHECKSUMS.get(url, None)
-                if checksum:
-                    filepath = os.path.basename(url)
-                    with open(filepath, "rb") as file_obj:
-                        if not validate_file(file_obj, checksum, "sha256"):
-                            raise RuntimeError(
-                                f"The hash of {filepath} does not match. Delete the file manually and retry."
-                            )
-                extract_archive(archive)
-            else:
-                raise RuntimeError(
-                    "The dataset is no longer publicly accessible. You need to "
-                    "download the archives externally and place them in the root "
-                    "directory."
-                )
+            if not os.path.isfile(archive):
+                raise RuntimeError(f"Archive `{archive_name}` not found in the root directory: {root}")
+            extract_archive(archive)
 
         self._tsv = os.path.join(root, folder_in_archive, tsv)
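For reference, a minimal usage sketch of the dataset API after this change. The root path `./data` and the variable names are illustrative; the archive `en.tar.gz` (or its already-extracted `CommonVoice/` tree) is assumed to have been downloaded manually into that directory:

    from torchaudio.datasets import COMMONVOICE

    # `language` selects the archive/folder to use, e.g. "en.tar.gz" under the root directory.
    dataset = COMMONVOICE("./data", language="english")
    # Each item is (waveform, sample_rate, metadata), where metadata is a dict keyed by the tsv headers.
    waveform, sample_rate, metadata = dataset[0]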