Skip to content
Closed
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
220 changes: 86 additions & 134 deletions torchaudio/datasets/commonvoice.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
import os
import warnings
from pathlib import Path
from typing import List, Dict, Tuple, Union
from typing import List, Dict, Tuple, Union, Optional

import torchaudio
from torchaudio.datasets.utils import download_url, extract_archive, unicode_csv_reader
from torchaudio.datasets.utils import unicode_csv_reader
from torch import Tensor
from torch.utils.data import Dataset

Expand All @@ -16,68 +17,50 @@
# validated.tsv

FOLDER_IN_ARCHIVE = "CommonVoice"
URL = "english"
VERSION = "cv-corpus-4-2019-12-10"
TSV = "train.tsv"
_CHECKSUMS = {
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/tt.tar.gz":
None,
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/en.tar.gz":
None,
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/de.tar.gz":
None,
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/fr.tar.gz":
None,
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/cy.tar.gz":
None,
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/br.tar.gz":
None,
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/cv.tar.gz":
None,
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/tr.tar.gz":
None,
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/ky.tar.gz":
None,
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/ga-IE.tar.gz":
None,
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/kab.tar.gz":
None,
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/ca.tar.gz":
None,
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/zh-TW.tar.gz":
None,
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/sl.tar.gz":
None,
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/it.tar.gz":
None,
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/nl.tar.gz":
None,
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/cnh.tar.gz":
None,
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/eo.tar.gz":
None,
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/et.tar.gz":
None,
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/fa.tar.gz":
None,
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/eu.tar.gz":
None,
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/es.tar.gz":
None,
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/zh-CN.tar.gz":
None,
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/mn.tar.gz":
None,
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/sah.tar.gz":
None,
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/dv.tar.gz":
None,
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/rw.tar.gz":
None,
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/sv-SE.tar.gz":
None,
"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/cv-corpus-3/ru.tar.gz":
None

_LANG_CODE = {
"tatar": "tt",
"english": "en",
"german": "de",
"french": "fr",
"welsh": "cy",
"breton": "br",
"chuvash": "cv",
"turkish": "tr",
"kyrgyz": "ky",
"irish": "ga-IE",
"kabyle": "kab",
"catalan": "ca",
"taiwanese": "zh-TW",
"slovenian": "sl",
"italian": "it",
"dutch": "nl",
"hakha chin": "cnh",
"esperanto": "eo",
"estonian": "et",
"persian": "fa",
"portuguese": "pt",
"basque": "eu",
"spanish": "es",
"chinese": "zh-CN",
"mongolian": "mn",
"sakha": "sah",
"dhivehi": "dv",
"kinyarwanda": "rw",
"swedish": "sv-SE",
"russian": "ru",
"indonesian": "id",
"arabic": "ar",
"tamil": "ta",
"interlingua": "ia",
"latvian": "lv",
"japanese": "ja",
"votic": "vot",
"abkhaz": "ab",
"cantonese": "zh-HK",
"romansh sursilvan": "rm-sursilv"
}


Expand Down Expand Up @@ -107,8 +90,15 @@ class COMMONVOICE(Dataset):
root (str or Path): Path to the directory where the dataset is found or downloaded.
tsv (str, optional): The name of the tsv file used to construct the metadata.
(default: ``"train.tsv"``)
url (str, optional): The URL to download the dataset from, or the language of
the dataset to download. (default: ``"english"``).
url (str, optional): Language of dataset. Deprecated. Please use ``language``.
folder_in_archive (str, optional): The top-level directory of the dataset.
version (str): Version string. (default: ``"cv-corpus-4-2019-12-10"``)
For the other allowed values, please check out https://commonvoice.mozilla.org/en/datasets.
download (bool, optional):
Deprecated. CommonVoice requires user agreement on the usage terms, so torchaudio no longer
provides download functionality. Providing ``True`` results in an error.
language (str, optional):
The language of the dataset. (default: ``"english"``)
Allowed language values are ``"tatar"``, ``"english"``, ``"german"``,
``"french"``, ``"welsh"``, ``"breton"``, ``"chuvash"``, ``"turkish"``, ``"kyrgyz"``,
``"irish"``, ``"kabyle"``, ``"catalan"``, ``"taiwanese"``, ``"slovenian"``,
Expand All @@ -118,11 +108,6 @@ class COMMONVOICE(Dataset):
``"russian"``, ``"indonesian"``, ``"arabic"``, ``"tamil"``, ``"interlingua"``,
``"latvian"``, ``"japanese"``, ``"votic"``, ``"abkhaz"``, ``"cantonese"`` and
``"romansh sursilvan"``.
folder_in_archive (str, optional): The top-level directory of the dataset.
version (str): Version string. (default: ``"cv-corpus-4-2019-12-10"``)
For the other allowed values, please check out https://commonvoice.mozilla.org/en/datasets.
download (bool, optional):
Whether to download the dataset if it is not found at root path. (default: ``False``).
"""

_ext_txt = ".txt"
Expand All @@ -132,79 +117,46 @@ class COMMONVOICE(Dataset):
def __init__(self,
root: Union[str, Path],
tsv: str = TSV,
url: str = URL,
url: Optional[str] = None,
folder_in_archive: str = FOLDER_IN_ARCHIVE,
version: str = VERSION,
download: bool = False) -> None:

languages = {
"tatar": "tt",
"english": "en",
"german": "de",
"french": "fr",
"welsh": "cy",
"breton": "br",
"chuvash": "cv",
"turkish": "tr",
"kyrgyz": "ky",
"irish": "ga-IE",
"kabyle": "kab",
"catalan": "ca",
"taiwanese": "zh-TW",
"slovenian": "sl",
"italian": "it",
"dutch": "nl",
"hakha chin": "cnh",
"esperanto": "eo",
"estonian": "et",
"persian": "fa",
"portuguese": "pt",
"basque": "eu",
"spanish": "es",
"chinese": "zh-CN",
"mongolian": "mn",
"sakha": "sah",
"dhivehi": "dv",
"kinyarwanda": "rw",
"swedish": "sv-SE",
"russian": "ru",
"indonesian": "id",
"arabic": "ar",
"tamil": "ta",
"interlingua": "ia",
"latvian": "lv",
"japanese": "ja",
"votic": "vot",
"abkhaz": "ab",
"cantonese": "zh-HK",
"romansh sursilvan": "rm-sursilv"
}

if url in languages:
ext_archive = ".tar.gz"
language = languages[url]

base_url = "https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com"
url = os.path.join(base_url, version, language + ext_archive)
download: Optional[bool] = None,
language: Optional[str] = None) -> None:
if download:
raise RuntimeError(
"Common Voice dataset requires user agreement on the usage term, "
"and torchaudio no longer provides the download feature. "
"Please download the dataset manually and extract it in the root directory, "
"then provide the target language to `url` argument.")
if download is not None: # download = False, which has no impact on functionality
warnings.warn(
"`download` argument is deprecated and will be removed in 0.9.0. "
"Please remove the argument.")
if url is not None and language is not None:
raise ValueError(
"`url` and `language` arguments can not be provided at the same time. "
"Please use `language`."
)
if url is not None:
warnings.warn(
"`url` argument is deprecated and will be removed in 0.9.0."
"Please use `language`.")
if url not in _LANG_CODE:
raise ValueError(f"`url` must be one of available languages: {_LANG_CODE.keys()}")
language = url
else:
language = language or 'english'
if language not in _LANG_CODE:
raise ValueError(
f"`language` must be one of available languages: {_LANG_CODE.keys()}")

# Get string representation of 'root' in case Path object is passed
root = os.fspath(root)

basename = os.path.basename(url)
archive = os.path.join(root, basename)

basename = basename.rsplit(".", 2)[0]
folder_in_archive = os.path.join(folder_in_archive, version, basename)
lang_code = _LANG_CODE[language]
folder_in_archive = os.path.join(folder_in_archive, version, lang_code)

self._path = os.path.join(root, folder_in_archive)

if download:
if not os.path.isdir(self._path):
if not os.path.isfile(archive):
checksum = _CHECKSUMS.get(url, None)
download_url(url, root, hash_value=checksum)
extract_archive(archive)

self._tsv = os.path.join(root, folder_in_archive, tsv)

with open(self._tsv, "r") as tsv:
Expand Down