From b695d9dabe091ffe32e66499c1558adbf9ab4236 Mon Sep 17 00:00:00 2001 From: Jaime Ferrando Date: Sat, 15 Aug 2020 13:18:40 +0200 Subject: [PATCH 01/16] Added tedlium support for 3 releases --- torchaudio/datasets/__init__.py | 4 +- torchaudio/datasets/tedlium.py | 133 ++++++++++++++++++++++++++++++++ 2 files changed, 136 insertions(+), 1 deletion(-) create mode 100644 torchaudio/datasets/tedlium.py diff --git a/torchaudio/datasets/__init__.py b/torchaudio/datasets/__init__.py index 187142db46..f53478f325 100644 --- a/torchaudio/datasets/__init__.py +++ b/torchaudio/datasets/__init__.py @@ -8,6 +8,7 @@ from .ljspeech import LJSPEECH from .cmuarctic import CMUARCTIC from .libritts import LIBRITTS +from .tedlium import TEDLIUM __all__ = ( "COMMONVOICE", @@ -18,7 +19,8 @@ "LJSPEECH", "GTZAN", "CMUARCTIC", - "LIBRITTS" + "LIBRITTS", "diskcache_iterator", "bg_iterator", + "TEDLIUM", ) diff --git a/torchaudio/datasets/tedlium.py b/torchaudio/datasets/tedlium.py new file mode 100644 index 0000000000..31a3bd14f6 --- /dev/null +++ b/torchaudio/datasets/tedlium.py @@ -0,0 +1,133 @@ +import os +from typing import Tuple + +import torchaudio +from torch import Tensor +from torch.utils.data import Dataset +from torchaudio.datasets.utils import ( + download_url, + extract_archive, + walk_files, +) + +RELEASE = "release1" # Default release + +_RELEASE_CONFIGS = { + "release1": { + "folder_in_archive": "TEDLIUM_release1", + "url": "http://www.openslr.org/resources/7/TEDLIUM_release1.tar.gz", + "checksum": "ffd31f96d81a21bf4928eaf9bb0b0c2dea7a5247", + "data_path": "", + "subset": "train", + }, + "release2": { + "folder_in_archive": "TEDLIUM_release2", + "url": "http://www.openslr.org/resources/19/TEDLIUM_release2.tar.gz", + "checksum": "5c8fb045246d1c64296f57b47aa7dc79d16b184f", + "data_path": "", + "subset": "train", + }, + "release3": { + "folder_in_archive": "TEDLIUM_release-3", + "url": "http://www.openslr.org/resources/51/TEDLIUM_release-3.tgz", + "checksum": "685d27c39c53217383d7933cc405a07048004127", + "data_path": "data/", + "subset": None, + }, +} + + +def load_tedlium_item( + fileid: str, line: int, path: str, ext_audio: str, ext_txt: str +) -> Tuple[Tensor, int, str, int, int, int]: + transcript_path = os.path.join(path, "stm/", fileid) + with open(transcript_path + ext_txt) as f: + transcript = f.readlines()[line] + talk_id, _, speaker_id, start_time, end_time, identifier, transcript = transcript.split( + " ", 6 + ) + + wave_path = os.path.join(path, "sph/", fileid) + waveform, sample_rate = torchaudio.load(wave_path + ext_audio) + print(wave_path + ext_audio) + # Calculate indexes for start time and endtime + start_time = int(float(start_time) * sample_rate) + end_time = int(float(end_time) * sample_rate) + print(start_time, end_time) + waveform = waveform[:, start_time:end_time] + return ( + waveform, + sample_rate, + transcript, + talk_id, + speaker_id, + identifier, + transcript, + ) + + +class TEDLIUM(Dataset): + """ + Create a Dataset for Tedlium. Each item is a tuple of the form: + waveform, sample_rate, utterance, speaker_id, chapter_id, utterance_id + """ + + _ext_txt = ".stm" + _ext_audio = ".sph" + _folder_audio = "sph/" + _folder_txt = "stm/" + + def __init__( + self, + root: str, + release: str = RELEASE, + subset: str = None, + folder_in_archive: str = _RELEASE_CONFIGS[RELEASE]["folder_in_archive"], + download: bool = False, + ) -> None: + + if release in _RELEASE_CONFIGS.keys(): + folder_in_archive = _RELEASE_CONFIGS[release]["folder_in_archive"] + url = _RELEASE_CONFIGS[release]["url"] + subset = subset if subset else _RELEASE_CONFIGS[release]["subset"] + else: + # Raise warning + raise RuntimeError( + "The release {} does not match any of the supported tedlium releases{} ".format( + filepath, _RELEASE_CONFIGS.keys(), + ) + ) + + basename = os.path.basename(url) + archive = os.path.join(root, basename) + + basename = basename.split(".")[0] + + self._path = os.path.join(root, folder_in_archive, _RELEASE_CONFIGS[release]["data_path"]) + if subset in ["train", "dev", "test"]: + self._path = os.path.join(self._path, subset + "/") + if download: + if not os.path.isdir(self._path): + if not os.path.isfile(archive): + checksum = _CHECKSUMS.get(url, None) + download_url(url, root, hash_value=checksum) + extract_archive(archive) + + walker = walk_files(self._path, suffix=self._ext_txt, prefix=False, remove_suffix=True) + self._walker = list(walker) + self._extended_walker = [] + for file in self._walker: + stm_path = os.path.join(self._path, self._folder_txt, file + self._ext_txt) + with open(stm_path) as f: + l = len(f.readlines()) + self._extended_walker += [(file, line) for line in range(l)] + + def __getitem__(self, n: int) -> Tuple[Tensor, int, str, int, int, int]: + fileid, line = self._extended_walker[n] + return load_tedlium_item(fileid, line, self._path, self._ext_audio, self._ext_txt) + + def __len__(self) -> int: + return len(self._extended_walker) + + def _getdict(self): + return 1 From d684ac7985c2ed42d8f1b420363b7aa32cbf37da Mon Sep 17 00:00:00 2001 From: Jaime Ferrando Date: Mon, 17 Aug 2020 12:38:49 +0200 Subject: [PATCH 02/16] Minor fixes from PR feedback and better formatting --- torchaudio/datasets/tedlium.py | 79 +++++++++++++--------------------- 1 file changed, 31 insertions(+), 48 deletions(-) diff --git a/torchaudio/datasets/tedlium.py b/torchaudio/datasets/tedlium.py index 31a3bd14f6..2f99e4c0da 100644 --- a/torchaudio/datasets/tedlium.py +++ b/torchaudio/datasets/tedlium.py @@ -37,35 +37,6 @@ } -def load_tedlium_item( - fileid: str, line: int, path: str, ext_audio: str, ext_txt: str -) -> Tuple[Tensor, int, str, int, int, int]: - transcript_path = os.path.join(path, "stm/", fileid) - with open(transcript_path + ext_txt) as f: - transcript = f.readlines()[line] - talk_id, _, speaker_id, start_time, end_time, identifier, transcript = transcript.split( - " ", 6 - ) - - wave_path = os.path.join(path, "sph/", fileid) - waveform, sample_rate = torchaudio.load(wave_path + ext_audio) - print(wave_path + ext_audio) - # Calculate indexes for start time and endtime - start_time = int(float(start_time) * sample_rate) - end_time = int(float(end_time) * sample_rate) - print(start_time, end_time) - waveform = waveform[:, start_time:end_time] - return ( - waveform, - sample_rate, - transcript, - talk_id, - speaker_id, - identifier, - transcript, - ) - - class TEDLIUM(Dataset): """ Create a Dataset for Tedlium. Each item is a tuple of the form: @@ -74,17 +45,8 @@ class TEDLIUM(Dataset): _ext_txt = ".stm" _ext_audio = ".sph" - _folder_audio = "sph/" - _folder_txt = "stm/" - - def __init__( - self, - root: str, - release: str = RELEASE, - subset: str = None, - folder_in_archive: str = _RELEASE_CONFIGS[RELEASE]["folder_in_archive"], - download: bool = False, - ) -> None: + + def __init__(self, root: str, release: str = RELEASE, subset: str = None, download: bool = False) -> None: if release in _RELEASE_CONFIGS.keys(): folder_in_archive = _RELEASE_CONFIGS[release]["folder_in_archive"] @@ -94,7 +56,7 @@ def __init__( # Raise warning raise RuntimeError( "The release {} does not match any of the supported tedlium releases{} ".format( - filepath, _RELEASE_CONFIGS.keys(), + release, _RELEASE_CONFIGS.keys(), ) ) @@ -105,11 +67,11 @@ def __init__( self._path = os.path.join(root, folder_in_archive, _RELEASE_CONFIGS[release]["data_path"]) if subset in ["train", "dev", "test"]: - self._path = os.path.join(self._path, subset + "/") + self._path = os.path.join(self._path, subset) if download: if not os.path.isdir(self._path): if not os.path.isfile(archive): - checksum = _CHECKSUMS.get(url, None) + checksum = _RELEASE_CONFIGS[release]["checksum"] download_url(url, root, hash_value=checksum) extract_archive(archive) @@ -117,17 +79,38 @@ def __init__( self._walker = list(walker) self._extended_walker = [] for file in self._walker: - stm_path = os.path.join(self._path, self._folder_txt, file + self._ext_txt) + stm_path = os.path.join(self._path, "stm", file + self._ext_txt) with open(stm_path) as f: l = len(f.readlines()) self._extended_walker += [(file, line) for line in range(l)] + def load_tedlium_item(self, fileid: str, line: int, path: str) -> Tuple[Tensor, int, str, int, int, int]: + transcript_path = os.path.join(path, "stm", fileid) + with open(transcript_path + self._ext_txt) as f: + transcript = f.readlines()[line] + talk_id, _, speaker_id, start_time, end_time, identifier, transcript = transcript.split(" ", 6) + + wave_path = os.path.join(path, "sph", fileid) + waveform, sample_rate = self.load_audio(wave_path + self._ext_audio) + # Calculate indexes for start time and endtime + start_time = int(float(start_time) * sample_rate) + end_time = int(float(end_time) * sample_rate) + waveform = waveform[:, start_time:end_time] + return ( + waveform, + sample_rate, + transcript, + talk_id, + speaker_id, + identifier, + ) + + def load_audio(self, path: str) -> [Tensor]: + return torchaudio.load(path) + def __getitem__(self, n: int) -> Tuple[Tensor, int, str, int, int, int]: fileid, line = self._extended_walker[n] - return load_tedlium_item(fileid, line, self._path, self._ext_audio, self._ext_txt) + return self.load_tedlium_item(fileid, line, self._path) def __len__(self) -> int: return len(self._extended_walker) - - def _getdict(self): - return 1 From e1b3256ae450f4309115a4594d22dcaf0e3eb72a Mon Sep 17 00:00:00 2001 From: Jaime Ferrando Date: Mon, 17 Aug 2020 12:39:56 +0200 Subject: [PATCH 03/16] Minor fixes from PR feedback and better formatting --- torchaudio/datasets/tedlium.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torchaudio/datasets/tedlium.py b/torchaudio/datasets/tedlium.py index 2f99e4c0da..2c10ff83d7 100644 --- a/torchaudio/datasets/tedlium.py +++ b/torchaudio/datasets/tedlium.py @@ -105,7 +105,7 @@ def load_tedlium_item(self, fileid: str, line: int, path: str) -> Tuple[Tensor, identifier, ) - def load_audio(self, path: str) -> [Tensor]: + def load_audio(self, path: str) -> Tensor: return torchaudio.load(path) def __getitem__(self, n: int) -> Tuple[Tensor, int, str, int, int, int]: From 3cde3ebbf99df328cc18bc13a46ba932583890a4 Mon Sep 17 00:00:00 2001 From: Jaime Ferrando Date: Mon, 17 Aug 2020 12:40:25 +0200 Subject: [PATCH 04/16] Minor fixes from PR feedback and better formatting --- torchaudio/datasets/tedlium.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torchaudio/datasets/tedlium.py b/torchaudio/datasets/tedlium.py index 2c10ff83d7..66f08acbef 100644 --- a/torchaudio/datasets/tedlium.py +++ b/torchaudio/datasets/tedlium.py @@ -105,7 +105,7 @@ def load_tedlium_item(self, fileid: str, line: int, path: str) -> Tuple[Tensor, identifier, ) - def load_audio(self, path: str) -> Tensor: + def load_audio(self, path: str) -> [Tensor, int]: return torchaudio.load(path) def __getitem__(self, n: int) -> Tuple[Tensor, int, str, int, int, int]: From 9655036b9eb901b5e95db851bf5686e6485812f2 Mon Sep 17 00:00:00 2001 From: Jaime Ferrando Date: Mon, 17 Aug 2020 16:21:10 +0200 Subject: [PATCH 05/16] Minor fixes from PR feedback and docstrings --- torchaudio/datasets/tedlium.py | 86 +++++++++++++++++++++++++--------- 1 file changed, 64 insertions(+), 22 deletions(-) diff --git a/torchaudio/datasets/tedlium.py b/torchaudio/datasets/tedlium.py index 66f08acbef..683e139f66 100644 --- a/torchaudio/datasets/tedlium.py +++ b/torchaudio/datasets/tedlium.py @@ -9,6 +9,8 @@ extract_archive, walk_files, ) +from collections import namedtuple + RELEASE = "release1" # Default release @@ -16,38 +18,53 @@ "release1": { "folder_in_archive": "TEDLIUM_release1", "url": "http://www.openslr.org/resources/7/TEDLIUM_release1.tar.gz", - "checksum": "ffd31f96d81a21bf4928eaf9bb0b0c2dea7a5247", + "checksum": "30301975fd8c5cac4040c261c0852f57cfa8adbbad2ce78e77e4986957445f27", "data_path": "", "subset": "train", }, "release2": { "folder_in_archive": "TEDLIUM_release2", "url": "http://www.openslr.org/resources/19/TEDLIUM_release2.tar.gz", - "checksum": "5c8fb045246d1c64296f57b47aa7dc79d16b184f", + "checksum": "93281b5fcaaae5c88671c9d000b443cb3c7ea3499ad12010b3934ca41a7b9c58", "data_path": "", "subset": "train", }, "release3": { "folder_in_archive": "TEDLIUM_release-3", "url": "http://www.openslr.org/resources/51/TEDLIUM_release-3.tgz", - "checksum": "685d27c39c53217383d7933cc405a07048004127", + "checksum": "ad1e454d14d1ad550bc2564c462d87c7a7ec83d4dc2b9210f22ab4973b9eccdb", "data_path": "data/", "subset": None, }, } +Tedlium_item = namedtuple( + "Tedlium_item", ["waveform", "sample_rate", "transcript", "talk_id", "speaker_id", "identifier"] +) + class TEDLIUM(Dataset): """ Create a Dataset for Tedlium. Each item is a tuple of the form: - waveform, sample_rate, utterance, speaker_id, chapter_id, utterance_id + [waveform, sample_rate, transcript, talk_id, speaker_id, identifier] """ - _ext_txt = ".stm" - _ext_audio = ".sph" - - def __init__(self, root: str, release: str = RELEASE, subset: str = None, download: bool = False) -> None: - + def __init__( + self, root: str, release: str = RELEASE, subset: str = None, download: bool = False, audio_ext=".sph" + ) -> None: + """Constructor for TEDLIUM dataset + + Args: + root (str): Path containing dataset or target path where its downloaded if needed + release (str, optional): TEDLIUM identifier (release1,release2,release3). Defaults to RELEASE. + subset (str, optional): Subset of data(train,test,dev) supported for release 1 and 2. Defaults to Train/None. + download (bool, optional): Download dataset in case is not founded in root path. Defaults to False. + audio_ext (str, optional): Overwrite audio extension when loading items. Defaults to ".sph". + + Raises: + RuntimeError: If release identifier does not match any supported release, + """ + self._ext_audio = audio_ext if release in _RELEASE_CONFIGS.keys(): folder_in_archive = _RELEASE_CONFIGS[release]["folder_in_archive"] url = _RELEASE_CONFIGS[release]["url"] @@ -75,18 +92,28 @@ def __init__(self, root: str, release: str = RELEASE, subset: str = None, downlo download_url(url, root, hash_value=checksum) extract_archive(archive) - walker = walk_files(self._path, suffix=self._ext_txt, prefix=False, remove_suffix=True) + walker = walk_files(self._path, suffix=".stm", prefix=False, remove_suffix=True) self._walker = list(walker) self._extended_walker = [] for file in self._walker: - stm_path = os.path.join(self._path, "stm", file + self._ext_txt) + stm_path = os.path.join(self._path, "stm", file + ".stm") with open(stm_path) as f: l = len(f.readlines()) self._extended_walker += [(file, line) for line in range(l)] - def load_tedlium_item(self, fileid: str, line: int, path: str) -> Tuple[Tensor, int, str, int, int, int]: + def load_tedlium_item(self, fileid: str, line: int, path: str) -> Tedlium_item: + """Loads a TEDLIUM dataset sample given a file name and corresponding sentence name + + Args: + fileid (str): File id to identify both text and audio files corresponding to the sample + line (int): Line identifier for the sample inside the text file + path (str): Dataset root path + + Returns: + Tedlium_item: A namedTuple containing [waveform, sample_rate, transcript, talk_id, speaker_id, identifier] + """ transcript_path = os.path.join(path, "stm", fileid) - with open(transcript_path + self._ext_txt) as f: + with open(transcript_path + ".stm") as f: transcript = f.readlines()[line] talk_id, _, speaker_id, start_time, end_time, identifier, transcript = transcript.split(" ", 6) @@ -96,21 +123,36 @@ def load_tedlium_item(self, fileid: str, line: int, path: str) -> Tuple[Tensor, start_time = int(float(start_time) * sample_rate) end_time = int(float(end_time) * sample_rate) waveform = waveform[:, start_time:end_time] - return ( - waveform, - sample_rate, - transcript, - talk_id, - speaker_id, - identifier, - ) + return Tedlium_item(waveform, sample_rate, transcript, talk_id, speaker_id, identifier) def load_audio(self, path: str) -> [Tensor, int]: + """Default load function used in TEDLIUM dataset, you can overwrite this function to customize functionality + + Args: + path (str): Path to audio file + + Returns: + [Tensor, int]: Audio tensor representation and sample rate + """ return torchaudio.load(path) - def __getitem__(self, n: int) -> Tuple[Tensor, int, str, int, int, int]: + def __getitem__(self, n: int) -> Tedlium_item: + """TEDLIUM dataset custom function overwritting default loadbehaviour. + Loads a TEDLIUM sample given a index N + + Args: + n (int): Index of sample to be loaded + + Returns: + Tedlium_item: A namedTuple containing [waveform, sample_rate, transcript, talk_id, speaker_id, identifier] + """ fileid, line = self._extended_walker[n] return self.load_tedlium_item(fileid, line, self._path) def __len__(self) -> int: + """DTEDLIUM dataset custom function overwritting len default behaviour. + + Returns: + int: TEDLIUM dataset length + """ return len(self._extended_walker) From e76ba7a809e8a066c214573ac63487cc0b1baab4 Mon Sep 17 00:00:00 2001 From: Jaime Ferrando Date: Mon, 17 Aug 2020 16:25:07 +0200 Subject: [PATCH 06/16] Style fix --- torchaudio/datasets/tedlium.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torchaudio/datasets/tedlium.py b/torchaudio/datasets/tedlium.py index 683e139f66..24380ca6d2 100644 --- a/torchaudio/datasets/tedlium.py +++ b/torchaudio/datasets/tedlium.py @@ -57,7 +57,7 @@ def __init__( Args: root (str): Path containing dataset or target path where its downloaded if needed release (str, optional): TEDLIUM identifier (release1,release2,release3). Defaults to RELEASE. - subset (str, optional): Subset of data(train,test,dev) supported for release 1 and 2. Defaults to Train/None. + subset (str, optional): Subset of data(train,test,dev) supported for release 1,2. Defaults to Train/None. download (bool, optional): Download dataset in case is not founded in root path. Defaults to False. audio_ext (str, optional): Overwrite audio extension when loading items. Defaults to ".sph". From d3fede53c69980497b40184423b795004dd09a7a Mon Sep 17 00:00:00 2001 From: Jaime Ferrando Date: Tue, 18 Aug 2020 00:12:04 +0200 Subject: [PATCH 07/16] Changes from PR feedback --- torchaudio/datasets/tedlium.py | 27 ++++++++++++++------------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/torchaudio/datasets/tedlium.py b/torchaudio/datasets/tedlium.py index 24380ca6d2..41fadc9ee4 100644 --- a/torchaudio/datasets/tedlium.py +++ b/torchaudio/datasets/tedlium.py @@ -92,14 +92,12 @@ def __init__( download_url(url, root, hash_value=checksum) extract_archive(archive) - walker = walk_files(self._path, suffix=".stm", prefix=False, remove_suffix=True) - self._walker = list(walker) - self._extended_walker = [] - for file in self._walker: + self._walker = [] + for file in walk_files(self._path, suffix=".stm", prefix=False, remove_suffix=True): stm_path = os.path.join(self._path, "stm", file + ".stm") with open(stm_path) as f: l = len(f.readlines()) - self._extended_walker += [(file, line) for line in range(l)] + self._walker.extend((file, line) for line in range(l)) def load_tedlium_item(self, fileid: str, line: int, path: str) -> Tedlium_item: """Loads a TEDLIUM dataset sample given a file name and corresponding sentence name @@ -118,23 +116,26 @@ def load_tedlium_item(self, fileid: str, line: int, path: str) -> Tedlium_item: talk_id, _, speaker_id, start_time, end_time, identifier, transcript = transcript.split(" ", 6) wave_path = os.path.join(path, "sph", fileid) - waveform, sample_rate = self.load_audio(wave_path + self._ext_audio) # Calculate indexes for start time and endtime - start_time = int(float(start_time) * sample_rate) - end_time = int(float(end_time) * sample_rate) - waveform = waveform[:, start_time:end_time] + start_time = int(float(start_time) * 16000) + end_time = int(float(end_time) * 16000) + waveform, sample_rate = self.load_audio( + wave_path + self._ext_audio, frame_offset=start_time, num_frames=end_time - start_time + ) return Tedlium_item(waveform, sample_rate, transcript, talk_id, speaker_id, identifier) - def load_audio(self, path: str) -> [Tensor, int]: + def load_audio(self, path: str, frame_offset: int = 0, num_frames: int = -1) -> [Tensor, int]: """Default load function used in TEDLIUM dataset, you can overwrite this function to customize functionality Args: path (str): Path to audio file + frame_offset (int, optional): Number of frames to use as offstet when loading audio. Defaults to 0 + num_frames (int, optional): How many frames to load from the audio. Defaults to -1 Returns: [Tensor, int]: Audio tensor representation and sample rate """ - return torchaudio.load(path) + return torchaudio.load(path, frame_offset=frame_offset, num_frames=num_frames) def __getitem__(self, n: int) -> Tedlium_item: """TEDLIUM dataset custom function overwritting default loadbehaviour. @@ -146,7 +147,7 @@ def __getitem__(self, n: int) -> Tedlium_item: Returns: Tedlium_item: A namedTuple containing [waveform, sample_rate, transcript, talk_id, speaker_id, identifier] """ - fileid, line = self._extended_walker[n] + fileid, line = self._walker[n] return self.load_tedlium_item(fileid, line, self._path) def __len__(self) -> int: @@ -155,4 +156,4 @@ def __len__(self) -> int: Returns: int: TEDLIUM dataset length """ - return len(self._extended_walker) + return len(self._walker) From 556a7f3df7194f46a8e7d2fdea7c4cbf1dc412c3 Mon Sep 17 00:00:00 2001 From: Jaime Ferrando Date: Tue, 18 Aug 2020 16:26:25 +0200 Subject: [PATCH 08/16] Changes from PR feedback and phoneme dict function --- torchaudio/datasets/tedlium.py | 47 +++++++++++++++++++++++++--------- 1 file changed, 35 insertions(+), 12 deletions(-) diff --git a/torchaudio/datasets/tedlium.py b/torchaudio/datasets/tedlium.py index 41fadc9ee4..6fc4953ad7 100644 --- a/torchaudio/datasets/tedlium.py +++ b/torchaudio/datasets/tedlium.py @@ -12,8 +12,6 @@ from collections import namedtuple -RELEASE = "release1" # Default release - _RELEASE_CONFIGS = { "release1": { "folder_in_archive": "TEDLIUM_release1", @@ -21,6 +19,7 @@ "checksum": "30301975fd8c5cac4040c261c0852f57cfa8adbbad2ce78e77e4986957445f27", "data_path": "", "subset": "train", + "dict": "TEDLIUM.150K.dic", }, "release2": { "folder_in_archive": "TEDLIUM_release2", @@ -28,6 +27,7 @@ "checksum": "93281b5fcaaae5c88671c9d000b443cb3c7ea3499ad12010b3934ca41a7b9c58", "data_path": "", "subset": "train", + "dict": "TEDLIUM.152k.dic", }, "release3": { "folder_in_archive": "TEDLIUM_release-3", @@ -35,6 +35,7 @@ "checksum": "ad1e454d14d1ad550bc2564c462d87c7a7ec83d4dc2b9210f22ab4973b9eccdb", "data_path": "data/", "subset": None, + "dict": "TEDLIUM.152k.dic", }, } @@ -50,7 +51,7 @@ class TEDLIUM(Dataset): """ def __init__( - self, root: str, release: str = RELEASE, subset: str = None, download: bool = False, audio_ext=".sph" + self, root: str, release: str = "release1", subset: str = None, download: bool = False, audio_ext=".sph" ) -> None: """Constructor for TEDLIUM dataset @@ -93,11 +94,24 @@ def __init__( extract_archive(archive) self._walker = [] + + # Create walker for all samples for file in walk_files(self._path, suffix=".stm", prefix=False, remove_suffix=True): stm_path = os.path.join(self._path, "stm", file + ".stm") with open(stm_path) as f: l = len(f.readlines()) self._walker.extend((file, line) for line in range(l)) + # Read phoneme dictionary + dict_path = os.path.join(root, folder_in_archive, _RELEASE_CONFIGS[release]["dict"]) + self.phoneme_dict = {} + with open(dict_path, "rb") as f: + for line in f.readlines(): + content = line.decode("utf-8").strip("\n").split(" ", 1) + if len(content) > 1: + key, value = content[0], content[1] + self.phoneme_dict[key] = value.strip() + # # Some lines in release1 dont have a phoneme for a word so value will be out of index + # # Need to find a better solution to read the dictionary def load_tedlium_item(self, fileid: str, line: int, path: str) -> Tedlium_item: """Loads a TEDLIUM dataset sample given a file name and corresponding sentence name @@ -116,26 +130,27 @@ def load_tedlium_item(self, fileid: str, line: int, path: str) -> Tedlium_item: talk_id, _, speaker_id, start_time, end_time, identifier, transcript = transcript.split(" ", 6) wave_path = os.path.join(path, "sph", fileid) - # Calculate indexes for start time and endtime - start_time = int(float(start_time) * 16000) - end_time = int(float(end_time) * 16000) - waveform, sample_rate = self.load_audio( - wave_path + self._ext_audio, frame_offset=start_time, num_frames=end_time - start_time + waveform, sample_rate = self.__load_audio__( + wave_path + self._ext_audio, start_time=start_time, end_time=end_time ) + return Tedlium_item(waveform, sample_rate, transcript, talk_id, speaker_id, identifier) - def load_audio(self, path: str, frame_offset: int = 0, num_frames: int = -1) -> [Tensor, int]: + def __load_audio__(self, path: str, start_time: float, end_time: float, sample_rate: int = 16000) -> [Tensor, int]: """Default load function used in TEDLIUM dataset, you can overwrite this function to customize functionality + and load individual sentnces from a full ted audio talk file Args: path (str): Path to audio file - frame_offset (int, optional): Number of frames to use as offstet when loading audio. Defaults to 0 - num_frames (int, optional): How many frames to load from the audio. Defaults to -1 + start_time (int, optional): Time in seconds where the sample sentence stars + end_time (int, optional): Time in seconds where the sample sentence finishes Returns: [Tensor, int]: Audio tensor representation and sample rate """ - return torchaudio.load(path, frame_offset=frame_offset, num_frames=num_frames) + start_time = int(float(start_time) * 16000) + end_time = int(float(end_time) * 16000) + return torchaudio.load(path, frame_offset=start_time, num_frames=end_time - start_time) def __getitem__(self, n: int) -> Tedlium_item: """TEDLIUM dataset custom function overwritting default loadbehaviour. @@ -157,3 +172,11 @@ def __len__(self) -> int: int: TEDLIUM dataset length """ return len(self._walker) + + def get_phoneme_dict(self): + """Returns the phoneme dictionary of a TEDLIUM release + + Returns: + dictionary: Phoneme dictionary for the current tedlium release + """ + return self.phoneme_dict From 3f186366a292d897c2b94e6d6e9f5aeb75a6fa8f Mon Sep 17 00:00:00 2001 From: Jaime Ferrando Date: Wed, 19 Aug 2020 03:01:34 +0200 Subject: [PATCH 09/16] Changes from PR feedback --- torchaudio/datasets/tedlium.py | 33 +++++++++++++++------------------ 1 file changed, 15 insertions(+), 18 deletions(-) diff --git a/torchaudio/datasets/tedlium.py b/torchaudio/datasets/tedlium.py index 6fc4953ad7..8ceafcb657 100644 --- a/torchaudio/datasets/tedlium.py +++ b/torchaudio/datasets/tedlium.py @@ -58,7 +58,7 @@ def __init__( Args: root (str): Path containing dataset or target path where its downloaded if needed release (str, optional): TEDLIUM identifier (release1,release2,release3). Defaults to RELEASE. - subset (str, optional): Subset of data(train,test,dev) supported for release 1,2. Defaults to Train/None. + subset (str, optional): train/dev/test for releases 1&2, None for release3. Defaults to Train/None download (bool, optional): Download dataset in case is not founded in root path. Defaults to False. audio_ext (str, optional): Overwrite audio extension when loading items. Defaults to ".sph". @@ -93,25 +93,24 @@ def __init__( download_url(url, root, hash_value=checksum) extract_archive(archive) + # Create walker for all samples self._walker = [] + stm_path = os.path.join(self._path, "stm") + for file in os.listdir(stm_path): + if file.endswith(".stm"): + stm_path = os.path.join(self._path, "stm", file) + with open(stm_path) as f: + l = len(f.readlines()) + file = file.replace(".stm", "") + self._walker.extend((file, line) for line in range(l)) - # Create walker for all samples - for file in walk_files(self._path, suffix=".stm", prefix=False, remove_suffix=True): - stm_path = os.path.join(self._path, "stm", file + ".stm") - with open(stm_path) as f: - l = len(f.readlines()) - self._walker.extend((file, line) for line in range(l)) # Read phoneme dictionary dict_path = os.path.join(root, folder_in_archive, _RELEASE_CONFIGS[release]["dict"]) self.phoneme_dict = {} - with open(dict_path, "rb") as f: + with open(dict_path, "r", encoding="utf-8") as f: for line in f.readlines(): - content = line.decode("utf-8").strip("\n").split(" ", 1) - if len(content) > 1: - key, value = content[0], content[1] - self.phoneme_dict[key] = value.strip() - # # Some lines in release1 dont have a phoneme for a word so value will be out of index - # # Need to find a better solution to read the dictionary + content = line.strip().split(maxsplit=1) + self.phoneme_dict[content[0]] = content[1:] # content[1:] can be empty list def load_tedlium_item(self, fileid: str, line: int, path: str) -> Tedlium_item: """Loads a TEDLIUM dataset sample given a file name and corresponding sentence name @@ -130,13 +129,11 @@ def load_tedlium_item(self, fileid: str, line: int, path: str) -> Tedlium_item: talk_id, _, speaker_id, start_time, end_time, identifier, transcript = transcript.split(" ", 6) wave_path = os.path.join(path, "sph", fileid) - waveform, sample_rate = self.__load_audio__( - wave_path + self._ext_audio, start_time=start_time, end_time=end_time - ) + waveform, sample_rate = self._load_audio(wave_path + self._ext_audio, start_time=start_time, end_time=end_time) return Tedlium_item(waveform, sample_rate, transcript, talk_id, speaker_id, identifier) - def __load_audio__(self, path: str, start_time: float, end_time: float, sample_rate: int = 16000) -> [Tensor, int]: + def _load_audio(self, path: str, start_time: float, end_time: float, sample_rate: int = 16000) -> [Tensor, int]: """Default load function used in TEDLIUM dataset, you can overwrite this function to customize functionality and load individual sentnces from a full ted audio talk file From 8a0b922646f464860eae121b75de17a42208d387 Mon Sep 17 00:00:00 2001 From: Jaime Ferrando Date: Wed, 19 Aug 2020 03:02:07 +0200 Subject: [PATCH 10/16] Changes to dataset docs, adding tedlium --- docs/source/datasets.rst | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/docs/source/datasets.rst b/docs/source/datasets.rst index 96ba045c5e..81f3144639 100644 --- a/docs/source/datasets.rst +++ b/docs/source/datasets.rst @@ -4,11 +4,11 @@ torchaudio.datasets All datasets are subclasses of :class:`torch.utils.data.Dataset` i.e, they have ``__getitem__`` and ``__len__`` methods implemented. Hence, they can all be passed to a :class:`torch.utils.data.DataLoader` -which can load multiple samples parallelly using ``torch.multiprocessing`` workers. +which can load multiple samples parallelly using ``torch.multiprocessing`` workers. For example: :: - + yesno_data = torchaudio.datasets.YESNO('.', download=True) - data_loader = torch.utils.data.DataLoader(yesno_data, + data_loader = torch.utils.data.DataLoader(yesno_data, batch_size=1, shuffle=True, num_workers=args.nThreads) @@ -22,7 +22,7 @@ All the datasets have almost similar API. They all have two common arguments: ``transform`` and ``target_transform`` to transform the input and target respectively. -.. currentmodule:: torchaudio.datasets +.. currentmodule:: torchaudio.datasets CMUARCTIC @@ -81,6 +81,13 @@ SPEECHCOMMANDS :special-members: +TEDLIUM +~~~~~~~~~~~~~~ + +.. autoclass:: TEDLIUM + :members: __getitem__ + :special-members: get_phoneme_dict + VCTK ~~~~ From 90d1db1f505bf812f271353418bd07008e31daba Mon Sep 17 00:00:00 2001 From: Jaime Ferrando Date: Thu, 20 Aug 2020 10:54:59 +0200 Subject: [PATCH 11/16] Changes from PR feedback --- torchaudio/datasets/tedlium.py | 41 +++++++++++++++++++++++++++++----- 1 file changed, 35 insertions(+), 6 deletions(-) diff --git a/torchaudio/datasets/tedlium.py b/torchaudio/datasets/tedlium.py index 8ceafcb657..2da13aea94 100644 --- a/torchaudio/datasets/tedlium.py +++ b/torchaudio/datasets/tedlium.py @@ -7,7 +7,6 @@ from torchaudio.datasets.utils import ( download_url, extract_archive, - walk_files, ) from collections import namedtuple @@ -19,6 +18,7 @@ "checksum": "30301975fd8c5cac4040c261c0852f57cfa8adbbad2ce78e77e4986957445f27", "data_path": "", "subset": "train", + "supported_subsets": ["train", "test", "dev"], "dict": "TEDLIUM.150K.dic", }, "release2": { @@ -27,6 +27,7 @@ "checksum": "93281b5fcaaae5c88671c9d000b443cb3c7ea3499ad12010b3934ca41a7b9c58", "data_path": "", "subset": "train", + "supported_subsets": ["train", "test", "dev"], "dict": "TEDLIUM.152k.dic", }, "release3": { @@ -35,6 +36,7 @@ "checksum": "ad1e454d14d1ad550bc2564c462d87c7a7ec83d4dc2b9210f22ab4973b9eccdb", "data_path": "data/", "subset": None, + "supported_subsets": [None], "dict": "TEDLIUM.152k.dic", }, } @@ -46,8 +48,27 @@ class TEDLIUM(Dataset): """ - Create a Dataset for Tedlium. Each item is a tuple of the form: + Create a Dataset for Tedlium. It supports releases 1,2 and 3, each item is a tuple of the form: [waveform, sample_rate, transcript, talk_id, speaker_id, identifier] + + Constructor arguments: + + Args: + root (str): Path containing dataset or target path where its downloaded if needed + release (str, optional): TEDLIUM identifier (release1,release2,release3). Defaults to RELEASE. + subset (str, optional): train/dev/test for releases 1&2, None for release3. Defaults to Train/None + download (bool, optional): Download dataset in case is not founded in root path. Defaults to False. + audio_ext (str, optional): Overwrite audio extension when loading items. Defaults to ".sph". + + Special functions: + + _load_tedlium_item: Loads a TEDLIUM dataset sample given a file name and corresponding sentence name + + _load_audio: Default load function used in TEDLIUM dataset, you can overwrite this function to customize + functionality and load individual sentences from a full ted audio talk file + + get_phoneme_dict: Returns the phoneme dictionary of a TEDLIUM release + """ def __init__( @@ -77,6 +98,13 @@ def __init__( release, _RELEASE_CONFIGS.keys(), ) ) + if subset not in _RELEASE_CONFIGS[release]["supported_subsets"]: + # Raise warning + raise RuntimeError( + "The subset {} does not match any of the supported tedlium subsets{} ".format( + subset, _RELEASE_CONFIGS[release]["supported_subsets"], + ) + ) basename = os.path.basename(url) archive = os.path.join(root, basename) @@ -86,6 +114,7 @@ def __init__( self._path = os.path.join(root, folder_in_archive, _RELEASE_CONFIGS[release]["data_path"]) if subset in ["train", "dev", "test"]: self._path = os.path.join(self._path, subset) + if download: if not os.path.isdir(self._path): if not os.path.isfile(archive): @@ -96,7 +125,7 @@ def __init__( # Create walker for all samples self._walker = [] stm_path = os.path.join(self._path, "stm") - for file in os.listdir(stm_path): + for file in sorted(os.listdir(stm_path)): if file.endswith(".stm"): stm_path = os.path.join(self._path, "stm", file) with open(stm_path) as f: @@ -112,7 +141,7 @@ def __init__( content = line.strip().split(maxsplit=1) self.phoneme_dict[content[0]] = content[1:] # content[1:] can be empty list - def load_tedlium_item(self, fileid: str, line: int, path: str) -> Tedlium_item: + def _load_tedlium_item(self, fileid: str, line: int, path: str) -> Tedlium_item: """Loads a TEDLIUM dataset sample given a file name and corresponding sentence name Args: @@ -135,7 +164,7 @@ def load_tedlium_item(self, fileid: str, line: int, path: str) -> Tedlium_item: def _load_audio(self, path: str, start_time: float, end_time: float, sample_rate: int = 16000) -> [Tensor, int]: """Default load function used in TEDLIUM dataset, you can overwrite this function to customize functionality - and load individual sentnces from a full ted audio talk file + and load individual sentences from a full ted audio talk file Args: path (str): Path to audio file @@ -160,7 +189,7 @@ def __getitem__(self, n: int) -> Tedlium_item: Tedlium_item: A namedTuple containing [waveform, sample_rate, transcript, talk_id, speaker_id, identifier] """ fileid, line = self._walker[n] - return self.load_tedlium_item(fileid, line, self._path) + return self._load_tedlium_item(fileid, line, self._path) def __len__(self) -> int: """DTEDLIUM dataset custom function overwritting len default behaviour. From 5125ebfc3cfced0af999951dd1e002140e696c8f Mon Sep 17 00:00:00 2001 From: Jaime Ferrando Date: Wed, 9 Sep 2020 15:47:47 +0200 Subject: [PATCH 12/16] Tedlium test and minor improvements to tedlium class --- .../datasets/tedlium_test.py | 93 +++++++++++++++++++ torchaudio/datasets/tedlium.py | 44 +++++---- 2 files changed, 114 insertions(+), 23 deletions(-) create mode 100644 test/torchaudio_unittest/datasets/tedlium_test.py diff --git a/test/torchaudio_unittest/datasets/tedlium_test.py b/test/torchaudio_unittest/datasets/tedlium_test.py new file mode 100644 index 0000000000..7aa8c03ce3 --- /dev/null +++ b/test/torchaudio_unittest/datasets/tedlium_test.py @@ -0,0 +1,93 @@ +import os + +from torchaudio.datasets import tedlium + +from torchaudio_unittest.common_utils import ( + TempDirMixin, + TorchaudioTestCase, + get_whitenoise, + save_wav, + normalize_wav, +) + +# Used to generate a unique utterance for each dummy audio file +UTTERANCES = [ + "AaronHuey_2010X 1 AaronHuey_2010X 0.0 2.0 script1\n", + "AaronHuey_2010X 1 AaronHuey_2010X 2.0 4.0 script2\n", + "AaronHuey_2010X 1 AaronHuey_2010X 4.0 6.0 script3\n", + "AaronHuey_2010X 1 AaronHuey_2010X 6.0 8.0 script4\n", + "AaronHuey_2010X 1 AaronHuey_2010X 8.0 10.0 script5\n", +] + + +class TestTedlium(TempDirMixin, TorchaudioTestCase): + backend = "default" + + root_dir = None + samples = [] + + @classmethod + def setUpClass(cls): + cls.root_dir = cls.get_base_temp_dir() + cls.root_dir = dataset_dir = os.path.join(cls.root_dir, "tedlium") + os.makedirs(dataset_dir, exist_ok=True) + sample_rate = 16000 # 16kHz + seed = 0 + data = get_whitenoise(sample_rate=sample_rate, duration=10.00, n_channels=1, dtype="float32", seed=seed) + for release in ["release1", "release2", "release3"]: + if release in ["release1", "release2"]: + release_dir = os.path.join( + dataset_dir, + tedlium._RELEASE_CONFIGS[release]["folder_in_archive"], + tedlium._RELEASE_CONFIGS[release]["subset"], + ) + else: + release_dir = os.path.join(dataset_dir, tedlium._RELEASE_CONFIGS[release]["folder_in_archive"]) + os.makedirs(release_dir, exist_ok=True) + os.makedirs(os.path.join(release_dir, "stm"), exist_ok=True) # Subfolder for transcripts + os.makedirs(os.path.join(release_dir, "sph"), exist_ok=True) # Subfolder for audio files + filename = f"{release}.sph" + path = os.path.join(os.path.join(release_dir, "sph"), filename) + save_wav(path, data, sample_rate) + + trans_filename = f"{release}.stm" + trans_path = os.path.join(os.path.join(release_dir, "stm"), trans_filename) + with open(trans_path, "w") as f: + f.write("".join(UTTERANCES)) + + # Create a samples list to compare with + for i, utterance in enumerate(UTTERANCES): + talk_id, _, speaker_id, start_time, end_time, identifier, transcript = utterance.split(" ", 6) + start_time = int(float(start_time)) * 16000 + end_time = int(float(end_time)) * 16000 + sample = ( + data[:, start_time:end_time], + sample_rate, + transcript, + talk_id, + speaker_id, + identifier, + ) + cls.samples.append(sample) + + @classmethod + def tearDownClass(cls): + # In case of test failure + tedlium.TEDLIUM._ext_audio = ".flac" + + def test_tedlium(self): + tedlium.TEDLIUM._ext_audio = ".wav" + dataset = tedlium.TEDLIUM(self.root_dir) + print(dataset._path) + num_samples = 0 + for i, (data, sample_rate, transcript, talk_id, speaker_id, identifier) in enumerate(dataset): + self.assertEqual(data, self.samples[i][0], atol=5e-5, rtol=1e-8) + assert sample_rate == self.samples[i][1] + assert transcript == self.samples[i][2] + assert talk_id == self.samples[i][3] + assert speaker_id == self.samples[i][4] + assert identifier == self.samples[i][5] + num_samples += 1 + + assert num_samples == len(self.samples) + tedlium.TEDLIUM._ext_audio = ".flac" diff --git a/torchaudio/datasets/tedlium.py b/torchaudio/datasets/tedlium.py index 2da13aea94..cbcd01a473 100644 --- a/torchaudio/datasets/tedlium.py +++ b/torchaudio/datasets/tedlium.py @@ -8,7 +8,6 @@ download_url, extract_archive, ) -from collections import namedtuple _RELEASE_CONFIGS = { @@ -41,14 +40,10 @@ }, } -Tedlium_item = namedtuple( - "Tedlium_item", ["waveform", "sample_rate", "transcript", "talk_id", "speaker_id", "identifier"] -) - class TEDLIUM(Dataset): """ - Create a Dataset for Tedlium. It supports releases 1,2 and 3, each item is a tuple of the form: + Create a Dataset for Tedlium. It supports releases 1,2 and 3, each item is a list containings: [waveform, sample_rate, transcript, talk_id, speaker_id, identifier] Constructor arguments: @@ -122,8 +117,8 @@ def __init__( download_url(url, root, hash_value=checksum) extract_archive(archive) - # Create walker for all samples - self._walker = [] + # Create list for all samples + self._filelist = [] stm_path = os.path.join(self._path, "stm") for file in sorted(os.listdir(stm_path)): if file.endswith(".stm"): @@ -131,17 +126,11 @@ def __init__( with open(stm_path) as f: l = len(f.readlines()) file = file.replace(".stm", "") - self._walker.extend((file, line) for line in range(l)) + self._filelist.extend((file, line) for line in range(l)) + # Create dict path for later read + self.dict_path = os.path.join(root, folder_in_archive, _RELEASE_CONFIGS[release]["dict"]) - # Read phoneme dictionary - dict_path = os.path.join(root, folder_in_archive, _RELEASE_CONFIGS[release]["dict"]) - self.phoneme_dict = {} - with open(dict_path, "r", encoding="utf-8") as f: - for line in f.readlines(): - content = line.strip().split(maxsplit=1) - self.phoneme_dict[content[0]] = content[1:] # content[1:] can be empty list - - def _load_tedlium_item(self, fileid: str, line: int, path: str) -> Tedlium_item: + def _load_tedlium_item(self, fileid: str, line: int, path: str) -> Tuple[Tensor, int, str, int, int, int]: """Loads a TEDLIUM dataset sample given a file name and corresponding sentence name Args: @@ -160,7 +149,7 @@ def _load_tedlium_item(self, fileid: str, line: int, path: str) -> Tedlium_item: wave_path = os.path.join(path, "sph", fileid) waveform, sample_rate = self._load_audio(wave_path + self._ext_audio, start_time=start_time, end_time=end_time) - return Tedlium_item(waveform, sample_rate, transcript, talk_id, speaker_id, identifier) + return (waveform, sample_rate, transcript, talk_id, speaker_id, identifier) def _load_audio(self, path: str, start_time: float, end_time: float, sample_rate: int = 16000) -> [Tensor, int]: """Default load function used in TEDLIUM dataset, you can overwrite this function to customize functionality @@ -176,9 +165,11 @@ def _load_audio(self, path: str, start_time: float, end_time: float, sample_rate """ start_time = int(float(start_time) * 16000) end_time = int(float(end_time) * 16000) - return torchaudio.load(path, frame_offset=start_time, num_frames=end_time - start_time) + if torchaudio.get_audio_backend() == "sox_io": + return torchaudio.load(path, frame_offset=start_time, num_frames=end_time - start_time) + return torchaudio.load(path)[:, start_time:end_time] - def __getitem__(self, n: int) -> Tedlium_item: + def __getitem__(self, n: int) -> Tuple[Tensor, int, str, int, int, int]: """TEDLIUM dataset custom function overwritting default loadbehaviour. Loads a TEDLIUM sample given a index N @@ -188,7 +179,7 @@ def __getitem__(self, n: int) -> Tedlium_item: Returns: Tedlium_item: A namedTuple containing [waveform, sample_rate, transcript, talk_id, speaker_id, identifier] """ - fileid, line = self._walker[n] + fileid, line = self._filelist[n] return self._load_tedlium_item(fileid, line, self._path) def __len__(self) -> int: @@ -197,7 +188,7 @@ def __len__(self) -> int: Returns: int: TEDLIUM dataset length """ - return len(self._walker) + return len(self._filelist) def get_phoneme_dict(self): """Returns the phoneme dictionary of a TEDLIUM release @@ -205,4 +196,11 @@ def get_phoneme_dict(self): Returns: dictionary: Phoneme dictionary for the current tedlium release """ + # Read phoneme dictionary + if not hasattr(self, "phoneme_dict"): + self.phoneme_dict = {} + with open(self.dict_path, "r", encoding="utf-8") as f: + for line in f.readlines(): + content = line.strip().split(maxsplit=1) + self.phoneme_dict[content[0]] = content[1:] # content[1:] can be empty list return self.phoneme_dict From f6bae1cf49c68d73cba7c83ca6769d52246f9543 Mon Sep 17 00:00:00 2001 From: Jaime Ferrando Date: Thu, 10 Sep 2020 10:36:06 +0200 Subject: [PATCH 13/16] Created test for every release and improvements from PR feedback --- .../datasets/tedlium_test.py | 99 ++++++++++++------- torchaudio/datasets/tedlium.py | 12 ++- 2 files changed, 72 insertions(+), 39 deletions(-) diff --git a/test/torchaudio_unittest/datasets/tedlium_test.py b/test/torchaudio_unittest/datasets/tedlium_test.py index 7aa8c03ce3..cbaa9de522 100644 --- a/test/torchaudio_unittest/datasets/tedlium_test.py +++ b/test/torchaudio_unittest/datasets/tedlium_test.py @@ -24,7 +24,7 @@ class TestTedlium(TempDirMixin, TorchaudioTestCase): backend = "default" root_dir = None - samples = [] + samples = {} @classmethod def setUpClass(cls): @@ -33,8 +33,9 @@ def setUpClass(cls): os.makedirs(dataset_dir, exist_ok=True) sample_rate = 16000 # 16kHz seed = 0 - data = get_whitenoise(sample_rate=sample_rate, duration=10.00, n_channels=1, dtype="float32", seed=seed) + for release in ["release1", "release2", "release3"]: + data = get_whitenoise(sample_rate=sample_rate, duration=10.00, n_channels=1, dtype="float32", seed=seed) if release in ["release1", "release2"]: release_dir = os.path.join( dataset_dir, @@ -42,7 +43,11 @@ def setUpClass(cls): tedlium._RELEASE_CONFIGS[release]["subset"], ) else: - release_dir = os.path.join(dataset_dir, tedlium._RELEASE_CONFIGS[release]["folder_in_archive"]) + release_dir = os.path.join( + dataset_dir, + tedlium._RELEASE_CONFIGS[release]["folder_in_archive"], + tedlium._RELEASE_CONFIGS[release]["data_path"], + ) os.makedirs(release_dir, exist_ok=True) os.makedirs(os.path.join(release_dir, "stm"), exist_ok=True) # Subfolder for transcripts os.makedirs(os.path.join(release_dir, "sph"), exist_ok=True) # Subfolder for audio files @@ -55,39 +60,65 @@ def setUpClass(cls): with open(trans_path, "w") as f: f.write("".join(UTTERANCES)) - # Create a samples list to compare with - for i, utterance in enumerate(UTTERANCES): - talk_id, _, speaker_id, start_time, end_time, identifier, transcript = utterance.split(" ", 6) - start_time = int(float(start_time)) * 16000 - end_time = int(float(end_time)) * 16000 - sample = ( - data[:, start_time:end_time], - sample_rate, - transcript, - talk_id, - speaker_id, - identifier, - ) - cls.samples.append(sample) + # Create a samples list to compare with + cls.samples[release] = [] + for i, utterance in enumerate(UTTERANCES): + talk_id, _, speaker_id, start_time, end_time, identifier, transcript = utterance.split(" ", 6) + start_time = int(float(start_time)) * sample_rate + end_time = int(float(end_time)) * sample_rate + sample = ( + data[:, start_time:end_time], + sample_rate, + transcript, + talk_id, + speaker_id, + identifier, + ) + cls.samples[release].append(sample) + seed += 1 - @classmethod - def tearDownClass(cls): - # In case of test failure - tedlium.TEDLIUM._ext_audio = ".flac" - - def test_tedlium(self): - tedlium.TEDLIUM._ext_audio = ".wav" - dataset = tedlium.TEDLIUM(self.root_dir) - print(dataset._path) + def test_tedlium_release1(self): + release = "release1" + dataset = tedlium.TEDLIUM(self.root_dir, release=release) + num_samples = 0 + for i, (data, sample_rate, transcript, talk_id, speaker_id, identifier) in enumerate(dataset): + self.assertEqual(data, self.samples[release][i][0], atol=5e-5, rtol=1e-8) + assert sample_rate == self.samples[release][i][1] + assert transcript == self.samples[release][i][2] + assert talk_id == self.samples[release][i][3] + assert speaker_id == self.samples[release][i][4] + assert identifier == self.samples[release][i][5] + num_samples += 1 + + assert num_samples == len(self.samples[release]) + + def test_tedlium_release2(self): + release = "release2" + dataset = tedlium.TEDLIUM(self.root_dir, release=release) num_samples = 0 for i, (data, sample_rate, transcript, talk_id, speaker_id, identifier) in enumerate(dataset): - self.assertEqual(data, self.samples[i][0], atol=5e-5, rtol=1e-8) - assert sample_rate == self.samples[i][1] - assert transcript == self.samples[i][2] - assert talk_id == self.samples[i][3] - assert speaker_id == self.samples[i][4] - assert identifier == self.samples[i][5] + self.assertEqual(data, self.samples[release][i][0], atol=5e-5, rtol=1e-8) + assert sample_rate == self.samples[release][i][1] + assert transcript == self.samples[release][i][2] + assert talk_id == self.samples[release][i][3] + assert speaker_id == self.samples[release][i][4] + assert identifier == self.samples[release][i][5] num_samples += 1 - assert num_samples == len(self.samples) - tedlium.TEDLIUM._ext_audio = ".flac" + assert num_samples == len(self.samples[release]) + + def test_tedlium_release3(self): + release = "release3" + dataset = tedlium.TEDLIUM(self.root_dir, release=release) + num_samples = 0 + for i, (data, sample_rate, transcript, talk_id, speaker_id, identifier) in enumerate(dataset): + self.assertEqual(data, self.samples[release][i][0], atol=5e-5, rtol=1e-8) + assert sample_rate == self.samples[release][i][1] + assert transcript == self.samples[release][i][2] + assert talk_id == self.samples[release][i][3] + assert speaker_id == self.samples[release][i][4] + assert identifier == self.samples[release][i][5] + num_samples += 1 + + assert num_samples == len(self.samples[release]) + diff --git a/torchaudio/datasets/tedlium.py b/torchaudio/datasets/tedlium.py index cbcd01a473..520486d02a 100644 --- a/torchaudio/datasets/tedlium.py +++ b/torchaudio/datasets/tedlium.py @@ -128,7 +128,8 @@ def __init__( file = file.replace(".stm", "") self._filelist.extend((file, line) for line in range(l)) # Create dict path for later read - self.dict_path = os.path.join(root, folder_in_archive, _RELEASE_CONFIGS[release]["dict"]) + self._dict_path = os.path.join(root, folder_in_archive, _RELEASE_CONFIGS[release]["dict"]) + self._phoneme_dict = None def _load_tedlium_item(self, fileid: str, line: int, path: str) -> Tuple[Tensor, int, str, int, int, int]: """Loads a TEDLIUM dataset sample given a file name and corresponding sentence name @@ -190,6 +191,7 @@ def __len__(self) -> int: """ return len(self._filelist) + @property def get_phoneme_dict(self): """Returns the phoneme dictionary of a TEDLIUM release @@ -197,10 +199,10 @@ def get_phoneme_dict(self): dictionary: Phoneme dictionary for the current tedlium release """ # Read phoneme dictionary - if not hasattr(self, "phoneme_dict"): - self.phoneme_dict = {} + if not self._phoneme_dict: + self._phoneme_dict = {} with open(self.dict_path, "r", encoding="utf-8") as f: for line in f.readlines(): content = line.strip().split(maxsplit=1) - self.phoneme_dict[content[0]] = content[1:] # content[1:] can be empty list - return self.phoneme_dict + self._phoneme_dict[content[0]] = content[1:] # content[1:] can be empty list + return self._phoneme_dict.copy() From 0dfda8d698b9d599448c9e624cbc22cf18a29ffd Mon Sep 17 00:00:00 2001 From: Jaime Ferrando Date: Fri, 11 Sep 2020 10:03:30 +0200 Subject: [PATCH 14/16] PR feedback changes --- .../datasets/tedlium_test.py | 2 +- torchaudio/datasets/tedlium.py | 20 +++++++++---------- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/test/torchaudio_unittest/datasets/tedlium_test.py b/test/torchaudio_unittest/datasets/tedlium_test.py index cbaa9de522..f2e0f894c8 100644 --- a/test/torchaudio_unittest/datasets/tedlium_test.py +++ b/test/torchaudio_unittest/datasets/tedlium_test.py @@ -62,7 +62,7 @@ def setUpClass(cls): # Create a samples list to compare with cls.samples[release] = [] - for i, utterance in enumerate(UTTERANCES): + for utterance in UTTERANCES: talk_id, _, speaker_id, start_time, end_time, identifier, transcript = utterance.split(" ", 6) start_time = int(float(start_time)) * sample_rate end_time = int(float(end_time)) * sample_rate diff --git a/torchaudio/datasets/tedlium.py b/torchaudio/datasets/tedlium.py index 520486d02a..0551b15154 100644 --- a/torchaudio/datasets/tedlium.py +++ b/torchaudio/datasets/tedlium.py @@ -44,7 +44,7 @@ class TEDLIUM(Dataset): """ Create a Dataset for Tedlium. It supports releases 1,2 and 3, each item is a list containings: - [waveform, sample_rate, transcript, talk_id, speaker_id, identifier] + [waveform, sample_rate, transcript, talk_id, speaker_id, identifier]. Constructor arguments: @@ -69,7 +69,7 @@ class TEDLIUM(Dataset): def __init__( self, root: str, release: str = "release1", subset: str = None, download: bool = False, audio_ext=".sph" ) -> None: - """Constructor for TEDLIUM dataset + """Constructor for TEDLIUM dataset. Args: root (str): Path containing dataset or target path where its downloaded if needed @@ -132,7 +132,7 @@ def __init__( self._phoneme_dict = None def _load_tedlium_item(self, fileid: str, line: int, path: str) -> Tuple[Tensor, int, str, int, int, int]: - """Loads a TEDLIUM dataset sample given a file name and corresponding sentence name + """Loads a TEDLIUM dataset sample given a file name and corresponding sentence name. Args: fileid (str): File id to identify both text and audio files corresponding to the sample @@ -154,7 +154,7 @@ def _load_tedlium_item(self, fileid: str, line: int, path: str) -> Tuple[Tensor, def _load_audio(self, path: str, start_time: float, end_time: float, sample_rate: int = 16000) -> [Tensor, int]: """Default load function used in TEDLIUM dataset, you can overwrite this function to customize functionality - and load individual sentences from a full ted audio talk file + and load individual sentences from a full ted audio talk file. Args: path (str): Path to audio file @@ -171,8 +171,8 @@ def _load_audio(self, path: str, start_time: float, end_time: float, sample_rate return torchaudio.load(path)[:, start_time:end_time] def __getitem__(self, n: int) -> Tuple[Tensor, int, str, int, int, int]: - """TEDLIUM dataset custom function overwritting default loadbehaviour. - Loads a TEDLIUM sample given a index N + """TEDLIUM dataset custom function overwritting default loadbehaviour + Loads a TEDLIUM sample given a index N. Args: n (int): Index of sample to be loaded @@ -184,7 +184,7 @@ def __getitem__(self, n: int) -> Tuple[Tensor, int, str, int, int, int]: return self._load_tedlium_item(fileid, line, self._path) def __len__(self) -> int: - """DTEDLIUM dataset custom function overwritting len default behaviour. + """TEDLIUM dataset custom function overwritting len default behaviour. Returns: int: TEDLIUM dataset length @@ -192,8 +192,8 @@ def __len__(self) -> int: return len(self._filelist) @property - def get_phoneme_dict(self): - """Returns the phoneme dictionary of a TEDLIUM release + def phoneme_dict(self): + """Returns the phoneme dictionary of a TEDLIUM release. Returns: dictionary: Phoneme dictionary for the current tedlium release @@ -204,5 +204,5 @@ def get_phoneme_dict(self): with open(self.dict_path, "r", encoding="utf-8") as f: for line in f.readlines(): content = line.strip().split(maxsplit=1) - self._phoneme_dict[content[0]] = content[1:] # content[1:] can be empty list + self._phoneme_dict[content[0]] = tuple(content[1:]) # content[1:] can be empty list return self._phoneme_dict.copy() From eecd46ae3b1ae4dbf6736af13790967eb9621624 Mon Sep 17 00:00:00 2001 From: Jaime Ferrando Date: Fri, 11 Sep 2020 23:10:09 +0200 Subject: [PATCH 15/16] Test for dic loading and fix naming private variables --- .../datasets/tedlium_test.py | 30 +++++++++++++++++++ torchaudio/datasets/tedlium.py | 8 ++--- 2 files changed, 34 insertions(+), 4 deletions(-) diff --git a/test/torchaudio_unittest/datasets/tedlium_test.py b/test/torchaudio_unittest/datasets/tedlium_test.py index f2e0f894c8..f916dc179e 100644 --- a/test/torchaudio_unittest/datasets/tedlium_test.py +++ b/test/torchaudio_unittest/datasets/tedlium_test.py @@ -19,6 +19,16 @@ "AaronHuey_2010X 1 AaronHuey_2010X 8.0 10.0 script5\n", ] +PHONEME = [ + "a AH", + "a(2) EY", + "aachen AA K AH N", + "aad AE D", + "aaden EY D AH N", + "aadmi AE D M IY", + "aae EY EY", +] + class TestTedlium(TempDirMixin, TorchaudioTestCase): backend = "default" @@ -60,6 +70,11 @@ def setUpClass(cls): with open(trans_path, "w") as f: f.write("".join(UTTERANCES)) + dict_filename = f"{release}.dic" + dict_path = os.path.join(release_dir, dict_filename) + with open(dict_path, "w") as f: + f.write("\n".join(PHONEME)) + # Create a samples list to compare with cls.samples[release] = [] for utterance in UTTERANCES: @@ -92,6 +107,11 @@ def test_tedlium_release1(self): assert num_samples == len(self.samples[release]) + dataset._dict_path = os.path.join(dataset._path, f"{release}.dic") + phoneme_dict = dataset.phoneme_dict + phoenemes = [f"{key} {' '.join(value)}" for key, value in phoneme_dict.items()] + assert phoenemes == PHONEME + def test_tedlium_release2(self): release = "release2" dataset = tedlium.TEDLIUM(self.root_dir, release=release) @@ -107,6 +127,11 @@ def test_tedlium_release2(self): assert num_samples == len(self.samples[release]) + dataset._dict_path = os.path.join(dataset._path, f"{release}.dic") + phoneme_dict = dataset.phoneme_dict + phoenemes = [f"{key} {' '.join(value)}" for key, value in phoneme_dict.items()] + assert phoenemes == PHONEME + def test_tedlium_release3(self): release = "release3" dataset = tedlium.TEDLIUM(self.root_dir, release=release) @@ -122,3 +147,8 @@ def test_tedlium_release3(self): assert num_samples == len(self.samples[release]) + dataset._dict_path = os.path.join(dataset._path, f"{release}.dic") + phoneme_dict = dataset.phoneme_dict + phoenemes = [f"{key} {' '.join(value)}" for key, value in phoneme_dict.items()] + assert phoenemes == PHONEME + diff --git a/torchaudio/datasets/tedlium.py b/torchaudio/datasets/tedlium.py index 0551b15154..4ae1ddeb0f 100644 --- a/torchaudio/datasets/tedlium.py +++ b/torchaudio/datasets/tedlium.py @@ -164,8 +164,8 @@ def _load_audio(self, path: str, start_time: float, end_time: float, sample_rate Returns: [Tensor, int]: Audio tensor representation and sample rate """ - start_time = int(float(start_time) * 16000) - end_time = int(float(end_time) * 16000) + start_time = int(float(start_time) * sample_rate) + end_time = int(float(end_time) * sample_rate) if torchaudio.get_audio_backend() == "sox_io": return torchaudio.load(path, frame_offset=start_time, num_frames=end_time - start_time) return torchaudio.load(path)[:, start_time:end_time] @@ -201,8 +201,8 @@ def phoneme_dict(self): # Read phoneme dictionary if not self._phoneme_dict: self._phoneme_dict = {} - with open(self.dict_path, "r", encoding="utf-8") as f: + with open(self._dict_path, "r", encoding="utf-8") as f: for line in f.readlines(): - content = line.strip().split(maxsplit=1) + content = line.strip().split() self._phoneme_dict[content[0]] = tuple(content[1:]) # content[1:] can be empty list return self._phoneme_dict.copy() From b38a13c71b76adbeb2cfb106a1520455e62b5d1e Mon Sep 17 00:00:00 2001 From: Jaime Ferrando Date: Tue, 15 Sep 2020 00:29:07 +0200 Subject: [PATCH 16/16] fix style for tedlium test --- test/torchaudio_unittest/datasets/tedlium_test.py | 1 - 1 file changed, 1 deletion(-) diff --git a/test/torchaudio_unittest/datasets/tedlium_test.py b/test/torchaudio_unittest/datasets/tedlium_test.py index f916dc179e..c19984cac2 100644 --- a/test/torchaudio_unittest/datasets/tedlium_test.py +++ b/test/torchaudio_unittest/datasets/tedlium_test.py @@ -151,4 +151,3 @@ def test_tedlium_release3(self): phoneme_dict = dataset.phoneme_dict phoenemes = [f"{key} {' '.join(value)}" for key, value in phoneme_dict.items()] assert phoenemes == PHONEME -