In [1]:
from modules.decoders.ctc import greedy_decoder
from quantizer import Quantizer
import torch
import pickle

import torch
import torchaudio
from transformers import Wav2Vec2CTCTokenizer, Wav2Vec2FeatureExtractor
import re
from collections import defaultdict

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Open the "train" subset of TED-LIUM "release2" through torchaudio, using
# the local "datasets" directory as the dataset root.
dataset = torchaudio.datasets.TEDLIUM(
    root="datasets",
    release="release2",
    subset="train",
)

In [3]:
# Display the first sample to see which fields each dataset item carries.
dataset[0]

(tensor([[-0.0107, -0.0120, -0.0123,  ..., -0.0042, -0.0031, -0.0022]]),
 16000,
 'today because of\n',
 '911Mothers_2010W',
 '911Mothers_2010W',
 '<o,f0,female>')

In [29]:
# Toy fixtures for experimenting with contractions: a mapping from the expanded
# form to the space-separated contracted spelling, plus two sample sentences.
contra_dict = {
    "are not": "aren 't",
    "can not": "can 't",
}
sentence = "you are not master."
sentence2 = "I am you 're it 's we 'll 'd\n"

In [39]:
# Count how often each contraction phrase occurs in the transcripts.
# A "phrase" is the word immediately preceding a run of apostrophe-bearing
# tokens plus the run itself, e.g. "we 'll" or "aren 't".
contractions_count = defaultdict(int)
total_examples = len(dataset)  # loop-invariant, so computed once

# NOTE(review): every item also carries the waveform (see the `dataset[0]`
# output), so this pass is presumably dominated by audio loading - reading the
# transcripts directly would likely be much faster. Confirm before a full run.
for n, example in enumerate(dataset):
    # Show progress every 100 examples.
    if n % 100 == 0:
        print(f"Processed {n / total_examples * 100:.2f}% of the dataset")
        print(f"Found {len(contractions_count)} contractions so far")

    # example[2] is the transcript; split() drops the trailing newline and
    # never yields empty tokens when words are separated by several spaces.
    word_list = example[2].split()

    # Index up to which tokens already belong to a counted phrase, so a run of
    # several apostrophe tokens ("i 'd 've") is counted once, not once per token.
    covered_until = 0
    for i, word in enumerate(word_list):
        if "'" not in word or i < covered_until:
            continue

        # Runs are always entered at their first token, so the word just before
        # it (if any) contains no apostrophe and is where the phrase starts.
        # Clamp to 0: when the sentence itself starts with an apostrophe token
        # there is no preceding word, and a start of -1 would slice from the
        # end of the list and produce an empty/wrong phrase.
        start_idx = max(i - 1, 0)

        # Move forward until a word without an apostrophe appears; that
        # position is the (exclusive) end of the phrase.
        end_idx = i + 1
        while end_idx < len(word_list) and "'" in word_list[end_idx]:
            end_idx += 1
        covered_until = end_idx

        contractions_count[" ".join(word_list[start_idx:end_idx])] += 1


Processed 0.00% of the dataset
Found 0 contractions so far
Processed 0.11% of the dataset
Found 28 contractions so far
Processed 0.22% of the dataset
Found 61 contractions so far
Processed 0.32% of the dataset
Found 87 contractions so far
Processed 0.43% of the dataset
Found 88 contractions so far
Processed 0.54% of the dataset
Found 103 contractions so far
Processed 0.65% of the dataset
Found 115 contractions so far
Processed 0.75% of the dataset
Found 166 contractions so far


KeyboardInterrupt: 

In [23]:
import os
from pathlib import Path
from typing import Tuple, Union

import torchaudio
from torch import Tensor
from torch.hub import download_url_to_file
from torch.utils.data import Dataset
from torchaudio.datasets.utils import extract_archive
from collections import defaultdict

class TEDLIUMRelase2(Dataset):
    """Dataset over the utterances of a single talk stored in TED-LIUM layout.

    The talk's transcript is read from ``<root>/<subset>/stm/<talk_id>.stm`` and
    its audio from ``<root>/<subset>/sph/<talk_id>.sph``. Each line of the
    ``.stm`` file is one sample.
    """

    def __init__(
        self,
        talk_id: str,
        root: str = "datasets/TEDLIUM_release2",
        subset: str = "train",
    ) -> None:
        """Index the utterances of ``talk_id``.

        Args:
            talk_id (str): Name of the talk, i.e. the ``.stm`` file name without extension.
            root (str, optional): Directory holding the subsets and the phoneme dictionary.
            subset (str, optional): Sub-directory of ``root`` to read from.

        Raises:
            ValueError: If ``<talk_id>.stm`` is not present in the subset's ``stm`` directory.
        """
        self._path = os.path.join(root, subset)
        # Create list for all samples
        self._lines = None
        self._talk_id = None
        stm_dir = os.path.join(self._path, "stm")
        stm_name = talk_id + ".stm"

        if stm_name not in os.listdir(stm_dir):
            raise ValueError("talk_id is not valid")

        self._talk_id = talk_id
        with open(os.path.join(stm_dir, stm_name)) as f:
            # One sample per transcript line; only the line indices are kept here,
            # the text itself is read on access.
            self._lines = list(range(sum(1 for _ in f)))

        # Create dict path for later read
        self._dict_path = os.path.join(root, "TEDLIUM.152k.dic")
        self._phoneme_dict = None

    def _load_tedlium_item(self, line: int) -> Tuple[Tensor, int, str, str, str, str]:
        """Loads the sample described by one line of the talk's ``.stm`` file.

        Args:
            line (int): Line identifier for the sample inside the text file

        Returns:
            (Tensor, int, str, str, str, str):
            ``(waveform, sample_rate, transcript, talk_id, speaker_id, identifier)``.
            The transcript is the remainder of the line and is returned as read,
            including any trailing newline.
        """
        transcript_path = os.path.join(self._path, "stm", self._talk_id) + ".stm"
        with open(transcript_path) as f:
            entry = f.readlines()[line]
        # Fields are space separated; the 7th field (the transcript) keeps its inner spaces.
        talk_id, _, speaker_id, start_time, end_time, identifier, transcript = entry.split(" ", 6)

        wave_path = os.path.join(self._path, "sph", self._talk_id) + ".sph"
        waveform, sample_rate = self._load_audio(wave_path, start_time=start_time, end_time=end_time)
        return (waveform, sample_rate, transcript, talk_id, speaker_id, identifier)

    def _load_audio(
        self,
        path: str,
        start_time: Union[str, float],
        end_time: Union[str, float],
        sample_rate: int = 16000,
    ) -> Tuple[Tensor, int]:
        """Default load function used in TEDLIUM dataset, you can overwrite this function to customize functionality
        and load individual sentences from a full ted audio talk file.

        Args:
            path (str): Path to audio file
            start_time (str or float): Time in seconds where the sample sentence starts
            end_time (str or float): Time in seconds where the sample sentence finishes
            sample_rate (int, optional): Sampling rate used to convert seconds to frames

        Returns:
            Tuple[Tensor, int]: Audio tensor representation and sample rate
        """
        # Times arrive as text from the transcript file; convert seconds to frame indices.
        first_frame = int(float(start_time) * sample_rate)
        last_frame = int(float(end_time) * sample_rate)

        return torchaudio.load(path, frame_offset=first_frame, num_frames=last_frame - first_frame)

    def __getitem__(self, n: int) -> Tuple[Tensor, int, str, str, str, str]:
        """Load the n-th sample from the dataset.

        Args:
            n (int): The index of the sample to be loaded

        Returns:
            Tuple of the following items;

            Tensor:
                Waveform
            int:
                Sample rate
            str:
                Transcript
            str:
                Talk ID
            str:
                Speaker ID
            str:
                Identifier
        """
        line = self._lines[n]
        return self._load_tedlium_item(line)

    def __len__(self) -> int:
        """Number of samples, i.e. the number of lines in the talk's transcript file.

        Returns:
            int: TEDLIUM dataset length
        """
        return len(self._lines)

    @property
    def phoneme_dict(self):
        """dict[str, tuple[str]]: Phonemes. Mapping from word to tuple of phonemes.
        Note that some words have empty phonemes.

        The dictionary file is read on first access and cached; a copy is
        returned so callers cannot modify the cache.
        """
        # `is None` (rather than truthiness) so an empty dictionary file is not re-read every time.
        if self._phoneme_dict is None:
            # Build into a local first so a failed read does not leave a partial cache behind.
            phonemes = {}
            with open(self._dict_path, "r", encoding="utf-8") as f:
                for line in f:
                    content = line.strip().split()
                    if not content:
                        # Blank line: there is no word to index.
                        continue
                    phonemes[content[0]] = tuple(content[1:])  # content[1:] can be empty list
            self._phoneme_dict = phonemes
        return self._phoneme_dict.copy()


In [21]:
# Build the dataset for the single talk "911Mothers_2010W", passing a TextNormalizer instance.
# NOTE(review): TEDLIUMRelase2TextNormalized and TextNormalizer are not defined in the
# cells visible here - confirm they are defined above this cell on a fresh kernel run.
dataset = TEDLIUMRelase2TextNormalized(talk_id="911Mothers_2010W", text_normalizer=TextNormalizer())

In [22]:
# Display the last sample of the talk to check the resulting transcript text.
dataset[-1]

(tensor([[0.0105, 0.0092, 0.0077,  ..., 0.0023, 0.0046, 0.0074]]),
 16000,
 "and i hope that someday we 'll all live together in peace and respecting each other this is what i wanted to say",
 '911Mothers_2010W',
 '911Mothers_2010W',
 '<o,f0,female>')