## Analysis of ERSR datasets

In [1]:
import re

from datasets import load_dataset, concatenate_datasets
import datasets

import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


### Load all training datasets

Note: for debugging/prototyping, only the first 10 samples of each datasets are currently loaded.

In [2]:
# load all LS (clean, other) as one
librispeech = load_dataset("librispeech_asr", "all", split="train.clean.100[:10]+train.clean.360[:10]+train.other.500[:10]", use_auth_token=True)

common_voice_9_0 = load_dataset("mozilla-foundation/common_voice_9_0", "en", split="train[:10]", use_auth_token=True)

# load vox from xtreme-s split
voxpopuli = load_dataset("google/xtreme_s", "voxpopuli.en", split="train[:10]", use_auth_token=True)

tedlium = load_dataset("LIUM/tedlium", "release3", split="train[:10]", use_auth_token=True)

gigaspeech = load_dataset("speechcolab/gigaspeech", "l", split="train[:10]", use_auth_token=True)

earnings22 = load_dataset("sanchit-gandhi/earnings22_robust_split", split="train[:10]", use_auth_token=True)

# pin revision of kensho to avoid re-processing dataset after new modifications to spgispeech.py
spgispeech = load_dataset("kensho/spgispeech", "L", split="train[:10]", use_auth_token=True, revision="f4d7d3b3f9b66414a09532ec937e285197afeaf6")

switchboard = load_dataset("ldc/switchboard", "switchboard", split="train[:10]", use_auth_token=True)

train_datasets = [librispeech, common_voice_9_0, voxpopuli, tedlium, gigaspeech, earnings22, spgispeech, switchboard]
ds_name = ["librispeech", "common_voice_9_0", "voxpopuli", "tedlium", "gigaspeech", "earnings22", "spgispeech", "switchboard"]

# define text/id column names for each dataset
transcript_column_names = ['text', 'sentence', 'transcription', 'text', 'text', 'sentence', 'transcript', 'text']
id_column_names = ['id', 'client_id', 'id', 'id', 'segment_id', 'source_id', 'wav_filename', 'id']
# whether to lower case each dataset
do_lower_cases = [True, False, True, True, True, False, False, True]


# define our 'error corrections' labels
tedlium_contractions = [" 's", " 't", " 're", " 've", " 'm", " 'll", " 'd", " 'clock", " 'all"]
gigaspeech_punctuation = {" <comma>": ",", " <period>": ".", " <questionmark>": "?", " <exclamationpoint>": "!"}
gigaspeech_disfluencies = ["<other>", "<sil>"]
swb_disfluencies = ["[noise]", "[laughter]", "[silence]", "<a_aside>", "<b_aside>", "<e_aside>", "[laughter-",
                    "[vocalized-noise]", "_1"]
swb_punctuations = ["{", "}", "[", "]-", "]"]
earnings_disfluencies = ["<crosstalk>", "<affirmative>", "<inaudible>", "inaudible", "<laugh>"]
ignore_segments = ["ignore_time_segment_in_scoring", "<noise>", "<music>", "[noise]", "[laughter]", "[silence]",
                   "[vocalized-noise]", "<crosstalk>", "<affirmative>", "<inaudible>", "<laugh>", "<other>",
                   "<sil>", ""]

Reusing dataset librispeech_asr (/home/sanchit_huggingface_co/.cache/huggingface/datasets/librispeech_asr/all/2.1.0/14c8bffddb861b4b3a4fcdff648a56980dbb808f3fc56f5a3d56b18ee88458eb)
Reusing dataset common_voice_9_0 (/home/sanchit_huggingface_co/.cache/huggingface/datasets/mozilla-foundation___common_voice_9_0/en/9.0.0/c8491634a4579fef5745ab949ee9aa4265b7203d7e2ecf44f45879a6419cd40d)
Reusing dataset xtreme_s (/home/sanchit_huggingface_co/.cache/huggingface/datasets/google___xtreme_s/voxpopuli.en/2.0.0/1384f19b49cc1beade2a9bf2ca44abe870cd95f85819a16f6f44671d4fdad7e2)
Reusing dataset tedlium (/home/sanchit_huggingface_co/.cache/huggingface/datasets/LIUM___tedlium/release3/1.0.1/3534cf671f9fe252aa91994765f9fbe95f9a077a67d56255dcd6645776ab997d)
Reusing dataset gigaspeech (/home/sanchit_huggingface_co/.cache/huggingface/datasets/speechcolab___gigaspeech/l/0.0.0/0db31224ad43470c71b459deb2f2b40956b3a4edfde5fb313aaec69ec7b50d3c)
Using custom data configuration sanchit-gandhi--earnings22_robust_

In [3]:
def error_correction(datasets, ds_name, transcript_column_names, id_column_names, do_lower_cases):
    """
    Function to:
    - Rename text column name to a uniform name `text` (useful for subsequent analyses)
    - Rename id column name to a uniform name `id`
    - Filter samples not included for training/eval (those with 'junk' tokens only)
    - ERSR error correction on a dataset-by-dataset basis
    - Filter samples by a zero audio/text length criterion
    - Print num samples and total hours for each dataset
    
    Returns:
    - Error corrected list of datasets
    """
    for i, ds in enumerate(datasets):
        dataset_name = ds_name[i]
        text_column_name = transcript_column_names[i]
        id_column_name = id_column_names[i]
        do_lower_case = do_lower_cases[i]

        if text_column_name != "text":
            ds = ds.rename_column(text_column_name, "text")
        if id_column_name != "id":
            ds = ds.rename_column(id_column_name, "id")

        def is_target_labels(input_str):
            return input_str.lower() not in ignore_segments

        ds = ds.filter(is_target_labels, input_columns=["text"], desc="filtering text...")

        def prepare_dataset(batch):
            """Entirely follows ERSR error correction"""
            # Pre-process audio
            try:
                sample = batch["audio"]
            except ValueError:
                # E22: some samples are empty (no audio). Reading the empty audio array will trigger
                # a soundfile ValueError. For now, we'll manually set these arrays to a zero array.
                # They will be filtered in the subsequent filtering stage and so are
                # explicitly ignored during training.
                sample = {"array": np.array([0.]), "sampling_rate": 16000}

            # time in s
            batch["input_length"] = len(sample["array"]) / sample["sampling_rate"]

            # 'Error correction' of targets
            input_str = batch["text"].lower() if do_lower_case else batch["text"]
            # LibriSpeech ASR
            if "librispeech" in dataset_name:
                pass  # no error correction necessary

            # VoxPopuli
            if "voxpopuli" in dataset_name:
                pass  # no error correction necessary
            
            # Common Voice 9
            if "common_voice_9_0" in dataset_name:
                if input_str.startswith('"') and input_str.endswith('"'):
                    # we can remove trailing quotation marks as they do not affect the transcription
                    input_str = input_str[1:-1]
                # replace double quotation marks with single
                input_str = input_str.replace('""', '"')
            
            # TED-LIUM (Release 3)
            if "tedlium" in dataset_name:
                # delete the <unk> token from the text
                input_str = input_str.replace("<unk>", "")
                # replace spaced apostrophes with un-spaced (it 's -> it's)
                for contraction in tedlium_contractions:
                    input_str = input_str.replace(contraction, contraction[1:])
            
            # GigaSpeech
            if "gigaspeech" in dataset_name:
                for disfluency in gigaspeech_disfluencies:
                    input_str = input_str.replace(disfluency, "")
                # convert spelled out punctuation to symbolic form
                for punctuation, replacement in gigaspeech_punctuation.items():
                    input_str = input_str.replace(punctuation, replacement)
            
            # SWB: hide the path to the private HF dataset
            if "switchboard" in dataset_name:
                for disfluency in swb_disfluencies:
                    input_str = input_str.replace(disfluency, "")
                # remove parenthesised text (test data only)
                input_str = re.sub("[\(].*?[\)]", "", input_str)
                for punctuation in swb_punctuations:
                    input_str = input_str.replace(punctuation, "")
                # replace anomalous words with their correct transcriptions
                split_str = input_str.split("/")
                if len(split_str) > 1:
                    input_str = " ".join(
                        [" ".join([" ".join(i.split(" ")[:-1]) for i in split_str])] + [split_str[-1].split(" ")[-1]])
            
            # Earnings 22
            if "earnings22" in dataset_name:
                for disfluency in earnings_disfluencies:
                    input_str = input_str.replace(disfluency, "")
            
            # SPGISpeech
            if "spgispeech" in dataset_name:
                pass  # no error correction necessary
            
            # JIWER compliance (for WER/CER calc.)
            # remove multiple spaces
            input_str = re.sub(r"\s\s+", " ", input_str)
            # strip trailing spaces
            input_str = input_str.strip()

            batch["text"] = input_str
            batch["text_length"] = len(input_str.split(" "))
            return batch

        ds = ds.map(prepare_dataset, desc=f"pre-processing...", num_proc=1)

        def is_audio_empty(audio_length):
            # remove empty Earnigns22 audio samples (length = 1)
            return audio_length > 1

        ds = ds.filter(is_audio_empty, input_columns=["input_length"], desc="filtering audio...")

        def is_text_empty(words_length):
            return words_length > 0

        ds = ds.filter(is_text_empty, input_columns=["text_length"], desc="filtering text...")

        datasets[i] = ds

        print(100*"=")
        print(dataset_name)
        print("Num samples: ", len(ds))
        print("Total audio length: ", np.sum(ds["input_length"]) / 60 ** 2, "hours")

    return datasets

In [4]:
train_datasets = error_correction(train_datasets, ds_name, transcript_column_names, id_column_names, do_lower_cases)

Loading cached processed dataset at /home/sanchit_huggingface_co/.cache/huggingface/datasets/librispeech_asr/all/2.1.0/14c8bffddb861b4b3a4fcdff648a56980dbb808f3fc56f5a3d56b18ee88458eb/cache-cec0e1532fb7dc2b.arrow
Loading cached processed dataset at /home/sanchit_huggingface_co/.cache/huggingface/datasets/librispeech_asr/all/2.1.0/14c8bffddb861b4b3a4fcdff648a56980dbb808f3fc56f5a3d56b18ee88458eb/cache-0dd3ed07ad374ed7.arrow
Loading cached processed dataset at /home/sanchit_huggingface_co/.cache/huggingface/datasets/librispeech_asr/all/2.1.0/14c8bffddb861b4b3a4fcdff648a56980dbb808f3fc56f5a3d56b18ee88458eb/cache-058ac256820953a0.arrow
Loading cached processed dataset at /home/sanchit_huggingface_co/.cache/huggingface/datasets/librispeech_asr/all/2.1.0/14c8bffddb861b4b3a4fcdff648a56980dbb808f3fc56f5a3d56b18ee88458eb/cache-1b44a2f13f251523.arrow
Loading cached processed dataset at /home/sanchit_huggingface_co/.cache/huggingface/datasets/mozilla-foundation___common_voice_9_0/en/9.0.0/c8491634

librispeech
Num samples:  30
Total audio length:  0.10402222222222222 hours
common_voice_9_0
Num samples:  10
Total audio length:  0.01198 hours
voxpopuli
Num samples:  10
Total audio length:  0.04011559027777778 hours
tedlium
Num samples:  10
Total audio length:  0.018216666666666666 hours
gigaspeech
Num samples:  10
Total audio length:  0.009133333333333334 hours


Loading cached processed dataset at /home/sanchit_huggingface_co/.cache/huggingface/datasets/sanchit-gandhi___parquet/sanchit-gandhi--earnings22_robust_split-0404c6cc081bf5f0/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec/cache-8b1bf630ed50e578.arrow
Loading cached processed dataset at /home/sanchit_huggingface_co/.cache/huggingface/datasets/sanchit-gandhi___parquet/sanchit-gandhi--earnings22_robust_split-0404c6cc081bf5f0/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec/cache-a9ffbc0d72f3ac12.arrow
Loading cached processed dataset at /home/sanchit_huggingface_co/.cache/huggingface/datasets/kensho___spgispeech/L/1.0.0/9c55755e8cc1d73e7c24cd76053daa3737ca6d7b42c04fde14d026bd0dc12de0/cache-b2ba4bceae4b3055.arrow
Loading cached processed dataset at /home/sanchit_huggingface_co/.cache/huggingface/datasets/kensho___spgispeech/L/1.0.0/9c55755e8cc1d73e7c24cd76053daa3737ca6d7b42c04fde14d026bd0dc12de0/cache-a4fb1c2d2a7533c2.arrow
Loading cached proce

earnings22
Num samples:  9
Total audio length:  0.016061163194444446 hours
spgispeech
Num samples:  10
Total audio length:  0.024291666666666663 hours
switchboard
Num samples:  4
Total audio length:  0.005201180555555555 hours


### Load all dev/test sets

We do this on a dev/test split-by-split basis to get the size (hours) of each split, info we require for the ERSR paper.

In [5]:
librispeech_dev_clean = load_dataset("librispeech_asr", "all", split="validation.clean[:10]", use_auth_token=True)
librispeech_dev_other = load_dataset("librispeech_asr", "all", split="validation.other[:10]", use_auth_token=True)
librispeech_test_clean = load_dataset("librispeech_asr", "all", split="test.clean[:10]", use_auth_token=True)
librispeech_test_other = load_dataset("librispeech_asr", "all", split="test.other[:10]", use_auth_token=True)

common_voice_9_0_dev = load_dataset("mozilla-foundation/common_voice_9_0", "en", split="validation[:10]", use_auth_token=True)
common_voice_9_0_test = load_dataset("mozilla-foundation/common_voice_9_0", "en", split="test[:10]", use_auth_token=True)

voxpopuli_dev = load_dataset("google/xtreme_s", "voxpopuli.en", split="validation[:10]", use_auth_token=True)
voxpopuli_test = load_dataset("google/xtreme_s", "voxpopuli.en", split="test[:10]", use_auth_token=True)

tedlium_dev = load_dataset("LIUM/tedlium", "release3", split="validation[:10]", use_auth_token=True)
tedlium_test = load_dataset("LIUM/tedlium", "release3", split="test[:10]", use_auth_token=True)

gigaspeech_dev = load_dataset("speechcolab/gigaspeech", "l", split="validation[:10]", use_auth_token=True)
gigaspeech_test = load_dataset("speechcolab/gigaspeech", "l", split="test[:10]", use_auth_token=True)

earnings22_dev = load_dataset("sanchit-gandhi/earnings22_robust_split", split="validation[:10]", use_auth_token=True)
earnings22_test = load_dataset("sanchit-gandhi/earnings22_robust_split", split="test[:10]", use_auth_token=True)

spgispeech_dev = load_dataset("kensho/spgispeech", "L", split="validation[:10]", use_auth_token=True, revision="f4d7d3b3f9b66414a09532ec937e285197afeaf6")
spgispeech_test = load_dataset("kensho/spgispeech", "L", split="test[:10]", use_auth_token=True, revision="f4d7d3b3f9b66414a09532ec937e285197afeaf6")

switchboard_test = load_dataset("ldc/switchboard", "switchboard", split="test.switchboard[:10]", use_auth_token=True)
callhome_test = load_dataset("ldc/switchboard", "switchboard", split="test.callhome[:10]", use_auth_token=True)

dev_ds = [librispeech_dev_clean, librispeech_dev_other, common_voice_9_0_dev, voxpopuli_dev, tedlium_dev, gigaspeech_dev, earnings22_dev, spgispeech_dev, switchboard_test]
dev_name = ["librispeech_asr/validation.clean", "librispeech_asr/validation.other", "common_voice_9_0/validation", "voxpopuli/validation", "tedlium/validation", "gigaspeech/validation", "earnings22/validation", "spgispeech/validation", "switchboard/test"]

test_ds = [librispeech_test_clean, librispeech_test_other, common_voice_9_0_test, voxpopuli_test, tedlium_test, gigaspeech_test, earnings22_test, spgispeech_test, callhome_test]
test_name = ["librispeech_asr/test.clean", "librispeech_asr/test.other", "common_voice_9_0/test", "voxpopuli/test", "tedlium/test", "gigaspeech/test", "earnings22/test", "spgispeech/test", "switchboard/callhome"]

# we now have two LS dev & test sets (clean/other) -> update transcript/id column names accordingly by repeating the first entry
dev_transcript_column_names = [transcript_column_names[0], *transcript_column_names]
dev_id_column_names = [id_column_names[0], *id_column_names]
dev_do_lower_cases = [do_lower_cases[0], *do_lower_cases]

Reusing dataset librispeech_asr (/home/sanchit_huggingface_co/.cache/huggingface/datasets/librispeech_asr/all/2.1.0/14c8bffddb861b4b3a4fcdff648a56980dbb808f3fc56f5a3d56b18ee88458eb)
Reusing dataset librispeech_asr (/home/sanchit_huggingface_co/.cache/huggingface/datasets/librispeech_asr/all/2.1.0/14c8bffddb861b4b3a4fcdff648a56980dbb808f3fc56f5a3d56b18ee88458eb)
Reusing dataset librispeech_asr (/home/sanchit_huggingface_co/.cache/huggingface/datasets/librispeech_asr/all/2.1.0/14c8bffddb861b4b3a4fcdff648a56980dbb808f3fc56f5a3d56b18ee88458eb)
Reusing dataset librispeech_asr (/home/sanchit_huggingface_co/.cache/huggingface/datasets/librispeech_asr/all/2.1.0/14c8bffddb861b4b3a4fcdff648a56980dbb808f3fc56f5a3d56b18ee88458eb)
Reusing dataset common_voice_9_0 (/home/sanchit_huggingface_co/.cache/huggingface/datasets/mozilla-foundation___common_voice_9_0/en/9.0.0/c8491634a4579fef5745ab949ee9aa4265b7203d7e2ecf44f45879a6419cd40d)
Reusing dataset common_voice_9_0 (/home/sanchit_huggingface_co/.cach

In [6]:
# error correction of dev/test sets
dev_ds = error_correction(dev_ds, dev_name, dev_transcript_column_names, dev_id_column_names, dev_do_lower_cases)
test_ds = error_correction(test_ds, test_name, dev_transcript_column_names, dev_id_column_names, dev_do_lower_cases)

Loading cached processed dataset at /home/sanchit_huggingface_co/.cache/huggingface/datasets/librispeech_asr/all/2.1.0/14c8bffddb861b4b3a4fcdff648a56980dbb808f3fc56f5a3d56b18ee88458eb/cache-a0c14a2469e17a91.arrow
Loading cached processed dataset at /home/sanchit_huggingface_co/.cache/huggingface/datasets/librispeech_asr/all/2.1.0/14c8bffddb861b4b3a4fcdff648a56980dbb808f3fc56f5a3d56b18ee88458eb/cache-b5044f8164ec061b.arrow
Loading cached processed dataset at /home/sanchit_huggingface_co/.cache/huggingface/datasets/librispeech_asr/all/2.1.0/14c8bffddb861b4b3a4fcdff648a56980dbb808f3fc56f5a3d56b18ee88458eb/cache-a2f6ad785a823753.arrow
Loading cached processed dataset at /home/sanchit_huggingface_co/.cache/huggingface/datasets/librispeech_asr/all/2.1.0/14c8bffddb861b4b3a4fcdff648a56980dbb808f3fc56f5a3d56b18ee88458eb/cache-187e4e41d70d6342.arrow
Loading cached processed dataset at /home/sanchit_huggingface_co/.cache/huggingface/datasets/librispeech_asr/all/2.1.0/14c8bffddb861b4b3a4fcdff648a5

librispeech_asr/validation.clean
Num samples:  10
Total audio length:  0.0144625 hours
librispeech_asr/validation.other
Num samples:  10
Total audio length:  0.013333333333333336 hours
common_voice_9_0/validation
Num samples:  10
Total audio length:  0.012313333333333334 hours
voxpopuli/validation
Num samples:  10
Total audio length:  0.023910972222222222 hours
tedlium/validation
Num samples:  7
Total audio length:  0.022930555555555558 hours
gigaspeech/validation
Num samples:  8


Loading cached processed dataset at /home/sanchit_huggingface_co/.cache/huggingface/datasets/sanchit-gandhi___parquet/sanchit-gandhi--earnings22_robust_split-0404c6cc081bf5f0/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec/cache-8cbe3291d5aaabd2.arrow
Loading cached processed dataset at /home/sanchit_huggingface_co/.cache/huggingface/datasets/sanchit-gandhi___parquet/sanchit-gandhi--earnings22_robust_split-0404c6cc081bf5f0/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec/cache-0f2e3014e0ad72b0.arrow
Loading cached processed dataset at /home/sanchit_huggingface_co/.cache/huggingface/datasets/sanchit-gandhi___parquet/sanchit-gandhi--earnings22_robust_split-0404c6cc081bf5f0/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec/cache-cb7bb1437e99ac41.arrow
Loading cached processed dataset at /home/sanchit_huggingface_co/.cache/huggingface/datasets/sanchit-gandhi___parquet/sanchit-gandhi--earnings22_robust_split-0404c6cc081bf5f0/

Total audio length:  0.01107138888888889 hours
earnings22/validation
Num samples:  9
Total audio length:  0.019405711805555555 hours


filtering text...: 100%|██████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 75.76ba/s]
pre-processing...: 100%|████████████████████████████████████████████████████████████████| 10/10 [00:12<00:00,  1.27s/ex]
filtering audio...: 100%|████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 452.36ba/s]
filtering text...: 100%|█████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 409.28ba/s]


spgispeech/validation
Num samples:  10
Total audio length:  0.02225833333333333 hours


filtering text...: 100%|█████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 281.42ba/s]
pre-processing...: 100%|███████████████████████████████████████████████████████████████| 10/10 [00:00<00:00, 726.53ex/s]
filtering audio...: 100%|████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 408.80ba/s]
filtering text...: 100%|█████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 548.63ba/s]


switchboard/test
Num samples:  9
Total audio length:  0.010027777777777776 hours


filtering text...: 100%|█████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 517.05ba/s]
pre-processing...: 100%|███████████████████████████████████████████████████████████████| 10/10 [00:00<00:00, 237.31ex/s]
filtering audio...: 100%|████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 463.15ba/s]
filtering text...: 100%|█████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 390.57ba/s]


librispeech_asr/test.clean
Num samples:  10
Total audio length:  0.025423611111111112 hours


filtering text...: 100%|█████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 423.75ba/s]
pre-processing...: 100%|███████████████████████████████████████████████████████████████| 10/10 [00:00<00:00, 337.09ex/s]
filtering audio...: 100%|████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 466.76ba/s]
filtering text...: 100%|█████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 429.30ba/s]


librispeech_asr/test.other
Num samples:  10
Total audio length:  0.014059722222222223 hours


filtering text...: 100%|█████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 279.17ba/s]
pre-processing...: 100%|████████████████████████████████████████████████████████████████| 10/10 [00:00<00:00, 77.28ex/s]
filtering audio...: 100%|████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 490.56ba/s]
filtering text...: 100%|█████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 395.95ba/s]


common_voice_9_0/test
Num samples:  10
Total audio length:  0.015833333333333335 hours


filtering text...: 100%|█████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 279.19ba/s]
pre-processing...: 100%|████████████████████████████████████████████████████████████████| 10/10 [00:01<00:00,  9.30ex/s]
filtering audio...: 100%|████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 512.75ba/s]
filtering text...: 100%|█████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 416.31ba/s]


voxpopuli/test
Num samples:  10
Total audio length:  0.031192447916666664 hours


filtering text...: 100%|█████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 283.25ba/s]
pre-processing...: 100%|█████████████████████████████████████████████████████████████████| 6/6 [00:00<00:00, 257.11ex/s]
filtering audio...: 100%|████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 468.74ba/s]
filtering text...: 100%|█████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 404.43ba/s]


tedlium/test
Num samples:  6
Total audio length:  0.015338402777777779 hours


filtering text...: 100%|█████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 442.62ba/s]
pre-processing...: 100%|██████████████████████████████████████████████████████████████████| 8/8 [00:00<00:00,  9.64ex/s]
filtering audio...: 100%|████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 545.85ba/s]
filtering text...: 100%|█████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 290.93ba/s]


gigaspeech/test
Num samples:  8
Total audio length:  0.02004027777777778 hours


filtering text...: 100%|█████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 286.99ba/s]
  array = librosa.resample(array, sampling_rate, self.sampling_rate, res_type="kaiser_best")
  array = librosa.resample(array, sampling_rate, self.sampling_rate, res_type="kaiser_best")
  array = librosa.resample(array, sampling_rate, self.sampling_rate, res_type="kaiser_best")
  array = librosa.resample(array, sampling_rate, self.sampling_rate, res_type="kaiser_best")
pre-processing...: 100%|████████████████████████████████████████████████████████████████| 10/10 [00:00<00:00, 42.36ex/s]
filtering audio...: 100%|████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 606.90ba/s]
filtering text...: 100%|█████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 385.29ba/s]


earnings22/test
Num samples:  8
Total audio length:  0.016889114583333333 hours


filtering text...: 100%|█████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 508.22ba/s]
pre-processing...: 100%|████████████████████████████████████████████████████████████████| 10/10 [00:01<00:00,  9.68ex/s]
filtering audio...: 100%|████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 478.15ba/s]
filtering text...: 100%|█████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 413.11ba/s]


spgispeech/test
Num samples:  10
Total audio length:  0.021283333333333335 hours


filtering text...: 100%|█████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 239.92ba/s]
pre-processing...: 100%|███████████████████████████████████████████████████████████████| 10/10 [00:00<00:00, 901.79ex/s]
filtering audio...: 100%|████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 539.04ba/s]
filtering text...: 100%|█████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 452.70ba/s]

switchboard/callhome
Num samples:  8
Total audio length:  0.006383333333333334 hours





### Combine datasets for accumulated statistics (train-dev-test)

In [7]:
# LS has two dev/test sets (clean/other) -> treat separately
librispeech_all = concatenate_datasets([train_datasets[0], dev_ds[0], dev_ds[1], test_ds[0], test_ds[1]])
# rule based approach for combining remaineder of datasets: combine train with dev and test
all_datasets = [concatenate_datasets([train_datasets[i-1], dev_ds[i], test_ds[i]]) for i in range(2, len(dev_name))]
# append LS
all_datasets = [librispeech_all, * all_datasets]

In [8]:
for i in range(len(all_datasets)):
    print(ds_name[i], all_datasets[i]) 

librispeech Dataset({
    features: ['file', 'audio', 'text', 'speaker_id', 'chapter_id', 'id', 'input_length', 'text_length'],
    num_rows: 70
})
common_voice_9_0 Dataset({
    features: ['id', 'path', 'audio', 'text', 'up_votes', 'down_votes', 'age', 'gender', 'accent', 'locale', 'segment', 'input_length', 'text_length'],
    num_rows: 30
})
voxpopuli Dataset({
    features: ['id', 'path', 'audio', 'text', 'lang_id', 'input_length', 'text_length'],
    num_rows: 30
})
tedlium Dataset({
    features: ['audio', 'text', 'speaker_id', 'gender', 'file', 'id', 'input_length', 'text_length'],
    num_rows: 23
})
gigaspeech Dataset({
    features: ['id', 'speaker', 'text', 'audio', 'begin_time', 'end_time', 'audio_id', 'title', 'url', 'source', 'category', 'original_full_path', 'input_length', 'text_length'],
    num_rows: 26
})
earnings22 Dataset({
    features: ['id', 'audio', 'segment_id', 'text', 'start_ts', 'end_ts', 'input_length', 'text_length'],
    num_rows: 26
})
spgispeech Datase

Great! Now that we've combined our train/dev/test splits, we can run a combined analysis over all splits. The column names have also been normalised `(audio, input_length, text, text_length)` for our convinience.

In [10]:
for i in range(len(all_datasets)):
    print(ds_name[i])
    ds = all_datasets[i]
    print(f"Mean sample duration: {np.mean(ds['input_length'])} +- {np.std(ds['input_length'])} s")
    print(f"Mean transcript length: {np.mean(ds['text_length'])} +- {np.std(ds['text_length'])} words")

librispeech
Mean sample duration: 8.809785714285715 +- 5.019140111023166 s
Mean transcript length: 24.97142857142857 +- 13.094133286728535 words
common_voice_9_0
Mean sample duration: 4.8152 +- 2.021359384176896 s
Mean transcript length: 8.566666666666666 +- 4.047083998249714 words
voxpopuli
Mean sample duration: 11.426281249999999 +- 5.961289670369112 s
Mean transcript length: 29.333333333333332 +- 12.242911781471307 words
tedlium
Mean sample duration: 8.841228260869565 +- 3.5927036589522867 s
Mean transcript length: 22.782608695652176 +- 13.484150185984479 words
gigaspeech
Mean sample duration: 5.572384615384616 +- 3.4397609384388135 s
Mean transcript length: 17.423076923076923 +- 13.255399703584013 words
earnings22
Mean sample duration: 7.249290865384617 +- 5.429852291244234 s
Mean transcript length: 17.307692307692307 +- 14.488109795407645 words
spgispeech
Mean sample duration: 8.14 +- 2.532520483628909 s
Mean transcript length: 20.1 +- 8.162311764036788 words
switchboard
Mean samp