# Prepare Mozilla CommonVoice CS data for DeepSpeech2

## Download data
https://commonvoice.mozilla.org/cs/datasets

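Untar the downloaded archive so that the Czech corpus ends up under `commonvoice/cv-corpus-6.1-2020-12-11/cs` (the `data_path` used below).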

## Install deps
progressbar2
sox


In [3]:
import csv
import os
import subprocess
import unicodedata
from multiprocessing import Pool
import progressbar
import sox
from collections import Counter

# Root directory of the extracted Czech CommonVoice corpus. The driver cell
# uses it as the TSV directory and derives the clips directory from it.
data_path = 'commonvoice/cv-corpus-6.1-2020-12-11/cs'

# Column names (and order) of the CSV files written by _maybe_convert_set.
FIELDNAMES = ["wav_filename", "wav_filesize", "transcript"]
# Target sample rate and channel count handed to the sox transformer.
SAMPLE_RATE = 16000
CHANNELS = 1
# Passed to print_import_report; presumably the maximum clip length in
# seconds -- TODO confirm against the code that applies the limit.
MAX_SECS = 10
# Argument forwarded to init_worker through Pool(initargs=...); None here.
PARAMS = None
# Label filter global; init_worker assigns it inside each worker process.
FILTER_OBJ = None
# Widget layout shared by every progress bar in this notebook.
SIMPLE_BAR = ['Progress ', progressbar.Bar(), ' ', progressbar.Percentage(), ' completed']

In [None]:
def get_counter():
    """Return a new Counter with every import-statistics bucket set to zero."""
    buckets = (
        'all',
        'failed',
        'invalid_label',
        'too_short',
        'too_long',
        'imported_time',
        'total_time',
    )
    # dict.fromkeys keeps the bucket order and gives each one the same start value
    return Counter(dict.fromkeys(buckets, 0))


def init_worker(params):
    """Pool initializer that builds the label filter for the current process.

    The result is stored in the module-level ``FILTER_OBJ``. It is a
    ``LabelFilter`` built from ``params.normalize``, an ``Alphabet`` loaded from
    ``params.filter_alphabet`` (or None when that attribute is falsy) and the
    validator returned by ``get_validate_label(params)``.

    :param params: Object exposing ``normalize`` and ``filter_alphabet``
    """
    global FILTER_OBJ  # pylint: disable=global-statement
    label_validator = get_validate_label(params)
    if params.filter_alphabet:
        allowed_alphabet = Alphabet(params.filter_alphabet)
    else:
        allowed_alphabet = None
    FILTER_OBJ = LabelFilter(params.normalize, allowed_alphabet, label_validator)

def get_validate_label(args):
    """
    Expects an argparse.Namespace argument to search for validate_label_locale parameter.
    If found, this will modify Python's library search path and add the directory of the
    file pointed by the validate_label_locale argument.

    When the parameter is missing or None (or ``args`` itself is None), a warning is
    printed and the default ``validate_label_eng`` is returned. When the parameter
    points to a path that does not exist, an error is printed and None is returned.

    :param args: The importer's CLI argument object
    :type args: argparse.Namespace
    :return: The user-supplied validate_label function
    :type: function
    """
    # Imported locally because the file's import block does not provide them.
    import importlib
    import sys

    # getattr handles a missing attribute, an explicit None and args=None alike.
    locale_path = getattr(args, 'validate_label_locale', None)
    if locale_path is None:
        print('WARNING: No --validate_label_locale specified, your might end with inconsistent dataset.')
        return validate_label_eng
    # Python 3.5 does not support passing a pathlib.Path to os.path.* methods
    validate_label_locale = str(locale_path)
    if not os.path.exists(os.path.abspath(validate_label_locale)):
        print('ERROR: Inexistent --validate_label_locale specified. Please check.')
        return None
    module_dir = os.path.abspath(os.path.dirname(validate_label_locale))
    sys.path.insert(1, module_dir)
    # splitext drops only the trailing extension, unlike str.replace('.py', '')
    fname = os.path.splitext(os.path.basename(validate_label_locale))[0]
    locale_module = importlib.import_module(fname, package=None)
    return locale_module.validate_label


In [9]:
def _maybe_convert_set(dataset, tsv_dir, audio_dir, filter_obj, space_after_every_character=None, rows=None, exclude=None):
    exclude_transcripts = set()
    exclude_speakers = set()
    if exclude is not None:
        for sample in exclude:
            exclude_transcripts.add(sample[2])
            exclude_speakers.add(sample[3])

    if rows is None:
        rows = []
        input_tsv = os.path.join(os.path.abspath(tsv_dir), dataset + ".tsv")
        if not os.path.isfile(input_tsv):
            return rows
        print("Loading TSV file: ", input_tsv)
        # Get audiofile path and transcript for each sentence in tsv
        samples = []
        with open(input_tsv, encoding="utf-8") as input_tsv_file:
            reader = csv.DictReader(input_tsv_file, delimiter="\t")
            idx = 0
            for row in reader:
                samples.append((os.path.join(audio_dir, row["path"]), row["sentence"], row["client_id"]))
                idx += 1
                if idx > 5:
                    break

        counter = get_counter()
        num_samples = len(samples)

        print("Importing mp3 files...")
        pool = Pool(initializer=init_worker, initargs=(PARAMS,))
        bar = progressbar.ProgressBar(max_value=num_samples, widgets=SIMPLE_BAR)
        for i, processed in enumerate(pool.imap_unordered(one_sample, samples), start=1):
            counter += processed[0]
            rows += processed[1]
            bar.update(i)
        bar.update(num_samples)
        pool.close()
        pool.join()

        imported_samples = get_imported_samples(counter)
        assert counter["all"] == num_samples
        assert len(rows) == imported_samples
        print_import_report(counter, SAMPLE_RATE, MAX_SECS)

    output_csv = os.path.join(os.path.abspath(audio_dir), dataset + ".csv")
    print("Saving new DeepSpeech-formatted CSV file to: ", output_csv)
    with open(output_csv, "w", encoding="utf-8", newline="") as output_csv_file:
        print("Writing CSV file for DeepSpeech.py as: ", output_csv)
        writer = csv.DictWriter(output_csv_file, fieldnames=FIELDNAMES)
        writer.writeheader()
        bar = progressbar.ProgressBar(max_value=len(rows), widgets=SIMPLE_BAR)
        for filename, file_size, transcript, speaker in bar(rows):
            if transcript in exclude_transcripts or speaker in exclude_speakers:
                continue
            if space_after_every_character:
                writer.writerow(
                    {
                        "wav_filename": filename,
                        "wav_filesize": file_size,
                        "transcript": " ".join(transcript),
                    }
                )
            else:
                writer.writerow(
                    {
                        "wav_filename": filename,
                        "wav_filesize": file_size,
                        "transcript": transcript,
                    }
                )
    return rows


In [7]:
def _maybe_convert_wav(mp3_filename, wav_filename):
    if not os.path.exists(wav_filename):
        transformer = sox.Transformer()
        transformer.convert(samplerate=SAMPLE_RATE, n_channels=CHANNELS)
        try:
            transformer.build(mp3_filename, wav_filename)
        except sox.core.SoxError:
            pass

In [5]:
# Driver cell: convert every CommonVoice split to a DeepSpeech CSV.
# Rows from "test" and "dev" are collected so that the extra "train-all" set,
# written from the "validated" rows, leaves out their transcripts and speakers.
space_after_every_character = False
tsv_dir = data_path
audio_dir = os.path.join(data_path, "clips")
exclude = []
for dataset in ["test", "dev", "train", "validated", "other"]:
    # The fourth positional parameter is filter_obj, so the spacing flag is
    # passed by keyword; passed positionally it would land in the wrong slot.
    set_samples = _maybe_convert_set(
        dataset, tsv_dir, audio_dir, FILTER_OBJ,
        space_after_every_character=space_after_every_character,
    )
    if dataset in ["test", "dev"]:
        exclude += set_samples
    if dataset == "validated":
        _maybe_convert_set(
            "train-all", tsv_dir, audio_dir, FILTER_OBJ,
            space_after_every_character=space_after_every_character,
            rows=set_samples, exclude=exclude,
        )

