In [26]:
VERSION = "cv-corpus-6.1-2020-12-11"
LANGUAGE = "en"

In [8]:
import librosa
import argparse
import csv
import json
import logging
import multiprocessing
import os
import subprocess
import sys
import tarfile
from multiprocessing.pool import ThreadPool
from pathlib import Path
from typing import List
import yaml

import sox
from sox import Transformer
from tqdm import tqdm
import pandas as pd
import numpy as np

In [5]:
validated = pd.read_csv("small/CV_unpacked/cv-corpus-6.1-2020-12-11/en/validated.tsv",sep="\t")

array([1016262,  315571,  940842, ..., 1176911,  185601,  637011])

In [27]:
ln = validated.shape[0]
train_ix = np.random.choice(ln,1000,replace=False)
idx2 = validated.index.difference(train_ix)
test_ix = np.random.choice(idx2,200,replace=False)
idx3 = validated.index.difference(np.concatenate([test_ix,train_ix]))
val_ix = np.random.choice(idx3,100,replace=False)


In [31]:
validated['path'].loc[train_ix].to_list()

['common_voice_en_18446803.mp3',
 'common_voice_en_118293.mp3',
 'common_voice_en_18680774.mp3',
 'common_voice_en_18148664.mp3',
 'common_voice_en_19653888.mp3',
 'common_voice_en_19801158.mp3',
 'common_voice_en_18733056.mp3',
 'common_voice_en_22757364.mp3',
 'common_voice_en_76526.mp3',
 'common_voice_en_18713984.mp3',
 'common_voice_en_206746.mp3',
 'common_voice_en_17727459.mp3',
 'common_voice_en_17262464.mp3',
 'common_voice_en_17279616.mp3',
 'common_voice_en_665458.mp3',
 'common_voice_en_18006898.mp3',
 'common_voice_en_17577353.mp3',
 'common_voice_en_20219398.mp3',
 'common_voice_en_47424.mp3',
 'common_voice_en_18709346.mp3',
 'common_voice_en_696831.mp3',
 'common_voice_en_21852581.mp3',
 'common_voice_en_674460.mp3',
 'common_voice_en_22276123.mp3',
 'common_voice_en_142335.mp3',
 'common_voice_en_18941920.mp3',
 'common_voice_en_23879244.mp3',
 'common_voice_en_21051586.mp3',
 'common_voice_en_22912101.mp3',
 'common_voice_en_18887380.mp3',
 'common_voice_en_19350600.m

In [33]:


parser = argparse.ArgumentParser(description='Downloads and processes Mozilla Common Voice dataset.')
parser.add_argument("--data_root", default='small/', type=str, help="Directory to store the dataset.")
parser.add_argument('--manifest_dir', default='./', type=str,help='Output directory for manifests')
parser.add_argument("--num_workers", default=multiprocessing.cpu_count(), type=int, help="Workers to process dataset.")
parser.add_argument('--sample_rate', default=16000, type=int, help='Sample rate')
parser.add_argument('--n_channels', default=1, type=int, help='Number of channels for output wav files')
parser.add_argument(
    '--files_to_process',
    nargs='+',
    default=['test.tsv', 'dev.tsv', 'train.tsv'],
    type=str,
    help='list of *.csv file names to process',
    required=False
)
parser.add_argument(
    '--version',
    default='cv-corpus-5.1-2020-06-22',
    type=str,
    required=False,
    help='Version of the dataset (obtainable via https://commonvoice.mozilla.org/en/datasets',
)
parser.add_argument(
    '--language',
    default='en',
    type=str,
    required=False,
    help='Which language to download.(default english,'
    'check https://commonvoice.mozilla.org/en/datasets for more language codes',
)
sys.argv = ['-f']
args = parser.parse_args(['--version',VERSION,'--files_to_process','validated.tsv'])

COMMON_VOICE_URL = (
    f"https://voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com/"
    "{}/{}.tar.gz".format(args.version, args.language)
)



def create_manifest(data: List[tuple], output_name: str, manifest_path: str):
    output_file = Path(manifest_path) / output_name
    output_file.parent.mkdir(exist_ok=True, parents=True)

    with output_file.open(mode='w') as f:
        for wav_path, duration, text in tqdm(data, total=len(data)):
            if wav_path:
              f.write(
                  json.dumps({'audio_filepath': os.path.abspath(wav_path), "duration": duration, 'text': text}) + '\n'
              )


def process_files(csv_file, data_root, num_workers):
    """ Read *.csv file description, convert mp3 to wav, process text.
        Save results to data_root.
    Args:
        csv_file: str, path to *.csv file with data description, usually start from 'cv-'
        data_root: str, path to dir to save results; wav/ dir will be created
    """
    wav_dir = os.path.join(data_root, 'wav/')
    os.makedirs(wav_dir, exist_ok=True)
    audio_clips_path = "/Users/dami.osoba/Downloads/cv-corpus-6.1-2020-12-11/en/clips/"


    def process(x):
        file_path, text = x
        file_name = os.path.splitext(os.path.basename(file_path))[0]
        text = text.lower().strip()
        audio_path = os.path.join(audio_clips_path, file_path)
        output_wav_path = os.path.join(wav_dir, file_name + '.wav')
        tfm = Transformer()
        tfm.rate(samplerate=args.sample_rate)
        tfm.channels(n_channels=args.n_channels)
        # tfm.build(input_filepath=audio_path, output_filepath=output_wav_path)
        os.system(
        f'ffmpeg -i {audio_path} -ar {args.sample_rate} {output_wav_path} -y'
    )
        print(output_wav_path)
        duration = librosa.get_duration(filename=output_wav_path)
        return output_wav_path, duration, text


    logging.info('Converting mp3 to wav for {}.'.format(csv_file))
    with open(csv_file) as csvfile:
        reader = csv.DictReader(csvfile, delimiter='\t')
        next(reader, None)  # skip the headers
#         curated = os.listdir(f'{audio_clips_path}')
        curated = validated['path'].loc[test_ix].to_list()
        data = [(row['path'], row['sentence']) for row in reader if row['path'] in curated] 
        with ThreadPool(num_workers) as pool:
            data = list(tqdm(pool.imap(process, data), total=len(data)))
    return data


def main():
    data_root = args.data_root
    os.makedirs(data_root, exist_ok=True)

    target_unpacked_dir = os.path.join(data_root, "CV_unpacked")

    if os.path.exists(target_unpacked_dir):
        logging.info('Find existing folder {}'.format(target_unpacked_dir))
    else:
        logging.info("Could not find Common Voice, Downloading corpus...")

        commands = [
            'wget',
            '--user-agent',
            '"Mozilla/5.0 (Windows NT 10.0; WOW64) '
            'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36"',
            '-P',
            data_root,
            f'{COMMON_VOICE_URL}',
        ]
        commands = " ".join(commands)
        subprocess.run(commands, shell=True, stderr=sys.stderr, stdout=sys.stdout, capture_output=False)
        filename = f"{args.language}.tar.gz"
        target_file = os.path.join(data_root, os.path.basename(filename))

        os.makedirs(target_unpacked_dir, exist_ok=True)
        logging.info("Unpacking corpus to {} ...".format(target_unpacked_dir))
        tar = tarfile.open(target_file)
        tar.extractall(target_unpacked_dir)
        tar.close()

    folder_path = os.path.join(target_unpacked_dir, args.version + f'/{args.language}/')

    for csv_file in args.files_to_process:
        data = process_files(
            csv_file=os.path.join(folder_path, csv_file),
            data_root=os.path.join(data_root, 'test'),
            num_workers=args.num_workers,
        )
        logging.info('Creating manifests...')
        create_manifest(
            data=data,
#             output_name=f'commonvoice_{os.path.splitext(csv_file)[0]}_manifest.json',
            output_name=f'commonvoice_test_manifest.json',

            manifest_path=args.manifest_dir,
        )


if __name__ == "__main__":
    main()


  2%|▏         | 3/200 [00:00<00:14, 13.34it/s]

small/test/wav/common_voice_en_20275923.wav
small/test/wav/common_voice_en_505749.wav
small/test/wav/common_voice_en_20390368.wav
small/test/wav/common_voice_en_17390894.wav


  4%|▍         | 8/200 [00:00<00:10, 18.09it/s]

small/test/wav/common_voice_en_19698716.wav
small/test/wav/common_voice_en_691734.wav
small/test/wav/common_voice_en_19845624.wav
small/test/wav/common_voice_en_18050947.wav
small/test/wav/common_voice_en_72985.wav


  6%|▋         | 13/200 [00:00<00:09, 19.11it/s]

small/test/wav/common_voice_en_19636176.wav
small/test/wav/common_voice_en_20737678.wav
small/test/wav/common_voice_en_20018611.wav
small/test/wav/common_voice_en_19709438.wav
small/test/wav/common_voice_en_18708720.wav

  8%|▊         | 16/200 [00:00<00:09, 19.48it/s]


small/test/wav/common_voice_en_8512499.wav
small/test/wav/common_voice_en_91516.wav
small/test/wav/common_voice_en_568978.wav
small/test/wav/common_voice_en_120312.wav


 10%|█         | 21/200 [00:01<00:09, 19.49it/s]

small/test/wav/common_voice_en_101661.wav
small/test/wav/common_voice_en_206412.wav
small/test/wav/common_voice_en_22129997.wav
small/test/wav/common_voice_en_22141513.wav


 13%|█▎        | 26/200 [00:01<00:08, 19.93it/s]

small/test/wav/common_voice_en_694977.wav
small/test/wav/common_voice_en_487100.wav
small/test/wav/common_voice_en_42498.wav
small/test/wav/common_voice_en_134088.wav
small/test/wav/common_voice_en_16197.wav


 15%|█▌        | 30/200 [00:01<00:08, 19.79it/s]

small/test/wav/common_voice_en_20245376.wav
small/test/wav/common_voice_en_18441982.wav
small/test/wav/common_voice_en_22762318.wav
small/test/wav/common_voice_en_667428.wav
small/test/wav/common_voice_en_59237.wav


 18%|█▊        | 36/200 [00:01<00:08, 20.18it/s]

small/test/wav/common_voice_en_93655.wav
small/test/wav/common_voice_en_18060595.wav
small/test/wav/common_voice_en_18855265.wav
small/test/wav/common_voice_en_24000046.wav
small/test/wav/common_voice_en_20678472.wav


 20%|█▉        | 39/200 [00:02<00:08, 19.85it/s]

small/test/wav/common_voice_en_22184280.wav
small/test/wav/common_voice_en_18774656.wav
small/test/wav/common_voice_en_1366.wav
small/test/wav/common_voice_en_192227.wav


 22%|██▎       | 45/200 [00:02<00:07, 19.95it/s]

small/test/wav/common_voice_en_18851558.wav
small/test/wav/common_voice_en_66273.wav
small/test/wav/common_voice_en_544863.wav
small/test/wav/common_voice_en_20663259.wav
small/test/wav/common_voice_en_606015.wav


 26%|██▌       | 51/200 [00:02<00:07, 20.12it/s]

small/test/wav/common_voice_en_21928116.wav
small/test/wav/common_voice_en_19350135.wav
small/test/wav/common_voice_en_91568.wav
small/test/wav/common_voice_en_19381294.wav
small/test/wav/common_voice_en_566050.wav


 27%|██▋       | 54/200 [00:02<00:07, 19.87it/s]

small/test/wav/common_voice_en_20062000.wav
small/test/wav/common_voice_en_176696.wav
small/test/wav/common_voice_en_23367445.wav
small/test/wav/common_voice_en_216166.wav
small/test/wav/common_voice_en_66015.wav


 30%|███       | 60/200 [00:03<00:06, 20.00it/s]

small/test/wav/common_voice_en_19746103.wav
small/test/wav/common_voice_en_68794.wav
small/test/wav/common_voice_en_17792515.wav
small/test/wav/common_voice_en_22247842.wav
small/test/wav/common_voice_en_557247.wav


 32%|███▏      | 63/200 [00:03<00:06, 20.09it/s]

small/test/wav/common_voice_en_641222.wav
small/test/wav/common_voice_en_615902.wav
small/test/wav/common_voice_en_1883203.wav
small/test/wav/common_voice_en_125449.wav
small/test/wav/common_voice_en_656015.wav


 34%|███▍      | 69/200 [00:03<00:06, 20.03it/s]

small/test/wav/common_voice_en_85095.wav
small/test/wav/common_voice_en_532467.wav
small/test/wav/common_voice_en_19114951.wav
small/test/wav/common_voice_en_6215464.wav
small/test/wav/common_voice_en_8237483.wav


 38%|███▊      | 75/200 [00:03<00:06, 20.04it/s]

small/test/wav/common_voice_en_213599.wav
small/test/wav/common_voice_en_17476390.wav
small/test/wav/common_voice_en_19180401.wav
small/test/wav/common_voice_en_142706.wav
small/test/wav/common_voice_en_17393682.wav


 40%|████      | 80/200 [00:04<00:06, 19.86it/s]

small/test/wav/common_voice_en_17393699.wav
small/test/wav/common_voice_en_148780.wav
small/test/wav/common_voice_en_678688.wav
small/test/wav/common_voice_en_20007416.wav


 42%|████▏     | 83/200 [00:04<00:05, 19.85it/s]

small/test/wav/common_voice_en_8496332.wav
small/test/wav/common_voice_en_18782120.wav
small/test/wav/common_voice_en_20243279.wav
small/test/wav/common_voice_en_173515.wav
small/test/wav/common_voice_en_27257.wav


 44%|████▍     | 89/200 [00:04<00:05, 20.04it/s]

small/test/wav/common_voice_en_23711368.wav
small/test/wav/common_voice_en_194933.wav
small/test/wav/common_voice_en_22351910.wav
small/test/wav/common_voice_en_19744911.wav


 46%|████▌     | 92/200 [00:04<00:05, 20.06it/s]

small/test/wav/common_voice_en_534393.wav
small/test/wav/common_voice_en_156834.wav
small/test/wav/common_voice_en_125908.wav
small/test/wav/common_voice_en_216256.wav
small/test/wav/common_voice_en_693246.wav


 49%|████▉     | 98/200 [00:04<00:05, 20.00it/s]

small/test/wav/common_voice_en_607076.wav
small/test/wav/common_voice_en_18684754.wav
small/test/wav/common_voice_en_22091042.wav
small/test/wav/common_voice_en_21419602.wav
small/test/wav/common_voice_en_19696746.wav


 52%|█████▏    | 103/200 [00:05<00:04, 20.20it/s]

small/test/wav/common_voice_en_21696933.wav
small/test/wav/common_voice_en_21179504.wav
small/test/wav/common_voice_en_12516773.wav
small/test/wav/common_voice_en_17343913.wav
small/test/wav/common_voice_en_19912006.wav


 55%|█████▍    | 109/200 [00:05<00:04, 20.13it/s]

small/test/wav/common_voice_en_21803601.wav
small/test/wav/common_voice_en_20247540.wav
small/test/wav/common_voice_en_21985450.wav
small/test/wav/common_voice_en_152811.wav
small/test/wav/common_voice_en_484986.wav


 56%|█████▌    | 112/200 [00:05<00:04, 20.01it/s]

small/test/wav/common_voice_en_93348.wav
small/test/wav/common_voice_en_537678.wav
small/test/wav/common_voice_en_24031677.wav
small/test/wav/common_voice_en_21927570.wav
small/test/wav/common_voice_en_18405224.wav


 59%|█████▉    | 118/200 [00:05<00:04, 19.90it/s]

small/test/wav/common_voice_en_23932454.wav
small/test/wav/common_voice_en_19623584.wav
small/test/wav/common_voice_en_649792.wav
small/test/wav/common_voice_en_17276136.wav
small/test/wav/common_voice_en_19958525.wav


 62%|██████▏   | 124/200 [00:06<00:03, 20.07it/s]

small/test/wav/common_voice_en_17505563.wav
small/test/wav/common_voice_en_650184.wav
small/test/wav/common_voice_en_19770378.wav
small/test/wav/common_voice_en_19765914.wav
small/test/wav/common_voice_en_17947906.wav


 64%|██████▎   | 127/200 [00:06<00:04, 18.21it/s]

small/test/wav/common_voice_en_22049663.wav
small/test/wav/common_voice_en_19248967.wav
small/test/wav/common_voice_en_18576865.wav
small/test/wav/common_voice_en_23930267.wav


 66%|██████▌   | 132/200 [00:06<00:03, 19.00it/s]

small/test/wav/common_voice_en_22091908.wav
small/test/wav/common_voice_en_21267489.wav
small/test/wav/common_voice_en_19159439.wav
small/test/wav/common_voice_en_625322.wav
small/test/wav/common_voice_en_17731397.wav


 68%|██████▊   | 137/200 [00:06<00:03, 19.46it/s]

small/test/wav/common_voice_en_17288685.wav
small/test/wav/common_voice_en_53692.wav
small/test/wav/common_voice_en_24048495.wav
small/test/wav/common_voice_en_21918940.wav
small/test/wav/common_voice_en_17391601.wav


 71%|███████   | 142/200 [00:07<00:02, 19.68it/s]

small/test/wav/common_voice_en_426246.wav
small/test/wav/common_voice_en_17390563.wav
small/test/wav/common_voice_en_22707145.wav
small/test/wav/common_voice_en_18975644.wav
small/test/wav/common_voice_en_629553.wav


 73%|███████▎  | 146/200 [00:07<00:03, 17.87it/s]

small/test/wav/common_voice_en_23861545.wav
small/test/wav/common_voice_en_18738210.wav
small/test/wav/common_voice_en_18454853.wav
small/test/wav/common_voice_en_23978.wav


 75%|███████▌  | 150/200 [00:07<00:02, 18.55it/s]

small/test/wav/common_voice_en_121667.wav
small/test/wav/common_voice_en_18882677.wav
small/test/wav/common_voice_en_21338040.wav
small/test/wav/common_voice_en_499045.wav
small/test/wav/common_voice_en_562216.wav


 78%|███████▊  | 156/200 [00:07<00:02, 19.46it/s]

small/test/wav/common_voice_en_579444.wav
small/test/wav/common_voice_en_23935101.wav
small/test/wav/common_voice_en_18767512.wav
small/test/wav/common_voice_en_609222.wav
small/test/wav/common_voice_en_23835329.wav


 80%|███████▉  | 159/200 [00:08<00:02, 19.44it/s]

small/test/wav/common_voice_en_317165.wav
small/test/wav/common_voice_en_17884811.wav
small/test/wav/common_voice_en_18606474.wav
small/test/wav/common_voice_en_520926.wav


 82%|████████▎ | 165/200 [00:08<00:01, 19.92it/s]

small/test/wav/common_voice_en_19326848.wav
small/test/wav/common_voice_en_20522913.wav
small/test/wav/common_voice_en_18870834.wav
small/test/wav/common_voice_en_23650375.wav
small/test/wav/common_voice_en_18780326.wav


 84%|████████▍ | 169/200 [00:08<00:01, 19.66it/s]

small/test/wav/common_voice_en_22468748.wav
small/test/wav/common_voice_en_17922436.wav
small/test/wav/common_voice_en_22241608.wav
small/test/wav/common_voice_en_18662834.wav
small/test/wav/common_voice_en_21817574.wav


 87%|████████▋ | 174/200 [00:08<00:01, 19.94it/s]

small/test/wav/common_voice_en_21905878.wav
small/test/wav/common_voice_en_23881975.wav
small/test/wav/common_voice_en_20101736.wav
small/test/wav/common_voice_en_22323518.wav
small/test/wav/common_voice_en_21896771.wav


 90%|█████████ | 180/200 [00:09<00:00, 20.04it/s]

small/test/wav/common_voice_en_17917326.wav
small/test/wav/common_voice_en_18016424.wav
small/test/wav/common_voice_en_17888066.wav
small/test/wav/common_voice_en_146178.wav
small/test/wav/common_voice_en_20466979.wav


 92%|█████████▏| 183/200 [00:09<00:00, 19.89it/s]

small/test/wav/common_voice_en_17953525.wav
small/test/wav/common_voice_en_17330154.wav
small/test/wav/common_voice_en_20673684.wav
small/test/wav/common_voice_en_18296726.wav
small/test/wav/common_voice_en_17369572.wav


 94%|█████████▍| 189/200 [00:09<00:00, 19.90it/s]

small/test/wav/common_voice_en_20894920.wav
small/test/wav/common_voice_en_18743946.wav
small/test/wav/common_voice_en_22924695.wav
small/test/wav/common_voice_en_18433532.wav
small/test/wav/common_voice_en_20105454.wav


 98%|█████████▊| 195/200 [00:09<00:00, 19.99it/s]

small/test/wav/common_voice_en_20472912.wav
small/test/wav/common_voice_en_21130256.wav
small/test/wav/common_voice_en_21143201.wav
small/test/wav/common_voice_en_21328728.wav


100%|██████████| 200/200 [00:10<00:00, 19.53it/s]

small/test/wav/common_voice_en_21333738.wav
small/test/wav/common_voice_en_21343030.wav
small/test/wav/common_voice_en_21867270.wav
small/test/wav/common_voice_en_22924255.wav
small/test/wav/common_voice_en_23593247.wav



100%|██████████| 200/200 [00:00<00:00, 19470.81it/s]
