# FAE Common Voice 2022

#### Foreign-Accented English from the Common Voice crowdsourced corpus.

This project is based on Mozilla's Common-Voice version `cv-corpus-10.0-2022-07-04`.

The target criteria are:

- Use `validated.tsv` recordings only.
- ~20sec per speaker.
- 100-500 speakers per class (accent).
- Gender balance.
- Splits without any speaker overlap.


Output:
|      File        | Content                                           |
|:----------------:|:--------------------------------------------------|
|  `config.yaml`   | Parmeters needed to duplicate the assembly.       |
|   `train.tsv`    | Split for model training (fine-tuning).           |
|     `dev.tsv`    | Split for validation, all in-set labels.          |
|    `test.tsv`    | Split for test, all in-set labels                 |
|    `eval.tsv`    | plit from left over data, some out-of-set labels. |

---
---


In [1]:
import os, sys
import tarfile
from omegaconf import OmegaConf

import subprocess
from datetime import datetime
import time
import pandas as pd
import numpy as np
import random
import csv
from tqdm import tqdm
import sox
from pandarallel import pandarallel

MAX_NB_CPU_WORKERS = min(24, int(os.cpu_count() / 4))
pandarallel.initialize(
    nb_workers=MAX_NB_CPU_WORKERS, use_memory_fs=False, progress_bar=True
)
tqdm.pandas()


INFO: Pandarallel will run on 24 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.


---

#### Constants


In [2]:
cfg = OmegaConf.create()

cfg["RANDOM_STATE"] = 42
# INPUT DATA
cfg["data"] = dict(
    CORPUS_NAME="cv-corpus-10.0-2022-07-04", 
    LANGUAGE="en", 
    DIR_CORPORA="/corpora", 
    DL_HOST='voice-prod-bundler-ee1969a6ce8178826482b88e843c335139bd3fb4.s3.amazonaws.com',
)

# TARGET STATS
cfg["tgt_stats"] = dict(
    MIN_NR_SPK=65, SAMPLES_PER_SPKR=4, MAX_NUM_SPKRS=500, DUR_CUT_OFF_Q=0.995
)
# Q=99.5% discards 1% of recordings: 0.5% shortest and 0.5% longest).
cfg.data[
    "COMMON_VOICE_URL"
] = f"https://{cfg.data.DL_HOST}/{cfg.data.CORPUS_NAME}/{cfg.data.CORPUS_NAME}-{cfg.data.LANGUAGE}.tar.gz"
cfg.data[
    "PATH_VALIDATED_TSV"
] = f"{cfg.data.DIR_CORPORA}/{cfg.data.CORPUS_NAME}/en/validated.tsv"
cfg.data[
    "PATH_MAPPING_TSV"
] = f"../data/mappings-accents_{cfg.data.CORPUS_NAME}_{cfg.data.LANGUAGE}_v2209.tsv"
cfg.tgt_stats["HALF_MAX_NUM_SPKRS"] = int(cfg.tgt_stats.MAX_NUM_SPKRS / 2)

In [3]:
# Inits and utils.
random.seed(cfg.RANDOM_STATE)
def getDuration(
    path, base_dir=f"{cfg.data.DIR_CORPORA}/{cfg.data.CORPUS_NAME}/en/clips/"
):
    return sox.file_info.duration(f"{base_dir}/{path}")

---

#### Download Common-Voice if `validated.tsv` file is not found.


In [4]:
if os.path.exists(cfg.data.PATH_VALIDATED_TSV):
    print(f"Found existing validated.tsv file: {cfg.data.PATH_VALIDATED_TSV}")
else:
    print(f"Could not find Common Voice, downloading corpus...")
    output_archive_filename = "../data/cv-en.tar.gz"
    commands = [
        "wget",
        "--user-agent",
        '"Mozilla/5.0 (Windows NT 10.0; WOW64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36"',
        "-O",
        output_archive_filename,
        f"{cfg.data.COMMON_VOICE_URL}",
    ]
    commands = " ".join(commands)
    subprocess.run(
        commands, shell=True, stderr=sys.stderr, stdout=sys.stdout, capture_output=False
    )
    tar = tarfile.open(output_archive_filename)
    try:
        tar.extractall(cfg.data.DIR_CORPORA)
        print(f"extracted to: {cfg.data.DIR_CORPORA}")
    except:
        print(f"unable to extract to: {cfg.data.DIR_CORPORA}, trying ../data instead.")
        DIR_CORPORA = "../data"
        PATH_VALIDATED_TSV = f"{DIR_CORPORA}/{cfg.data.CORPUS_NAME}/en/validated.tsv"
        tar.extractall(DIR_CORPORA)
        print(f"extracted to: {DIR_CORPORA}")
    tar.close()
    os.remove(output_archive_filename)

Found existing validated.tsv file: /corpora/cv-corpus-10.0-2022-07-04/en/validated.tsv


---

#### Load our hand-made mapping table for the accents column, and then load the original common-voice TSV file with "validated" entries.


In [5]:
acc2labMapperDF = pd.read_csv(
    cfg.data.PATH_MAPPING_TSV, sep="\t", names=["label", "accents"], header=None
)

In [6]:
df = pd.read_csv(
    cfg.data.PATH_VALIDATED_TSV,
    sep="\t",
    parse_dates=False,
    engine="python",
    encoding="utf-8",
    on_bad_lines="warn",
    quotechar='"',
    quoting=csv.QUOTE_NONE,
)
print(f"loaded orig_corpus.shape:{df.shape}")


loaded orig_corpus.shape:(1589008, 10)


In [7]:
# Add column to indicate if the speaker is a confirmed female or not.
isFemaleL = df[df.gender == "female"].client_id.unique().tolist()
df["isfemale"] = False
df.loc[df.client_id.isin(isFemaleL), "isfemale"] = True
# quick sanity check.
assert df[df.isfemale].client_id.nunique() == len(isFemaleL)
print(f"found {len(isFemaleL)} unique confirmed-female speakers")

found 4047 unique confirmed-female speakers


---
---

#### Start the data clean up by removing unusable lines.


In [8]:
# Start trimming out lines.
#
df2 = df[df.accents.notnull()]
print(f" - removed lines without `accents` value. \t new shape: {df2.shape}")
#
df3 = pd.merge(left=df2, right=acc2labMapperDF, on="accents", how="left")
print(f" - mapped `accents` values to a `label`. \t new shape: {df3.shape}")
#
df3 = df3[df3.label.notnull()]
print(f" - removed lines without `label` mapping. \t new shape: {df3.shape}")
#
df3 = df3[df3.label != "-"]
print(f" - removed lines with `label=-` value. \t new shape: {df3.shape}")
#
# keep some speakers without enough recordings as residualDF (but at least 2).
residualDF = df3[
    (df3.groupby("client_id").client_id.transform("size")
    < cfg.tgt_stats.SAMPLES_PER_SPKR) & 
    (df3.groupby("client_id").client_id.transform("size")
    >= 2)
]
#
df3 = df3[
    df3.groupby("client_id").client_id.transform("size")
    >= cfg.tgt_stats.SAMPLES_PER_SPKR
]
print(
    f" - removed speakers with less than {cfg.tgt_stats.SAMPLES_PER_SPKR}. \t new shape: {df3.shape}"
)
print(f" ->extracted the residual data with shape: {residualDF.shape}")

 - removed lines without `accents` value. 	 new shape: (841963, 11)
 - mapped `accents` values to a `label`. 	 new shape: (841963, 12)
 - removed lines without `label` mapping. 	 new shape: (833304, 12)
 - removed lines with `label=-` value. 	 new shape: (833194, 12)
 - removed speakers with less than 4. 	 new shape: (828901, 12)
 ->extracted the residual data with shape: (3708, 12)


---

#### Opportunistically, use the `residualDF` to assemble the `Eval` Set.

- Speakers in the `residualDF` above would have been ignored anyway because they do not have enough recordings.
- Discard speakers with only 1 recording.
- Sample 2 recordings/speaker.


In [9]:
# Add duration info.
residualDF['duration'] = residualDF.path.parallel_map(getDuration).values

# trim length
Q = cfg.tgt_stats.DUR_CUT_OFF_Q
dur_cut_off_upper = residualDF.duration.quantile(q=Q)
dur_cut_off_lower = residualDF.duration.quantile(q=1-Q)

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=155), Label(value='0 / 155'))), HB…

In [10]:
print(f'keeping only recordings with duration between {dur_cut_off_upper} - {dur_cut_off_lower}')
print(f" - residual original shape: {residualDF.shape}")
residualDF = residualDF[ (residualDF.duration >= dur_cut_off_lower) &  (residualDF.duration <= dur_cut_off_upper) ]
print(f" - residual new shape: {residualDF.shape}")
#
# Drop speakers without at least 2 recordings.
residualDF = residualDF[
    residualDF.groupby("client_id").client_id.transform("size") >= 2
]
print(f" - residual data with shape: {residualDF.shape}")

keeping only recordings with duration between 10.763160000000005 - 1.868100000000001
 - residual original shape: (3708, 13)
 - residual new shape: (3670, 13)
 - residual data with shape: (3660, 13)


This table below is what we are working with:


In [11]:
# Selection criteria:
# - Two recording per speaker.
# - As many speakers/accent-label as available while keeping female/notFemale balanced.
#  
print(residualDF.groupby(['label','isfemale']).client_id.nunique())

label        isfemale
africa       True          1
australia    False        27
             True          6
bermuda      False         5
             True          2
canada       False        54
             True          7
england      False       166
             True         21
germany      False         2
hispanic     False         1
             True          1
hongkong     False        12
             True          6
india        False       282
             True         26
ireland      False        18
             True          1
malaysia     False        10
             True          3
newzealand   False        14
             True          3
philippines  False         9
             True          4
scotland     False        14
             True          1
singapore    False         8
sweden       False         1
us           False       595
             True        112
wales        False         4
             True          1
Name: client_id, dtype: int64


In [12]:
# Limit the samples/speaker to only 2.
residualDF = pd.DataFrame(
    residualDF.groupby("client_id").sample(
        n=2,
        replace=False,
        random_state=cfg.RANDOM_STATE,
    )
)

In [13]:
EvalDF = None
for acc in residualDF.label.unique():
    #
    # Subsample speakers to reach gender balance.
    _femDF = residualDF[(residualDF.label == acc) & residualDF.isfemale]
    _notFemDF = residualDF[(residualDF.label == acc) & ~residualDF.isfemale]
    #
    nr_spkrs = min(_femDF.client_id.nunique(), _notFemDF.client_id.nunique())
   
    _femDF = _femDF.sample(
                n=nr_spkrs,
                replace=False,
                random_state=cfg.RANDOM_STATE,
            )
    _notFemDF = _notFemDF.sample(
                n=nr_spkrs,
                replace=False,
                random_state=cfg.RANDOM_STATE,
            )
    if EvalDF is None:
        EvalDF = pd.concat([_femDF, _notFemDF])
    else:
        EvalDF = pd.concat([EvalDF, _femDF, _notFemDF])

This table below is what we ende up with:


In [14]:
EvalDF.groupby(['label','isfemale']).path.count()

label        isfemale
australia    False         6
             True          6
bermuda      False         2
             True          2
canada       False         7
             True          7
england      False        21
             True         21
hispanic     False         1
             True          1
hongkong     False         6
             True          6
india        False        26
             True         26
ireland      False         1
             True          1
malaysia     False         3
             True          3
newzealand   False         3
             True          3
philippines  False         4
             True          4
scotland     False         1
             True          1
us           False       112
             True        112
wales        False         1
             True          1
Name: path, dtype: int64

---
---

#### Find the set of eligible accents with sufficient speaker diversity.


In [15]:
df3.groupby("label").client_id.nunique()

label
africa            2
australia       595
bermuda          34
canada          795
england        1995
france            1
germany           8
hispanic          1
holland           3
hongkong         99
india          1553
ireland         160
israel            1
italy             1
malaysia         79
newzealand      135
norway            1
philippines     109
poland            4
russian           1
scotland        141
singapore        61
thailand          2
us             6390
wales            59
Name: client_id, dtype: int64

In [16]:
# Eligible accents: at least MIN_NR_SPK.
speakersDF = df3.groupby("label").client_id.nunique()
speakersDF = speakersDF[speakersDF > cfg.tgt_stats.MIN_NR_SPK]
usableAccents = speakersDF.keys().tolist()
# nr of speakers per accent label:
print(f"found {len(speakersDF)} eligible accents")
speakersDF

found 11 eligible accents


label
australia       595
canada          795
england        1995
hongkong         99
india          1553
ireland         160
malaysia         79
newzealand      135
philippines     109
scotland        141
us             6390
Name: client_id, dtype: int64

In [17]:
df4 = df3[df3.label.isin(usableAccents)]
print(f" - removed lines with unusable accents. \t new shape: {df4.shape}")


 - removed lines with unusable accents. 	 new shape: (781674, 12)


---

#### Find the time-duration for each audio file and remove outliers (too short/long).


In [18]:
# NOTE: This may take a couple of minutes (maybe 8mins).
# Add duration information column.
try:
    durs = np.fromfile(open("../data/durs.csv"))
    print("loaded durs from ../data/durs.csv")
except:
    durs = df4.path.parallel_map(getDuration).values
    durs.tofile(open("../data/durs.csv", "w"))

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=32570), Label(value='0 / 32570')))…

In [19]:
df5 = df4.assign(duration=durs)

In [20]:
Q = cfg.tgt_stats.DUR_CUT_OFF_Q
dur_cut_off_upper = df5.duration.quantile(q=Q)
dur_cut_off_lower = df5.duration.quantile(q=1-Q)
print(
    f" // keeping {100*(1-2*(1-cfg.tgt_stats.DUR_CUT_OFF_Q))}% of recordings.\n // dropped recordings <{dur_cut_off_lower}s and >{dur_cut_off_upper}s."
)
df5 = df5[(df5.duration <= dur_cut_off_upper) & (df5.duration >= dur_cut_off_lower)]
print(f" - removed lines with too long/short duration. \t new shape: {df5.shape}")

 // keeping 99.0% of recordings.
 // dropped recordings <1.68s and >9.756s.
 - removed lines with too long/short duration. 	 new shape: (773907, 13)


In [21]:
# One more time discard speakers with less than SAMPLES_PER_SPKR because some may have lost recordings.
df5 = df5[
    df5.groupby("client_id").client_id.transform("size")
    >= cfg.tgt_stats.SAMPLES_PER_SPKR
]
print(
    f" - removed speakers with less than {cfg.tgt_stats.SAMPLES_PER_SPKR}. \t new shape: {df5.shape}"
)

 - removed speakers with less than 4. 	 new shape: (773711, 13)


---

#### Add column indicating if speaker is a confirmed female to guide the following gender balance.


In [22]:
isFemaleL = df5[df5.gender == "female"].client_id.unique().tolist()
df5["isfemale"] = False
df5.loc[df5.client_id.isin(isFemaleL), "isfemale"] = True
print(f"found {len(isFemaleL)} unique confirmed-female speakers")

found 2391 unique confirmed-female speakers


In [23]:
# quick sanity check.
assert df5[df5.isfemale].client_id.nunique() == len(isFemaleL)

In [24]:
# Observe the current gender-balance per accent label.
df5.groupby(by=["label", "isfemale"])[["client_id", "accents"]].nunique()


Unnamed: 0_level_0,Unnamed: 1_level_0,client_id,accents
label,isfemale,Unnamed: 2_level_1,Unnamed: 3_level_1
australia,False,476,5
australia,True,115,1
canada,False,594,2
canada,True,198,3
england,False,1657,11
england,True,328,3
hongkong,False,71,2
hongkong,True,28,1
india,False,1355,7
india,True,182,2


---
---

### Balance by subsampling


In [25]:
outDF = pd.DataFrame(columns=[df5.columns])
for acc in usableAccents:
    #
    accDF = df5[df5.label == acc]
    speakersL = accDF.client_id.unique().tolist()
    if len(speakersL) > cfg.tgt_stats.MAX_NUM_SPKRS:
        # Too many speakers, let's undersample.
        accFemDF = accDF[accDF.isfemale]
        accNotFemDF = accDF[~accDF.isfemale]
        #
        speakersFemL = accFemDF.client_id.unique().tolist()
        speakersNotFemL = accNotFemDF.client_id.unique().tolist()
        #
        assert len(speakersL) == (len(speakersFemL) + len(speakersNotFemL))
        #
        numFemSpk = len(speakersFemL)
        numNotFemSpk = len(speakersNotFemL)
        #
        # Can afford to subsample notFem speakers?
        if numNotFemSpk > cfg.tgt_stats.HALF_MAX_NUM_SPKRS:
            # --> Subsample notFem (update lists with subsampled versions).
            speakersNotFemL = accNotFemDF.sample(
                n=max(
                    cfg.tgt_stats.HALF_MAX_NUM_SPKRS,
                    cfg.tgt_stats.MAX_NUM_SPKRS - numFemSpk,
                ),
                replace=False,
                random_state=cfg.RANDOM_STATE,
            ).client_id.tolist()
        #
        # Can afford to subsample fem speakers?
        if numFemSpk > cfg.tgt_stats.HALF_MAX_NUM_SPKRS:
            # --> Subsample Fem (update lists with subsampled versions).
            speakersFemL = accFemDF.sample(
                n=max(
                    cfg.tgt_stats.HALF_MAX_NUM_SPKRS,
                    cfg.tgt_stats.MAX_NUM_SPKRS - numNotFemSpk,
                ),
                replace=False,
                random_state=cfg.RANDOM_STATE,
            ).client_id.tolist()
        #
        speakersL = speakersFemL + speakersNotFemL
        assert len(speakersL) <= cfg.tgt_stats.MAX_NUM_SPKRS
        #
    # speaker-trimmed:
    accDF2 = accDF[accDF.client_id.isin(speakersL)]
    #
    # limit the amount of samples per speaker.
    accDF3 = pd.DataFrame(
        accDF2.groupby("client_id").sample(
            n=cfg.tgt_stats.SAMPLES_PER_SPKR,
            replace=False,
            random_state=cfg.RANDOM_STATE,
        )
    )
    if outDF.empty:
        outDF = accDF3
    else:
        outDF = pd.concat([outDF, accDF3])

#### Partition into Test, Development and Training sets.

- No speaker-overlap
- Maintain the gender proportion.

In [26]:
# #####################################################################
# partition by separating speakers, keeping male/female balance.
#

DevDF = pd.DataFrame(columns=[df5.columns])
TrainDF = pd.DataFrame(columns=[df5.columns])
TestDF = pd.DataFrame(columns=[df5.columns])


print("accent    \t E|D|T spkrs \t E|D|T samples")
print("-------------------------------------------------")
for acc in usableAccents:
    accDF = outDF[outDF.label == acc]
    femSpkrsDF = pd.DataFrame(
        accDF[accDF.isfemale].client_id.unique(),
        columns=["client_id"],
    )
    notFemSpkrsDF = pd.DataFrame(
        accDF[~accDF.isfemale].client_id.unique(),
        columns=["client_id"],
    )
    testSpeakersDF = pd.concat(
        [
            femSpkrsDF.sample(frac=0.10, replace=False, random_state=cfg.RANDOM_STATE),
            notFemSpkrsDF.sample(
                frac=0.10, replace=False, random_state=cfg.RANDOM_STATE
            ),
        ]
    )
    accTestDF = accDF[accDF.client_id.isin(testSpeakersDF.client_id)]
    #
    #
    # Continue splitting
    # - rm already used speakers.
    femSpkrsDF = femSpkrsDF[~femSpkrsDF.client_id.isin(testSpeakersDF.client_id)]
    notFemSpkrsDF = notFemSpkrsDF[
        ~notFemSpkrsDF.client_id.isin(testSpeakersDF.client_id)
    ]
    # NOTE: divide by 0.9 to compensate for the missing 10%.
    devSpeakersDF = pd.concat(
        [
            femSpkrsDF.sample(
                frac=0.1 / 0.9, replace=False, random_state=cfg.RANDOM_STATE
            ),
            notFemSpkrsDF.sample(
                frac=0.1 / 0.9, replace=False, random_state=cfg.RANDOM_STATE
            ),
        ]
    )
    accDevDF = accDF[accDF.client_id.isin(devSpeakersDF.client_id)]
    #
    #
    # Use whatever speakers are left for training.
    femSpkrsDF = femSpkrsDF[~femSpkrsDF.client_id.isin(devSpeakersDF.client_id)]
    notFemSpkrsDF = notFemSpkrsDF[
        ~notFemSpkrsDF.client_id.isin(devSpeakersDF.client_id)
    ]
    trainSpeakersDF = pd.concat([femSpkrsDF, notFemSpkrsDF])
    accTrainDF = accDF[accDF.client_id.isin(trainSpeakersDF.client_id)]
    #
    #
    print(
        f"{acc} \t {accTestDF.client_id.nunique()}|{accDevDF.client_id.nunique()}|{accTrainDF.client_id.nunique()} \t {accTestDF.shape[0]}|{accDevDF.shape[0]}|{accTrainDF.shape[0]}"
    )
    #
    #
    # sanity check
    for _df in (accTestDF, accDevDF, accTrainDF):
        assert _df.client_id.nunique() * cfg.tgt_stats.SAMPLES_PER_SPKR == _df.shape[0]
    #
    #
    # agregate
    if TestDF.empty:
        TestDF = accTestDF
    else:
        TestDF = pd.concat([TestDF, accTestDF])
    #
    if DevDF.empty:
        DevDF = accDevDF
    else:
        DevDF = pd.concat([DevDF, accDevDF])
    #
    if TrainDF.empty:
        TrainDF = accTrainDF
    else:
        TrainDF = pd.concat([TrainDF, accTrainDF])

accent    	 E|D|T spkrs 	 E|D|T samples
-------------------------------------------------
australia 	 26|25|201 	 104|100|804
canada 	 34|34|270 	 136|136|1080
england 	 27|27|213 	 108|108|852
hongkong 	 10|10|79 	 40|40|316
india 	 35|35|280 	 140|140|1120
ireland 	 16|16|128 	 64|64|512
malaysia 	 8|8|63 	 32|32|252
newzealand 	 13|14|107 	 52|56|428
philippines 	 11|11|87 	 44|44|348
scotland 	 14|14|112 	 56|56|448
us 	 35|35|278 	 140|140|1112


---

#### Save resulting corpus.


In [27]:
def createNewLog(cfg):
    timeInSeconds = time.time()
    timestamp = datetime.fromtimestamp(timeInSeconds).strftime("%Y-%m-%d_%H-%M-%S")

    directory = f"../logs/{timestamp}"
    # ensure log folder exists
    if not os.path.exists(directory):
        os.mkdir(directory)

    logFile = f"{directory}/config.yaml"
    OmegaConf.save(config=cfg, f=logFile)
    return directory

logdir = createNewLog(cfg)

In [28]:
EvalDF.to_csv(f"{logdir}/eval.tsv", sep="\t", index=False, header=False)
TestDF.to_csv(f"{logdir}/test.tsv", sep="\t", index=False, header=False)
DevDF.to_csv(f"{logdir}/dev.tsv", sep="\t", index=False, header=False)
TrainDF.to_csv(f"{logdir}/train.tsv", sep="\t", index=False, header=False)