## Imports

In [None]:
import sys
import os
import csv
import subprocess
import pandas as pd
import numpy as np
import logging
logger = logging.getLogger(__name__)


## Definitions

In [None]:
# Input metadata file and working directory for the Common Voice clips.
# Both can be overridden through environment variables so the notebook runs on
# machines other than the original author's; the defaults are the original
# locations.
tsv_path = os.environ.get("CV_VALIDATED_TSV") or (
    "/home/usuaris/scratch/gerard.muniesa/speech-tech-resources/data/"
    "cv-corpus-9.0-2022-04-27/ca/validated.tsv"
)
# Later cells build paths with plain string concatenation (cv_path + "name"),
# so cv_path must always end with a path separator; joining with "" adds one
# when it is missing.
cv_path = os.path.join(
    os.environ.get("CV_WORK_DIR") or "/home/usuaris/scratch/gerard.muniesa/common_voice/",
    "",
)

## Utils

In [None]:
def split_df_by_gender(dataset):
    """Fill in missing gender labels per speaker and split the rows by gender.

    A speaker (``client_id``) may have some rows with a gender label and some
    without. Every unlabelled row of such a speaker receives the label found on
    that speaker's other rows. If a speaker carries several different labels,
    the first of ``male``, ``female``, ``other`` is used for the unlabelled
    rows; rows that already have a label are never changed.

    Args:
        dataset: DataFrame with at least the columns ``client_id`` and
            ``gender``. It is not modified; the function works on a copy.

    Returns:
        A 5-tuple ``(dataset, male, female, other, nan)``: the corrected copy
        of the input, followed by its rows whose gender is ``'male'``,
        ``'female'``, ``'other'`` and missing, respectively.
    """
    # Work on a copy so the caller's frame is left untouched.
    dataset = dataset.copy()

    print(f"Total raws:  {(dataset.shape[0])}")

    gender = dataset['gender']
    client_id = dataset['client_id']
    missing = gender.isnull()
    speakers_with_gaps = set(client_id[missing].unique())

    # speaker -> label used to fill that speaker's unlabelled rows.
    fill_label = {}
    # Counts (speaker, label) pairs that can be corrected, i.e. speakers rather
    # than individual rows.
    non_gender_specified_raws = 0
    for label in ('male', 'female', 'other'):
        fixable = speakers_with_gaps.intersection(client_id[gender == label].unique())
        non_gender_specified_raws += len(fixable)
        for speaker in fixable:
            # setdefault keeps the first label found, giving the precedence
            # male > female > other for speakers with conflicting labels.
            fill_label.setdefault(speaker, label)

    print(f"Number of raws withouth the known gender written {non_gender_specified_raws}")

    print("Correcting...")
    # Keep existing labels; only rows with a missing gender take the mapped
    # value (which stays missing for speakers without any known label).
    dataset['gender'] = gender.where(~missing, client_id.map(fill_label))
    print("Corrected.")

    male = dataset[dataset['gender'] == 'male']
    female = dataset[dataset['gender'] == 'female']
    nan = dataset[dataset['gender'].isnull()]
    other = dataset[dataset['gender'] == 'other']
    print("----------Summary---------")
    print(f"Male rwas:   {male.shape[0]}")
    print(f"Female rwas: {female.shape[0]}")
    print(f"Other rwas:  {other.shape[0]}")
    print(f"NaN rwas:    {nan.shape[0]}    +")
    print("--------------------------")
    # Sanity check: the four subsets should add up to the full dataset; a
    # non-zero error means some rows carry a label outside the four groups.
    result = male.shape[0] + female.shape[0] + other.shape[0] + nan.shape[0]
    error = dataset.shape[0] - result
    print(f"             {result}     <--> Total raws:  {(dataset.shape[0])} (error: {error})")
    return dataset, male, female, other, nan

In [None]:
def get_durations_dict(filename, max_duration=10):
    """Read per-clip durations from a ``path,duration`` text file.

    Each useful line looks like ``/some/dir/clip.wav,3.21``. Lines that are
    blank, have no comma, or whose duration is not a number (for example
    ``N/A`` or an empty field) are skipped.

    Args:
        filename: Path of the stats file to read.
        max_duration: Clips longer than this many seconds are discarded.
            Defaults to 10.

    Returns:
        dict mapping each clip's file name (the part of the path after the
        last ``/``) to its duration as a float.
    """
    durations = {}
    with open(filename) as stats_file:
        for line in stats_file:
            # Split on the last comma so a comma inside the path cannot shift
            # the duration field.
            path, separator, raw_duration = line.strip().rpartition(',')
            if not separator:
                continue  # blank or malformed line
            try:
                duration = float(raw_duration)
            except ValueError:
                continue  # "N/A", empty field, ...: no usable duration
            if duration > max_duration:  # Discard samples longer than the limit
                continue
            durations[path.split('/')[-1]] = duration
    return durations

## Code

Fill in missing gender labels and split the dataset by gender

In [None]:
# Read the tab-separated metadata file and hand it straight to
# split_df_by_gender, which returns the corrected frame followed by the
# male / female / other / missing-gender subsets.
# NOTE(review): the file is parsed with pandas' default quoting rules; confirm
# that no field in the TSV contains a bare double quote.
(df, male_df, female_df, other_df, nan_df) = split_df_by_gender(
    pd.read_csv(tsv_path, sep='\t')
)

Preprocess the audio files: create the necessary folders for the preprocessed audio and write the duration of each clip to the "sil_stats.csv" file.

In [None]:
# Create the working folders. exist_ok=True keeps the cell re-runnable instead
# of failing when the folders are already there.
for folder in ("clips_wavs", "clips_wav_22k", "clips_wav_22k_sil", "clips_wav_22k_sil_pad"):
    os.makedirs(cv_path + folder, exist_ok=True)

# Shell variables are double-quoted below so file names containing spaces or
# glob characters are passed to each tool as a single argument.

# Copy every .mp3 to a file with a .wav extension (plain copy, no transcoding)
# and move the copies into clips_wavs/.
os.system(f'for f in {cv_path}small_clips_2/*.mp3; do t="${{f%.mp3}}.wav"; cp "$f" "$t"; done')
os.system(f"mv {cv_path}small_clips_2/*.wav  {cv_path}clips_wavs/")

# Convert each clip with ffmpeg at a 22050 Hz sample rate into clips_wav_22k/.
# stdin is redirected from /dev/null so ffmpeg does not consume the loop input.
os.system(
    f'for f in {cv_path}clips_wavs/*.wav; do t="${{f##*/}}"; '
    f'ffmpeg -i "$f" -ar 22050 "{cv_path}clips_wav_22k/$t" -v error < /dev/null; done;'
)

# Trim silence with sox: the silence effect is applied, the audio reversed,
# the effect applied again and the audio reversed back, so both ends are cut.
command_1 = (
    f'for f in {cv_path}clips_wav_22k/*.wav; do t="${{f##*/}}"; '
    f'sox "$f" "{cv_path}clips_wav_22k_sil/$t" '
    'silence 1 0.02 0.1% reverse silence 1 0.02 0.1% reverse; done'
)
os.system(command_1)

# Append one "path,duration" line per trimmed clip to sil_stats.csv.
# Because the file is opened in append mode, re-running this cell adds the
# same clips again.
command_2 = (
    f'for f in {cv_path}clips_wav_22k_sil/*.wav; do '
    'd=`ffprobe -i "$f" -show_entries format=duration -v quiet -of csv="p=0"`; '
    f'echo "$f,$d"; done >> {cv_path}sil_stats.csv'
)
os.system(command_2)

Create one TSV file per gender, with one row per speaker and the following columns: "client_id" "time" "number of samples" "accent" "age"

In [None]:
# Build, for each gender subset, a per-speaker summary (total clip time, number
# of clips, accent, age) and write it to "<cv_path><gender>.tsv".
durations = get_durations_dict(cv_path + 'sil_stats.csv')
gender_frames = {'male': male_df, 'female': female_df, 'other': other_df, 'nan': nan_df}
for gender, df_ in gender_frames.items():
    speakers = df_['client_id'].unique().tolist()
    # One independent [time, number of samples, accent, age] accumulator per
    # speaker; speakers without any matching clip keep the initial values.
    d_speakers = {speaker: [0, 0, "", ""] for speaker in speakers}

    # path -> (client_id, accent, age). Built once so each clip is a constant
    # time lookup instead of a scan over the whole frame; for a repeated path
    # the first row is used.
    unique_clips = df_.drop_duplicates(subset='path')
    clip_info = dict(zip(
        unique_clips['path'],
        zip(unique_clips['client_id'], unique_clips['accents'], unique_clips['age']),
    ))

    for key, value in durations.items():
        # Durations are keyed by the processed file name; the metadata lists
        # the same clip with an .mp3 extension.
        info = clip_info.get(key.split('.')[0] + '.mp3')
        if info is None:
            continue  # clip belongs to another gender subset (or is unknown)
        client_id_, accent, age = info
        stats = d_speakers[client_id_]
        stats[0] += value
        stats[1] += 1
        # Missing values are NaN, not the string 'nan'; only a real value may
        # replace what is already recorded for the speaker.
        if not pd.isna(accent):
            stats[2] = accent
        if not pd.isna(age):
            stats[3] = age

    # newline='' lets the csv module control line endings itself.
    with open(f'{cv_path}{gender}.tsv', mode='wt', newline='') as f:
        tsv_writer = csv.writer(f, delimiter='\t')
        tsv_writer.writerow(['client_id', 'time', 'number of samples', 'accent', 'age'])
        for key, value in d_speakers.items():
            tsv_writer.writerow([key, value[0], value[1], value[2], value[3]])