In [None]:
import pandas as pd
from datasets import load_dataset, load_from_disk
from tqdm import tqdm
from collections import Counter, OrderedDict
import os, shutil, csv

In [None]:
# Divisor used to turn the metadata "sample length" column into seconds.
SAMPLE_RATE = 16000

# Long language name -> short language id (no region suffix).
_FLEURS_LANG_TO_ID = OrderedDict([("Afrikaans", "af"), ("Amharic", "am"), ("Arabic", "ar"), ("Armenian", "hy"), ("Assamese", "as"), ("Asturian", "ast"), ("Azerbaijani", "az"), ("Belarusian", "be"), ("Bengali", "bn"), ("Bosnian", "bs"), ("Bulgarian", "bg"), ("Burmese", "my"), ("Catalan", "ca"), ("Cebuano", "ceb"), ("Mandarin Chinese", "cmn_hans"), ("Cantonese Chinese", "yue_hant"), ("Croatian", "hr"), ("Czech", "cs"), ("Danish", "da"), ("Dutch", "nl"), ("English", "en"), ("Estonian", "et"), ("Filipino", "fil"), ("Finnish", "fi"), ("French", "fr"), ("Fula", "ff"), ("Galician", "gl"), ("Ganda", "lg"), ("Georgian", "ka"), ("German", "de"), ("Greek", "el"), ("Gujarati", "gu"), ("Hausa", "ha"), ("Hebrew", "he"), ("Hindi", "hi"), ("Hungarian", "hu"), ("Icelandic", "is"), ("Igbo", "ig"), ("Indonesian", "id"), ("Irish", "ga"), ("Italian", "it"), ("Japanese", "ja"), ("Javanese", "jv"), ("Kabuverdianu", "kea"), ("Kamba", "kam"), ("Kannada", "kn"), ("Kazakh", "kk"), ("Khmer", "km"), ("Korean", "ko"), ("Kyrgyz", "ky"), ("Lao", "lo"), ("Latvian", "lv"), ("Lingala", "ln"), ("Lithuanian", "lt"), ("Luo", "luo"), ("Luxembourgish", "lb"), ("Macedonian", "mk"), ("Malay", "ms"), ("Malayalam", "ml"), ("Maltese", "mt"), ("Maori", "mi"), ("Marathi", "mr"), ("Mongolian", "mn"), ("Nepali", "ne"), ("Northern-Sotho", "nso"), ("Norwegian", "nb"), ("Nyanja", "ny"), ("Occitan", "oc"), ("Oriya", "or"), ("Oromo", "om"), ("Pashto", "ps"), ("Persian", "fa"), ("Polish", "pl"), ("Portuguese", "pt"), ("Punjabi", "pa"), ("Romanian", "ro"), ("Russian", "ru"), ("Serbian", "sr"), ("Shona", "sn"), ("Sindhi", "sd"), ("Slovak", "sk"), ("Slovenian", "sl"), ("Somali", "so"), ("Sorani-Kurdish", "ckb"), ("Spanish", "es"), ("Swahili", "sw"), ("Swedish", "sv"), ("Tajik", "tg"), ("Tamil", "ta"), ("Telugu", "te"), ("Thai", "th"), ("Turkish", "tr"), ("Ukrainian", "uk"), ("Umbundu", "umb"), ("Urdu", "ur"), ("Uzbek", "uz"), ("Vietnamese", "vi"), ("Welsh", "cy"), ("Wolof", "wo"), ("Xhosa", "xh"), ("Yoruba", "yo"),
("Zulu", "zu")])
# Inverse of the table above: short language id -> long language name.
_FLEURS_LANG_SHORT_TO_LONG = {
    short_id: long_name for long_name, short_id in _FLEURS_LANG_TO_ID.items()
}

# Region-qualified language codes, kept sorted; a code's position in this list
# is what the loading cell stores as the numeric "language id".
_FLEURS_LANG = sorted(["af_za", "am_et", "ar_eg", "as_in", "ast_es", "az_az", "be_by", "bn_in", "bs_ba", "ca_es", "ceb_ph", "cmn_hans_cn", "yue_hant_hk", "cs_cz", "cy_gb", "da_dk", "de_de", "el_gr", "en_us", "es_419", "et_ee", "fa_ir", "ff_sn", "fi_fi", "fil_ph", "fr_fr", "ga_ie", "gl_es", "gu_in", "ha_ng", "he_il", "hi_in", "hr_hr", "hu_hu", "hy_am", "id_id", "ig_ng", "is_is", "it_it", "ja_jp", "jv_id", "ka_ge", "kam_ke", "kea_cv", "kk_kz", "km_kh", "kn_in", "ko_kr", "ckb_iq", "ky_kg", "lb_lu", "lg_ug", "ln_cd", "lo_la", "lt_lt", "luo_ke", "lv_lv", "mi_nz", "mk_mk", "ml_in", "mn_mn", "mr_in", "ms_my", "mt_mt", "my_mm", "nb_no", "ne_np", "nl_nl", "nso_za", "ny_mw", "oc_fr", "om_et", "or_in", "pa_in", "pl_pl", "ps_af", "pt_br", "ro_ro", "ru_ru", "bg_bg", "sd_in", "sk_sk", "sl_si", "sn_zw", "so_so", "sr_rs", "sv_se", "sw_ke", "ta_in", "te_in", "tg_tj", "th_th", "tr_tr", "uk_ua", "umb_ao", "ur_pk", "uz_uz", "vi_vn", "wo_sn", "xh_za", "yo_ng", "zu_za"])
# Long name -> region-qualified code. The short id is the code with its final
# "_<region>" component removed (e.g. "cmn_hans_cn" -> "cmn_hans").
_FLEURS_LONG_TO_LANG = {
    _FLEURS_LANG_SHORT_TO_LONG[code.rsplit("_", 1)[0]]: code
    for code in _FLEURS_LANG
}
# Region-qualified code -> long name.
_FLEURS_LANG_TO_LONG = {
    code: long_name for long_name, code in _FLEURS_LONG_TO_LANG.items()
}

# Language group -> long names of its member languages.
_FLEURS_GROUP_TO_LONG = OrderedDict({
    "western_european_we": ["Asturian", "Bosnian", "Catalan", "Croatian", "Danish", "Dutch", "English", "Finnish", "French", "Galician", "German", "Greek", "Hungarian", "Icelandic", "Irish", "Italian", "Kabuverdianu", "Luxembourgish", "Maltese", "Norwegian", "Occitan", "Portuguese", "Spanish", "Swedish", "Welsh"],
    "eastern_european_ee": ["Armenian", "Belarusian", "Bulgarian", "Czech", "Estonian", "Georgian", "Latvian", "Lithuanian", "Macedonian", "Polish", "Romanian", "Russian", "Serbian", "Slovak", "Slovenian", "Ukrainian"],
    "central_asia_middle_north_african_cmn": ["Arabic", "Azerbaijani", "Hebrew", "Kazakh", "Kyrgyz", "Mongolian", "Pashto", "Persian", "Sorani-Kurdish", "Tajik", "Turkish", "Uzbek"],
    "sub_saharan_african_ssa": ["Afrikaans", "Amharic", "Fula", "Ganda", "Hausa", "Igbo", "Kamba", "Lingala", "Luo", "Northern-Sotho", "Nyanja", "Oromo", "Shona", "Somali", "Swahili", "Umbundu", "Wolof", "Xhosa", "Yoruba", "Zulu"],
    "south_asian_sa": ["Assamese", "Bengali", "Gujarati", "Hindi", "Kannada", "Malayalam", "Marathi", "Nepali", "Oriya", "Punjabi", "Sindhi", "Tamil", "Telugu", "Urdu"],
    "south_east_asian_sea": ["Burmese", "Cebuano", "Filipino", "Indonesian", "Javanese", "Khmer", "Lao", "Malay", "Maori", "Thai", "Vietnamese"],
    "chinese_japanase_korean_cjk": ["Mandarin Chinese", "Cantonese Chinese", "Japanese", "Korean"],
})
# Long name -> group, obtained by flattening the group table.
_FLEURS_LONG_TO_GROUP = {
    long_name: group
    for group, members in _FLEURS_GROUP_TO_LONG.items()
    for long_name in members
}
# Region-qualified code -> group.
_FLEURS_LANG_TO_GROUP = {
    _FLEURS_LONG_TO_LANG[long_name]: group
    for long_name, group in _FLEURS_LONG_TO_GROUP.items()
}

# Alias for the full list of region-qualified language codes.
_ALL_LANG = _FLEURS_LANG

In [None]:
def build_romanizer():
    """Create and return a new ``ur.Uroman`` instance.

    The returned object is meant to be used as
    ``romanizer.romanize_string(text, lcode=iso)``.

    NOTE(review): ``ur`` is not imported in any visible cell -- presumably
    ``import uroman as ur``; confirm and add it to the import cell before
    calling this function, otherwise it raises ``NameError``.
    """
    return ur.Uroman()

In [None]:
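# One-time setup: download the FLEURS-R dataset (requires git-lfs).
# Uncomment and run once, then point `fleurs_dir` at the resulting checkout.
# !git lfs install
# !git clone https://huggingface.co/datasets/google/fleurs-r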

In [None]:
# Placeholder for the dataset root -- edit this to point at your local copy.
fleurs_dir = 'path/to/fleurs-r/dataset'
# Directory holding one sub-directory per language with {train,dev,test}.tsv.
# Derived from `fleurs_dir` so the two paths cannot drift apart when the root
# is edited.
fleurs_meta_dir = os.path.join(fleurs_dir, 'metadata')

In [None]:
# Read every <lang>/<split>.tsv metadata file, repair rows whose columns were
# shifted, attach language labels, and collect the frames per split in `csvs`.
splits = ['train', 'dev', 'test']
columns = ['id', 'filename', 'raw transcription', 'transcription', 'character', 'sample length', 'gender']
# One list of per-language DataFrames per split; keyed off `splits` so the two
# definitions cannot get out of sync.
csvs = {split: [] for split in splits}

# os.listdir() returns entries in arbitrary order and may contain stray files
# or hidden folders (e.g. .ipynb_checkpoints). Keep only visible
# sub-directories and sort them so the merged row order is reproducible.
lang_dirs = sorted(
    entry for entry in os.listdir(fleurs_meta_dir)
    if not entry.startswith('.') and os.path.isdir(os.path.join(fleurs_meta_dir, entry))
)

for lang_dir in tqdm(lang_dirs):
    # Fail early with a readable message instead of a bare list.index() error.
    if lang_dir not in _FLEURS_LANG_TO_LONG:
        raise ValueError(f'Unknown FLEURS language directory: {lang_dir!r}')

    # These labels depend only on the language, so look them up once rather
    # than once per row and per split.
    lang_id = _FLEURS_LANG.index(lang_dir)
    lang_name = _FLEURS_LANG_TO_LONG[lang_dir]
    lang_group = _FLEURS_LANG_TO_GROUP[lang_dir]

    for split in splits:
        target_fname = os.path.join(fleurs_meta_dir, lang_dir, f'{split}.tsv')
        data = pd.read_csv(target_fname, sep='\t', names=columns, encoding='utf-8', quoting=csv.QUOTE_NONE)

        has_nan = data.isna().any(axis=1)
        if has_nan.any():
            print(f'Error occured in {target_fname}')
            for index in data.index[has_nan]:
                # The repair treats the 'transcription' cell as holding both the
                # transcription and the character string joined by a tab, with
                # every later column shifted one position to the left.
                t, c = data.loc[index, 'transcription'].strip().split('\t')
                t = t.strip()
                c = c.strip()
                # Order matters: each cell is read before it is overwritten.
                data.loc[index, 'gender'] = str(data.loc[index, 'sample length'])
                data.loc[index, 'sample length'] = int(data.loc[index, 'character'])
                data.loc[index, 'character'] = c
                data.loc[index, 'transcription'] = t

        # Clip duration in seconds, rounded to two decimals.
        data['duration'] = (data['sample length'].astype(int) / SAMPLE_RATE).round(2)
        # Scalars are broadcast to every row of the frame.
        data['language id'] = lang_id
        data['language'] = lang_name
        data['language group'] = lang_group
        csvs[split].append(data)

In [None]:
# Sanity check: each split must hold exactly one frame per FLEURS language,
# and no frame may still contain missing values after the repair step.
expected_lang_count = len(_ALL_LANG)  # derived from the language table instead of a magic 102
for split_name, frames in csvs.items():
    assert len(frames) == expected_lang_count, (
        f'{split_name}: expected {expected_lang_count} language frames, got {len(frames)}'
    )

    for frame in frames:
        assert not frame.isna().any().any(), f'{split_name}: a frame still contains NaN values'

In [None]:
# Stack the per-language frames of each split into one DataFrame with a fresh
# 0..n-1 index; keys follow the order of `csvs` ('train', 'dev', 'test').
merged = {
    split_name: pd.concat(frames, ignore_index=True)
    for split_name, frames in csvs.items()
}

In [None]:
# Sanity check: concatenation must not have introduced any missing values.
for split_name, frame in merged.items():
    assert frame.notna().all().all()

In [None]:
# Write each merged split to <fleurs_dir>/csvs/<split>.csv without the index column.
save_path = os.path.join(fleurs_dir, 'csvs')
os.makedirs(save_path, exist_ok=True)
for split_name, frame in merged.items():
    out_file = os.path.join(save_path, f'{split_name}.csv')
    frame.to_csv(out_file, index=False)

In [None]:
# Reload the three exported splits from disk, in train / dev / test order.
train, dev, test = (
    pd.read_csv(os.path.join(save_path, csv_name))
    for csv_name in ('train.csv', 'dev.csv', 'test.csv')
)