In [1]:
import os, re, json, string, pykakasi, unicodedata
import pandas as pd
import uroman as ur
from pypinyin import lazy_pinyin
from pycantonese import characters_to_jyutping
from tqdm import tqdm
from collections import Counter, OrderedDict
from IPython.display import display, Audio

In [2]:
# Ordered table of language name -> short language id (e.g. "Afrikaans" -> "af").
# Its key set is later asserted to equal the keys loaded from fleurs_to_iso3.json.
_FLEURS_LANG_TO_ID = OrderedDict([("Afrikaans", "af"), ("Amharic", "am"), ("Arabic", "ar"), ("Armenian", "hy"), ("Assamese", "as"), ("Asturian", "ast"), ("Azerbaijani", "az"), ("Belarusian", "be"), ("Bengali", "bn"), ("Bosnian", "bs"), ("Bulgarian", "bg"), ("Burmese", "my"), ("Catalan", "ca"), ("Cebuano", "ceb"), ("Mandarin Chinese", "cmn_hans"), ("Cantonese Chinese", "yue_hant"), ("Croatian", "hr"), ("Czech", "cs"), ("Danish", "da"), ("Dutch", "nl"), ("English", "en"), ("Estonian", "et"), ("Filipino", "fil"), ("Finnish", "fi"), ("French", "fr"), ("Fula", "ff"), ("Galician", "gl"), ("Ganda", "lg"), ("Georgian", "ka"), ("German", "de"), ("Greek", "el"), ("Gujarati", "gu"), ("Hausa", "ha"), ("Hebrew", "he"), ("Hindi", "hi"), ("Hungarian", "hu"), ("Icelandic", "is"), ("Igbo", "ig"), ("Indonesian", "id"), ("Irish", "ga"), ("Italian", "it"), ("Japanese", "ja"), ("Javanese", "jv"), ("Kabuverdianu", "kea"), ("Kamba", "kam"), ("Kannada", "kn"), ("Kazakh", "kk"), ("Khmer", "km"), ("Korean", "ko"), ("Kyrgyz", "ky"), ("Lao", "lo"), ("Latvian", "lv"), ("Lingala", "ln"), ("Lithuanian", "lt"), ("Luo", "luo"), ("Luxembourgish", "lb"), ("Macedonian", "mk"), ("Malay", "ms"), ("Malayalam", "ml"), ("Maltese", "mt"), ("Maori", "mi"), ("Marathi", "mr"), ("Mongolian", "mn"), ("Nepali", "ne"), ("Northern-Sotho", "nso"), ("Norwegian", "nb"), ("Nyanja", "ny"), ("Occitan", "oc"), ("Oriya", "or"), ("Oromo", "om"), ("Pashto", "ps"), ("Persian", "fa"), ("Polish", "pl"), ("Portuguese", "pt"), ("Punjabi", "pa"), ("Romanian", "ro"), ("Russian", "ru"), ("Serbian", "sr"), ("Shona", "sn"), ("Sindhi", "sd"), ("Slovak", "sk"), ("Slovenian", "sl"), ("Somali", "so"), ("Sorani-Kurdish", "ckb"), ("Spanish", "es"), ("Swahili", "sw"), ("Swedish", "sv"), ("Tajik", "tg"), ("Tamil", "ta"), ("Telugu", "te"), ("Thai", "th"), ("Turkish", "tr"), ("Ukrainian", "uk"), ("Umbundu", "umb"), ("Urdu", "ur"), ("Uzbek", "uz"), ("Vietnamese", "vi"), ("Welsh", "cy"), ("Wolof", "wo"), ("Xhosa", "xh"), ("Yoruba", "yo"), 
("Zulu", "zu")])

In [3]:
# Show the ASCII punctuation characters that clean() strips from transcriptions.
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [4]:
# Per-character clean-up applied to the romanized text with str.translate():
#   ""   -> the character is removed
#   "⊗"  -> sentinel meaning "delete row": "⊗" is outside [a-z\s'], so the later
#           fullmatch filter on the 'roman' column drops every row containing it
#   else -> plain substitution
invalid_mapping = {
    '.': "", # remove
    '’': "'", # replace with '
    '»': "", # remove
    '«': "", # remove
    ',': "", # remove
    '&': "ai", # Sindhi - equal to ۽ and pronounced as ãĩ̯ (from wikipedia Sindhi)
    '-': "", # remove
    '”': "", # remove
    '²': "", # remove
    '–': "", # remove
    '‘': "'", # replace with '
    '—': "", # remove
    '·': "", # remove
    ';': "", # remove
    '(': "", # remove
    ')': "", # remove
    'ઃ': "", # remove
    '€': "⊗", # delete row
    '?': "", # remove
    'õ': "o", # replace with o
    '・': " ", # replace with a space
    ':': "", # remove
    'í': 'i', # replace with i
    '\x92': "'", # replace with '
    '\x96': " ", # replace with a space
    '\x93': "", # remove
    '\x94': "", # remove
    'é': "e", # replace with e
    '1': "⊗", # delete row
    '⁄': "⊗", # delete row
    '4': "⊗", # delete row
    '•': "⊗", # delete row
    '\x8d': "", # remove
    '©': "⊗", # delete row
    '£': "", # remove
    '\x97': "", # remove
    'ၤ': "⊗", # delete row
    'ୗ': "⊗", # delete row
}
# Translation table form of the mapping above, ready for str.translate().
invalid_transmap = str.maketrans(invalid_mapping)

In [5]:
def build_romanizer():
    """Create the two romanization back-ends used by this notebook.

    Returns:
        tuple: ``(romanizer, kks)`` where ``romanizer`` is a ``ur.Uroman()``
        instance and ``kks`` is a ``pykakasi.kakasi()`` instance.
    """
    # Typical call on the first element: romanizer.romanize_string(text, lcode=iso)
    return ur.Uroman(), pykakasi.kakasi()

In [6]:
# Built once instead of on every call: deletes every ASCII punctuation character.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def clean(text):
    """Normalize a transcription before it is romanized.

    Steps, in order:
      1. NFKC-normalize (folds compatibility forms such as full-width letters
         and punctuation into their plain equivalents).
      2. Lowercase.
      3. Delete every character listed in ``string.punctuation``.

    Normalization runs *before* lowercasing on purpose: some compatibility
    characters have no lowercase mapping of their own and only become cased
    ASCII letters after NFKC (e.g. "ℕ" -> "N"), so lowercasing first would
    leave stray capitals in the result.

    Args:
        text (str): the transcription to clean.

    Returns:
        str: the normalized, lowercased, punctuation-free text.
    """
    text = unicodedata.normalize('NFKC', text)
    text = text.lower()
    return text.translate(_PUNCTUATION_TABLE)

In [None]:
def display_random_transcriptions_and_audio(df, split):
    """Print one random row's raw transcription and romanization, then play its audio.

    Relies on the notebook globals ``romanizer``, ``lang_to_iso`` and
    ``fleurs_audio_dir`` being defined before the function is called.

    Args:
        df: DataFrame whose rows have 11 columns; the row is unpacked by
            position (2nd = audio file name, 3rd = raw transcription,
            10th = language name used as key into ``lang_to_iso``).
        split: name of the sub-directory of ``fleurs_audio_dir`` that holds
            the audio file (e.g. ``'train'``).
    """
    _, filename, raw_transcription, _, _, _, _, _, _, lang, _ = df.sample(n=1).values[0]
    print(f"Raw Transcription: {raw_transcription}")

    # `lcode` is the keyword uroman documents for the language code; passing
    # the code under any other keyword does not select a language.
    romanized = romanizer.romanize_string(raw_transcription, lcode=lang_to_iso[lang])
    print(f"Romanized Form: {romanized}")

    # Use the shared audio root instead of repeating the literal path here.
    filepath = os.path.join(fleurs_audio_dir, split, filename)

    display(Audio(filepath))

In [None]:
# Root directory of the audio files; per-split sub-directories are joined onto it later.
fleurs_audio_dir = 'path/to/audio/files'

# Romanization back-ends returned by build_romanizer(): (uroman, pykakasi).
romanizer, kks = build_romanizer()

# Language name -> ISO code table read from JSON.
with open('fleurs_to_iso3.json', 'r') as f:
    lang_to_iso = json.loads(f.read())

In [10]:
# Both tables must cover exactly the same language names, otherwise the
# lang_to_iso[language] lookups made during romanization can fail.
# The symmetric difference is empty exactly when the two key sets are equal;
# on failure it names the offending languages instead of a bare AssertionError.
_mismatched_languages = lang_to_iso.keys() ^ _FLEURS_LANG_TO_ID.keys()
assert not _mismatched_languages, (
    f"lang_to_iso and _FLEURS_LANG_TO_ID disagree on: {sorted(_mismatched_languages)}"
)

In [None]:
# Locations of the three metadata splits.
train_csv_path = 'path/to/csv/files/train.csv'
dev_csv_path = 'path/to/csv/files/dev.csv'
test_csv_path = 'path/to/csv/files/test.csv'

# Load each split into its own DataFrame.
train = pd.read_csv(train_csv_path)
dev = pd.read_csv(dev_csv_path)
test = pd.read_csv(test_csv_path)

In [12]:
# Patterns used to inspect the raw training transcriptions.
bracket_pattern = r'[\(\[\{].*?[\)\]\}]'  # an opening (, [ or { followed later by a closing ), ] or }
digit_pattern = r'\d'

raw_train_text = train['raw transcription']

# Rows containing a bracketed span, and rows containing at least one digit.
train_with_brackets = train[raw_train_text.str.contains(bracket_pattern, regex=True)]
train_with_numbers = train[raw_train_text.str.contains(digit_pattern, regex=True)]

# Bracketed rows that contain no digit at all.
bracketed_has_digit = train_with_brackets['raw transcription'].str.contains(digit_pattern, regex=True)
train_with_brackets_no_numbers = train_with_brackets[~bracketed_has_digit]

In [14]:
# Spot-check one randomly sampled bracketed, digit-free training row:
# prints its raw transcription and romanization, then shows an audio player.
display_random_transcriptions_and_audio(train_with_brackets_no_numbers, 'train')

Raw Transcription: Ameerika Ühendriikide Geoloogiateenistusele (USGS) ja riiklikule maavärinate teabekeskusele pole laekunud vahetuid teateid kahjustuste kohta.
Romanized Form: Ameerika Uehendriikide Geoloogiateenistusele (USGS) ja riiklikule maavaerinate teabekeskusele pole laekunud vahetuid teateid kahjustuste kohta.


In [15]:
# Romanize every training transcription and collect the result in `train_roman`.

# Column order of the resulting frame (and therefore of the CSV written later).
roman_columns = ['id', 'file path', 'raw transcription', 'transcription', 'roman',
                 'duration', 'language', 'language id', 'language group', 'gender']
train_records = []

# itertuples(index=False, name=None) yields one plain tuple per row in column
# order, so the positional unpacking below works without the per-row Series
# that iterrows() would build.
for row in tqdm(train.itertuples(index=False, name=None), total=len(train), desc="Processing rows"):
    id_, filename, raw_transcription, transcription, _, _, gender, duration, language_id, language, language_group = row
    transcription = clean(transcription)

    if language == 'Japanese':
        # pykakasi: join the Hepburn reading of every converted segment.
        roman = ' '.join([i['hepburn'] for i in kks.convert(transcription)])
    elif language == 'Mandarin Chinese':
        roman = ' '.join(lazy_pinyin(transcription)).strip()
    elif language == 'Cantonese Chinese':
        # Keep only segments that received a jyutping reading, then drop the tone digits.
        roman = ' '.join([i[-1] for i in characters_to_jyutping(transcription) if i[-1] is not None])
        roman = re.sub(r'\d', '', roman)
    else:
        try:
            # `lcode` is the keyword uroman documents for the language code;
            # passing the code under any other keyword does not select a language.
            roman = romanizer.romanize_string(transcription, lcode=lang_to_iso[language])
        except Exception:
            # Best effort: keep the row with an empty romanization so it can be
            # inspected afterwards. Catching Exception (not a bare except) lets
            # KeyboardInterrupt still stop this long-running loop.
            roman = ''

    # Shared post-processing: lowercase, apply the character map, collapse whitespace.
    roman = roman.lower()
    roman = roman.translate(invalid_transmap)
    roman = re.sub(r'\s+', ' ', roman)
    roman = roman.strip()

    train_records.append({
        'id': id_,
        'file path': os.path.join(fleurs_audio_dir, 'train', filename),
        'raw transcription': raw_transcription,
        'transcription': transcription,
        'roman': roman,
        'duration': duration,
        'language': language,
        'language id': language_id,
        'language group': language_group,
        'gender': gender,
    })

# Build the frame once from the collected records.
train_roman = pd.DataFrame(train_records, columns=roman_columns)

Processing rows:   0%|          | 0/248118 [00:00<?, ?it/s]

Processing rows: 100%|██████████| 248118/248118 [12:06<00:00, 341.46it/s] 


In [16]:
# Keep only the rows whose raw transcription contains no digit.
train_raw_has_digit = train_roman['raw transcription'].str.contains(r'\d', regex=True)
train_roman = train_roman.loc[~train_raw_has_digit].reset_index(drop=True)

In [17]:
# Keep only the rows whose romanization consists solely of a-z, whitespace and
# apostrophes (an empty string also matches this pattern).
train_roman_is_clean = train_roman['roman'].str.fullmatch(r"[a-z\s']*")
train_roman = train_roman.loc[train_roman_is_clean].reset_index(drop=True)

In [18]:
# Character inventory of the romanized training text.
all_text = ''.join(train_roman['roman'].dropna())
char_frequency = Counter(all_text)

# Characters keyed by count, most frequent first.
sorted_char_frequency = dict(char_frequency.most_common())
sorted_chars = sorted_char_frequency.keys()

# Print the distinct characters in sorted order.
print(sorted(sorted_chars))

[' ', "'", 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']


In [21]:
# Show any training rows whose romanization ended up empty.
train_roman.loc[train_roman['roman'].eq('')]

Unnamed: 0,id,file path,raw transcription,transcription,roman,duration,language,language id,language group,gender


In [22]:
# True if any cell of the romanized training frame is missing (NaN/None).
train_roman.isna().any().any()

False

In [24]:
# Write the filtered, romanized training split without the index column.
train_output_path = '../fleurs-r/train.csv'
train_roman.to_csv(train_output_path, index=False)

In [25]:
# Romanize every dev transcription and collect the result in `dev_roman`.

# Column order of the resulting frame (and therefore of the CSV written later).
roman_columns = ['id', 'file path', 'raw transcription', 'transcription', 'roman',
                 'duration', 'language', 'language id', 'language group', 'gender']
dev_records = []

# itertuples(index=False, name=None) yields one plain tuple per row in column
# order, so the positional unpacking below works without the per-row Series
# that iterrows() would build.
for row in tqdm(dev.itertuples(index=False, name=None), total=len(dev), desc="Processing rows"):
    id_, filename, raw_transcription, transcription, _, _, gender, duration, language_id, language, language_group = row
    transcription = clean(transcription)

    if language == 'Japanese':
        # pykakasi: join the Hepburn reading of every converted segment.
        roman = ' '.join([i['hepburn'] for i in kks.convert(transcription)])
    elif language == 'Mandarin Chinese':
        roman = ' '.join(lazy_pinyin(transcription)).strip()
    elif language == 'Cantonese Chinese':
        # Keep only segments that received a jyutping reading, then drop the tone digits.
        roman = ' '.join([i[-1] for i in characters_to_jyutping(transcription) if i[-1] is not None])
        roman = re.sub(r'\d', '', roman)
    else:
        try:
            # `lcode` is the keyword uroman documents for the language code;
            # passing the code under any other keyword does not select a language.
            roman = romanizer.romanize_string(transcription, lcode=lang_to_iso[language])
        except Exception:
            # Best effort: keep the row with an empty romanization so it can be
            # inspected afterwards. Catching Exception (not a bare except) lets
            # KeyboardInterrupt still stop this long-running loop.
            roman = ''

    # Shared post-processing: lowercase, apply the character map, collapse whitespace.
    roman = roman.lower()
    roman = roman.translate(invalid_transmap)
    roman = re.sub(r'\s+', ' ', roman)
    roman = roman.strip()

    dev_records.append({
        'id': id_,
        'file path': os.path.join(fleurs_audio_dir, 'dev', filename),
        'raw transcription': raw_transcription,
        'transcription': transcription,
        'roman': roman,
        'duration': duration,
        'language': language,
        'language id': language_id,
        'language group': language_group,
        'gender': gender,
    })

# Build the frame once from the collected records.
dev_roman = pd.DataFrame(dev_records, columns=roman_columns)

Processing rows: 100%|██████████| 31378/31378 [01:32<00:00, 340.08it/s] 


In [26]:
# Keep only the rows whose raw transcription contains no digit.
dev_raw_has_digit = dev_roman['raw transcription'].str.contains(r'\d', regex=True)
dev_roman = dev_roman.loc[~dev_raw_has_digit].reset_index(drop=True)

In [27]:
# Keep only the rows whose romanization consists solely of a-z, whitespace and
# apostrophes (an empty string also matches this pattern).
dev_roman_is_clean = dev_roman['roman'].str.fullmatch(r"[a-z\s']*")
dev_roman = dev_roman.loc[dev_roman_is_clean].reset_index(drop=True)

In [28]:
# Character inventory of the romanized dev text.
all_text = ''.join(dev_roman['roman'].dropna())
char_frequency = Counter(all_text)

# Characters keyed by count, most frequent first.
sorted_char_frequency = dict(char_frequency.most_common())
sorted_chars = sorted_char_frequency.keys()

# Print the distinct characters in sorted order.
print(sorted(sorted_chars))

[' ', "'", 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']


In [29]:
# Show any dev rows whose romanization ended up empty.
dev_roman.loc[dev_roman['roman'].eq('')]

Unnamed: 0,id,file path,raw transcription,transcription,roman,duration,language,language id,language group,gender


In [30]:
# True if any cell of the romanized dev frame is missing (NaN/None).
dev_roman.isna().any().any()

False

In [33]:
# Write the filtered, romanized dev split without the index column.
dev_output_path = '../fleurs-r/dev.csv'
dev_roman.to_csv(dev_output_path, index=False)

In [32]:
# Romanize every test transcription and collect the result in `test_roman`.

# Column order of the resulting frame (and therefore of the CSV written later).
roman_columns = ['id', 'file path', 'raw transcription', 'transcription', 'roman',
                 'duration', 'language', 'language id', 'language group', 'gender']
test_records = []

# itertuples(index=False, name=None) yields one plain tuple per row in column
# order, so the positional unpacking below works without the per-row Series
# that iterrows() would build.
for row in tqdm(test.itertuples(index=False, name=None), total=len(test), desc="Processing rows"):
    id_, filename, raw_transcription, transcription, _, _, gender, duration, language_id, language, language_group = row
    transcription = clean(transcription)

    if language == 'Japanese':
        # pykakasi: join the Hepburn reading of every converted segment.
        roman = ' '.join([i['hepburn'] for i in kks.convert(transcription)])
    elif language == 'Mandarin Chinese':
        roman = ' '.join(lazy_pinyin(transcription)).strip()
    elif language == 'Cantonese Chinese':
        # Keep only segments that received a jyutping reading, then drop the tone digits.
        roman = ' '.join([i[-1] for i in characters_to_jyutping(transcription) if i[-1] is not None])
        roman = re.sub(r'\d', '', roman)
    else:
        try:
            # `lcode` is the keyword uroman documents for the language code;
            # passing the code under any other keyword does not select a language.
            roman = romanizer.romanize_string(transcription, lcode=lang_to_iso[language])
        except Exception:
            # Best effort: keep the row with an empty romanization so it can be
            # inspected afterwards. Catching Exception (not a bare except) lets
            # KeyboardInterrupt still stop this long-running loop.
            roman = ''

    # Shared post-processing: lowercase, apply the character map, collapse whitespace.
    roman = roman.lower()
    roman = roman.translate(invalid_transmap)
    roman = re.sub(r'\s+', ' ', roman)
    roman = roman.strip()

    test_records.append({
        'id': id_,
        'file path': os.path.join(fleurs_audio_dir, 'test', filename),
        'raw transcription': raw_transcription,
        'transcription': transcription,
        'roman': roman,
        'duration': duration,
        'language': language,
        'language id': language_id,
        'language group': language_group,
        'gender': gender,
    })

# Build the frame once from the collected records.
test_roman = pd.DataFrame(test_records, columns=roman_columns)

Processing rows: 100%|██████████| 70684/70684 [03:34<00:00, 330.05it/s] 


In [34]:
# Keep only the rows whose raw transcription contains no digit.
test_raw_has_digit = test_roman['raw transcription'].str.contains(r'\d', regex=True)
test_roman = test_roman.loc[~test_raw_has_digit].reset_index(drop=True)

In [35]:
# Keep only the rows whose romanization consists solely of a-z, whitespace and
# apostrophes (an empty string also matches this pattern).
test_roman_is_clean = test_roman['roman'].str.fullmatch(r"[a-z\s']*")
test_roman = test_roman.loc[test_roman_is_clean].reset_index(drop=True)

In [36]:
# Character inventory of the romanized test text.
all_text = ''.join(test_roman['roman'].dropna())
char_frequency = Counter(all_text)

# Characters keyed by count, most frequent first.
sorted_char_frequency = dict(char_frequency.most_common())
sorted_chars = sorted_char_frequency.keys()

# Print the distinct characters in sorted order.
print(sorted(sorted_chars))

[' ', "'", 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']


In [37]:
# Show any test rows whose romanization ended up empty.
test_roman.loc[test_roman['roman'].eq('')]

Unnamed: 0,id,file path,raw transcription,transcription,roman,duration,language,language id,language group,gender


In [38]:
# True if any cell of the romanized test frame is missing (NaN/None).
test_roman.isna().any().any()

False

In [39]:
# Write the filtered, romanized test split without the index column.
test_output_path = '../fleurs-r/test.csv'
test_roman.to_csv(test_output_path, index=False)