In this Notebook we will:

* extract data from the [Mozilla Common Voice](https://commonvoice.mozilla.org/it?gclid=Cj0KCQiA2-2eBhClARIsAGLQ2RlkVJtTFkEemoK3FvlpTxtFwuXvAHGOHadvXjzcbrx-R2Jw9eNdES8aAhcPEALw_wcB) corpus through the Hugging Face Hub
* build a multilingual corpus
* transcribe the corpus through the [WebMAUS Basic](https://clarin.phonetik.uni-muenchen.de/BASWebServices/interface/WebMAUSBasic) tool


Our corpus will consist of approximately 20 hours of speech data.

### Installation

In [1]:
%%capture 
!pip install datasets==2.1
!pip install transformers==4.18 
!pip install huggingface_hub==0.5.1 
!pip install torchaudio==0.11  
!pip install librosa 
!pip install jiwer   
!git config --global credential.helper store 
!apt install git-lfs
!%pip install sox

In [8]:
import pandas as pd
import re
import torch
import json
import os
import numpy as np
from typing import Any, Dict, List, Optional, Union
from IPython.display import display, HTML
from transformers import Wav2Vec2ForCTC, Wav2Vec2CTCTokenizer, AutoModelForCTC, Wav2Vec2Processor
from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional, Union
from datasets.utils.version import Version
from datasets import load_dataset, load_metric, Audio
from torch import Tensor
import sys
import argparse
import requests
import glob

### Load Common Voice datasets

The corpus will be composed of Italian, Spanish and French data; if you want to work with other languages change the language id according to the data that you need. 


In [9]:
# Processing functions

def computeTotLen(dataset):
  """Return the total duration of the corpus in seconds.

  Every entry of ``dataset['audio']`` contributes its number of samples
  (``len(entry['array'])``) divided by its ``entry['sampling_rate']``.
  """
  durations = [
      len(clip['array']) / clip['sampling_rate']
      for clip in dataset['audio']
  ]
  return sum(durations)

# functions to clean the text 

# Character class with every punctuation mark / symbol that is stripped from the sentences.
# Written as a raw string: the backslashes are escapes for the regex engine, and in a
# normal string literal most of them are invalid Python escape sequences (which newer
# Python versions warn about). The set of matched characters is unchanged.
chars_to_remove_regex = r'[\,\#\?\.\!\-\;\:\"\“\%\‘\”\�\°\(\)\–\…\\\[\]\«\»\\\/\^\<\>\~\_\-\¿\¡\—]'

def remove_special_characters(batch):
    """Delete the characters matched by ``chars_to_remove_regex`` and lowercase the text.

    Args:
        batch: mapping with a ``"sentence"`` string; it is modified in place.

    Returns:
        The same ``batch`` object with the cleaned, lowercased ``"sentence"``.
    """
    batch["sentence"] = re.sub(chars_to_remove_regex, '', batch["sentence"]).lower()
    return batch

def replace_hatted_characters(batch):
    """Replace every typographic apostrophe in ``batch["sentence"]`` with a plain one.

    The batch is updated in place and returned.
    """
    sentence = batch["sentence"]
    normalised = re.sub('[’]', "'", sentence)
    batch["sentence"] = normalised
    return batch

#### Italian data

In [None]:
print('*------- working on IT dataset -------*')

# Load a fixed-size slice of every split (train, test, validation - in that order)
# and immediately drop the metadata columns that the rest of the notebook does not use.

common_voice_trainIT, common_voice_testIT, common_voice_validationIT = (
    load_dataset("mozilla-foundation/common_voice_11_0", 'it', split=split_slice).remove_columns(
        ["accent", "age", "client_id", "down_votes", "gender", "locale", "segment", "up_votes"]
    )
    for split_slice in ("train[:7000]", "test[:1400]", "validation[:1400]")
)

*------- working on IT dataset -------*


AttributeError: 'StreamingDownloadManager' object has no attribute 'is_streaming'

In [None]:
# Size of the corpus before filtering: number of clips and total audio seconds per split.

print('dataset initial len: ',len(common_voice_trainIT), len(common_voice_testIT), len(common_voice_validationIT))

ITlen_pre_filterTR, ITlen_pre_filterTST, ITlen_pre_filterVAL = (
    computeTotLen(subset)
    for subset in (common_voice_trainIT, common_voice_testIT, common_voice_validationIT)
)

print(f'IT dataset len in sec pre filter TRAIN: {ITlen_pre_filterTR} - TEST: {ITlen_pre_filterTST} - VAL {ITlen_pre_filterVAL}')

In [None]:
print('*------- cleaning text -------*')

# Pass 1: strip punctuation/symbols and lowercase (train, test, validation).
common_voice_trainIT, common_voice_testIT, common_voice_validationIT = (
    subset.map(remove_special_characters)
    for subset in (common_voice_trainIT, common_voice_testIT, common_voice_validationIT)
)

# Pass 2: normalise typographic apostrophes (train, test, validation).
common_voice_trainIT, common_voice_testIT, common_voice_validationIT = (
    subset.map(replace_hatted_characters)
    for subset in (common_voice_trainIT, common_voice_testIT, common_voice_validationIT)
)

print('----------cleaning text done-------*')

In [None]:
# Keep only the clips that last less than `max_input_length_in_sec` seconds.

print('*------- filtering by length -------*')

max_input_length_in_sec = 7.0

common_voice_trainIT, common_voice_testIT, common_voice_validationIT = (
    subset.filter(lambda x: len(x['audio']['array'])/x['audio']['sampling_rate'] < max_input_length_in_sec, num_proc=5)
    for subset in (common_voice_trainIT, common_voice_testIT, common_voice_validationIT)
)

print(f'number of files: TRAIN: {len(common_voice_trainIT)}, TEST:  {len(common_voice_testIT)}, VAL: {len(common_voice_validationIT)}')

# Total audio seconds that survived the filter, per split.
ITlen_post_filterTR, ITlen_post_filterTST, ITlen_post_filterVAL = (
    computeTotLen(subset)
    for subset in (common_voice_trainIT, common_voice_testIT, common_voice_validationIT)
)

print(f'IT dataset len in sec post filter TRAIN: {ITlen_post_filterTR} - TEST: {ITlen_post_filterTST} - VAL {ITlen_post_filterVAL}')

print('*-------- audio filtering done --------*')


#### Spanish data

In [None]:
print('*------- working on ES dataset -------*')

# Load a fixed-size slice of every split (train, test, validation - in that order)
# and drop the metadata columns that are not used afterwards.
common_voice_trainES, common_voice_testES, common_voice_validationES = (
    load_dataset("mozilla-foundation/common_voice_11_0", 'es', split=split_slice).remove_columns(
        ["accent", "age", "client_id", "down_votes", "gender", "locale", "segment", "up_votes"]
    )
    for split_slice in ("train[:7000]", "test[:1400]", "validation[:1400]")
)

print(len(common_voice_trainES), len(common_voice_testES), len(common_voice_validationES))

# Total audio seconds per split before filtering.
ESlen_pre_filterTR, ESlen_pre_filterTST, ESlen_pre_filterVAL = (
    computeTotLen(subset)
    for subset in (common_voice_trainES, common_voice_testES, common_voice_validationES)
)

print(f'ES dataset len in sec pre filter TRAIN: {ESlen_pre_filterTR} - TEST: {ESlen_pre_filterTST} - VAL {ESlen_pre_filterVAL}')


print('*------- cleaning text -------*')

# Pass 1: strip punctuation/symbols and lowercase.
common_voice_trainES, common_voice_testES, common_voice_validationES = (
    subset.map(remove_special_characters)
    for subset in (common_voice_trainES, common_voice_testES, common_voice_validationES)
)

# Pass 2: normalise typographic apostrophes.
common_voice_trainES, common_voice_testES, common_voice_validationES = (
    subset.map(replace_hatted_characters)
    for subset in (common_voice_trainES, common_voice_testES, common_voice_validationES)
)

print('----------cleaning text done-------*')


## --------------- AUDIO LEN FILTER

print('*------- filtering by length -------*')

# `max_input_length_in_sec` is the threshold defined in the Italian filtering cell.
common_voice_trainES, common_voice_testES, common_voice_validationES = (
    subset.filter(lambda x: len(x['audio']['array'])/x['audio']['sampling_rate'] < max_input_length_in_sec, num_proc=5)
    for subset in (common_voice_trainES, common_voice_testES, common_voice_validationES)
)

print(f'number of files: TRAIN: {len(common_voice_trainES)}, TEST:  {len(common_voice_testES)}, VAL: {len(common_voice_validationES)}')

# Total audio seconds per split after filtering.
ESlen_post_filterTR, ESlen_post_filterTST, ESlen_post_filterVAL = (
    computeTotLen(subset)
    for subset in (common_voice_trainES, common_voice_testES, common_voice_validationES)
)

print(f'ES dataset len in sec post filter TRAIN: {ESlen_post_filterTR} - TEST: {ESlen_post_filterTST} - VAL {ESlen_post_filterVAL}')


print('*-------- audio filtering done --------*')


#### French data

In [None]:
print('*------- working on FR dataset -------*')

# Load a fixed-size slice of every split (train, test, validation - in that order)
# and drop the metadata columns that are not used afterwards.
common_voice_trainFR, common_voice_testFR, common_voice_validationFR = (
    load_dataset("mozilla-foundation/common_voice_11_0", 'fr', split=split_slice).remove_columns(
        ["accent", "age", "client_id", "down_votes", "gender", "locale", "segment", "up_votes"]
    )
    for split_slice in ("train[:7000]", "test[:1400]", "validation[:1400]")
)

print(len(common_voice_trainFR), len(common_voice_testFR), len(common_voice_validationFR))

# Total audio seconds per split before filtering.
FRlen_pre_filterTR, FRlen_pre_filterTST, FRlen_pre_filterVAL = (
    computeTotLen(subset)
    for subset in (common_voice_trainFR, common_voice_testFR, common_voice_validationFR)
)

print(f'FR dataset len in sec pre filter TRAIN: {FRlen_pre_filterTR} - TEST: {FRlen_pre_filterTST} - VAL {FRlen_pre_filterVAL}')


print('*------- cleaning text -------*')

# Pass 1: strip punctuation/symbols and lowercase.
common_voice_trainFR, common_voice_testFR, common_voice_validationFR = (
    subset.map(remove_special_characters)
    for subset in (common_voice_trainFR, common_voice_testFR, common_voice_validationFR)
)

# Pass 2: normalise typographic apostrophes.
common_voice_trainFR, common_voice_testFR, common_voice_validationFR = (
    subset.map(replace_hatted_characters)
    for subset in (common_voice_trainFR, common_voice_testFR, common_voice_validationFR)
)

print('----------cleaning text done-------*')


## --------------- AUDIO LEN FILTER

print('*------- filtering by length -------*')

# `max_input_length_in_sec` is the threshold defined in the Italian filtering cell.
common_voice_trainFR, common_voice_testFR, common_voice_validationFR = (
    subset.filter(lambda x: len(x['audio']['array'])/x['audio']['sampling_rate'] < max_input_length_in_sec, num_proc=5)
    for subset in (common_voice_trainFR, common_voice_testFR, common_voice_validationFR)
)

print(f'number of files: TRAIN: {len(common_voice_trainFR)}, TEST:  {len(common_voice_testFR)}, VAL: {len(common_voice_validationFR)}')

# Total audio seconds per split after filtering.
FRlen_post_filterTR, FRlen_post_filterTST, FRlen_post_filterVAL = (
    computeTotLen(subset)
    for subset in (common_voice_trainFR, common_voice_testFR, common_voice_validationFR)
)

print(f'FR dataset len in sec post filter TRAIN: {FRlen_post_filterTR} - TEST: {FRlen_post_filterTST} - VAL {FRlen_post_filterVAL}')


print('*-------- audio filtering done --------*')

### Automatic phonological transcriptions - WebMAUS Basic

In [None]:
def filter_length(audio_dataset):

    """Collect filename, location, length and text of each file in the corpus.

    Args:
        audio_dataset: iterable of examples, each with an ``'audio'`` mapping
            (``'path'``, ``'array'``, ``'sampling_rate'``) and a ``'sentence'``.

    Returns:
        A list with one dictionary per example:
          * ``'filename'``: the audio path as stored in the example;
          * ``'path'``: the same audio path, kept untouched so the file can still
            be located after remove_path() has shortened ``'filename'``
            (webMAUS_processing() reads this key);
          * ``'length'``: duration in seconds (samples / sampling rate);
          * ``'text'``: the sentence of the example.
    """

    list_ds = [ ]

    for el in audio_dataset:
        audio = el['audio']
        list_ds.append({
            'filename': audio['path'],
            'path': audio['path'],
            'length': len(audio['array'])/audio['sampling_rate'],
            'text': el['sentence'],
        })

    return list_ds


def remove_path(list_ds):

    """Reduce each entry's 'filename' to its last path component.

    The dictionaries are modified in place; the same list is returned.
    """
    for entry in list_ds:
        _, base_name = os.path.split(entry['filename'])
        entry['filename'] = base_name
    return list_ds


def filename_list(info_list):
    """Return the 'filename' of every entry of ``info_list`` and print how many there are."""
    filenames_l = [entry['filename'] for entry in info_list]
    print(len(filenames_l))
    return filenames_l


def sent2textf(info_list, out_dir='.'):

    """Write the text of each corpus file to ``<name>.txt``.

    Args:
        info_list: iterable of dictionaries with a ``'filename'`` (audio file
            name) and a ``'text'`` (sentence) entry.
        out_dir: directory the .txt files are written to; defaults to the
            current directory, as before.

    The .txt name is the audio file name with its extension removed, the same
    rule webMAUS_processing() uses to find the text file again. The previous
    ``re.sub('.mp3', '', ...)`` treated '.' as a wildcard and removed any
    "<char>mp3" sequence anywhere in the name.
    """

    for audio_f in info_list:
        stem = os.path.splitext(audio_f['filename'])[0]
        # explicit UTF-8: the sentences contain accented characters
        with open(os.path.join(out_dir, stem + '.txt'), 'w', encoding='utf-8') as f:
            f.write(audio_f['text'])


def webMAUS_processing(txt_path, out_format, out_path, info_dict, language = "ita-IT"):

    """Send each audio/text pair to WebMAUS Basic and save the file it returns.

    Args:
        txt_path: prefix (directory with trailing separator) of the ``<name>.txt`` files.
        out_format: value of the OUTFORMAT field; also used as the extension of
            the saved file (e.g. ``'csv'``).
        out_path: prefix (directory with trailing separator) for the saved
            files; its directory is created when missing.
        info_dict: iterable of dictionaries with a ``'filename'`` entry and,
            preferably, a ``'path'`` entry holding the full audio path. When
            ``'path'`` is absent, ``'filename'`` is used to locate the audio.
        language: value of the LANGUAGE field.

    Files for which the service returns no download link are reported and
    skipped. The number of saved files is printed at the end.
    """

    url = "https://clarin.phonetik.uni-muenchen.de/BASWebServices/services/runMAUSBasic"
    # the service answers with a small document that contains the URL of the result
    url_pattern = re.compile(r'https?://[^\s<>"]+')

    out_dir = os.path.dirname(out_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    counter = 0
    for filen in info_dict:
        filename = os.path.splitext(filen['filename'])[0]
        print(filename)
        textfile = txt_path+filename+'.txt'
        print(textfile)
        # NOTE(review): the audio is looked up in the first sub-directory of the
        # directory of the stored path - confirm this matches your cache layout.
        source_path = filen.get('path', filen['filename'])
        mp3file = glob.glob(os.path.dirname(source_path)+'/*/')[0]+filename+'.mp3'
        print(mp3file)

        # binary mode for both uploads; handles are closed as soon as the request is done
        with open(mp3file, "rb") as audio_fh, open(textfile, "rb") as text_fh:
            formdata = {
                "SIGNAL": (os.path.split(mp3file)[-1], audio_fh, "audio/x-mp3"),
                "TEXT": (os.path.split(textfile)[-1], text_fh, "text/txt"),

                "LANGUAGE": (None, language),
                "OUTFORMAT": (None, out_format)
            }
            res = requests.post(url, files=formdata)

        found = url_pattern.search(res.text)
        if found is None:
            print('no download link returned for', filename, '-', res.text)
            continue
        link = found.group(0)

        r = requests.get(link, allow_redirects=True)
        with open(out_path+filename+'.'+out_format, 'wb') as out_fh:
            out_fh.write(r.content)
        counter +=1
    print('processed files: ', counter)

(1) store the information about each file in a list of dictionaries

In [None]:
# One metadata record per clip for each Italian subset, with the directory part
# removed from the stored file name.
info_common_voice_trainIT = remove_path(filter_length(common_voice_trainIT))

info_common_voice_testIT = remove_path(filter_length(common_voice_testIT))

info_common_voice_validationIT = remove_path(filter_length(common_voice_validationIT))

(2) Create a directory to store the .txt files corresponding to the transcriptions. <br/>
It is convenient to store each subset in a separate directory.

In [None]:
!mkdir /my_txt_directory_train
!mkdir /my_txt_directory_test
!mkdir /my_txt_directory_val

(3) store the sentences of the corpus in separate .txt files and save them in a directory. 

In [None]:
# sent2textf() writes into the current working directory, so move into each
# subset's directory before calling it. (`!pwd <dir>` only printed the current
# directory: all three subsets ended up in the same folder.)
_start_dir = os.getcwd()

os.chdir('/my_txt_directory_train')
sent2textf(info_common_voice_trainIT)

os.chdir('/my_txt_directory_test')
sent2textf(info_common_voice_testIT)

os.chdir('/my_txt_directory_val')
sent2textf(info_common_voice_validationIT)

# go back so the following cells run from the original location
os.chdir(_start_dir)


(4) Call the `webMAUS_processing` function for each subset to store the CSV files with the transcriptions of every file of the corpus in the output directory.

Note: this process can take a while. If you are working with a large number of files, it may be more convenient to download the audio and text files to your local machine and generate the CSV files directly from the WebMAUS Basic interface.


In [None]:
# Transcribe the three Italian subsets in turn (train, test, validation):
# text directory, CSV output directory and metadata list of each subset.
for txt_dir, csv_dir, subset_info in (
    ('/my_txt_directory_train/', '/my_csv_directory_train/', info_common_voice_trainIT),
    ('/my_txt_directory_test/', '/my_csv_directory_test/', info_common_voice_testIT),
    ('/my_txt_directory_val/', '/my_csv_directory_val/', info_common_voice_validationIT),
):
    webMAUS_processing(txt_dir, 'csv', csv_dir, subset_info)

### csv file processing

In [None]:
# Phonological transcription is just the set of a list 

def clean_transcr(seq):

    """Return the items of ``seq`` without duplicates, keeping first-occurrence order.

    Intended for the phonological column of a MAUS CSV, where the same word is
    repeated in several cells (one per time slice of the word).
    """

    # a dict keeps insertion order and holds each key only once
    return list(dict.fromkeys(seq))



def get_transcriptions(csv_dir='.'):


    """Extract the phonological and phonetic transcription from every MAUS CSV.

    Args:
        csv_dir: directory holding the CSV files (``';'``-separated, with the
            columns TOKEN, KAN and MAU); defaults to the current directory.

    Returns:
        A list with one dictionary per CSV file (sorted by file name):
          * ``'filename'``: CSV file name without extension;
          * ``'phonl_tr'``: KAN value of each word token (inner spaces removed),
            joined by spaces;
          * ``'phont_tr'``: MAU segments concatenated per word token, joined by
            spaces.

    Words are delimited with the TOKEN column in both transcriptions, so a word
    that occurs twice in a sentence is kept twice (removing duplicates over the
    whole sentence dropped such repetitions from the phonological one).
    """

    # only the CSV files: other files in the folder (e.g. .txt, .pkl) are not MAUS output
    files = sorted(
        name for name in os.listdir(csv_dir)
        if name.lower().endswith('.csv') and os.path.isfile(os.path.join(csv_dir, name))
    )
    print(len(files))

    df_list = [ ]
    for csv_name in files:
        df = pd.read_csv(os.path.join(csv_dir, csv_name), sep=';')

        phonol_words = [ ]      # one KAN entry per word token
        phon_words = [ ]        # one concatenated MAU string per word token
        last_kan_token = None   # token whose KAN has already been stored
        current_token = None    # token whose MAU segments are being collected
        word = ""
        for token, kan, mau in zip(df['TOKEN'], df['KAN'], df['MAU']):
            # phonological transcription: KAN is repeated on every row of a token
            if isinstance(kan, str) and token != last_kan_token:
                phonol_words.append("".join(kan.split()))
                last_kan_token = token

            # phonetic transcription: rows with TOKEN == -1 belong to no word
            if token == -1:
                continue
            if token != current_token:
                if word != "":
                    phon_words.append(word)
                current_token = token
                word = ""
            word = word + mau

        # flush the last word of the file
        if word != "":
            phon_words.append(word)

        # print('phonological: ', phonol_sent, 'phonetic: ', phon_sent)
        df_list.append({
            'filename': os.path.splitext(csv_name)[0],
            'phonl_tr': ' '.join(phonol_words),
            'phont_tr': ' '.join(phon_words),
        })

    return df_list


def transcriptions_df(setname, out_dir=''):
    """Build the transcription table of one subset and pickle it.

    Args:
        setname: prefix of the pickle file name (e.g. ``'it_train'``).
        out_dir: path prefix (directory with trailing separator) of the pickle;
            with the default the file is written to the current directory.

    Returns:
        The DataFrame built from get_transcriptions(), which is also saved as
        ``<out_dir><setname>df_transcriptions.pkl``.
    """
    import pickle as pkl  # local import: `pkl` is not provided by the notebook's import cell

    transcr_list = get_transcriptions()
    transcr_df =  pd.DataFrame(transcr_list)
    with open(f'{out_dir}{setname}df_transcriptions.pkl', "wb") as fp:   #Pickling
        pkl.dump(transcr_df, fp)
    return transcr_df



In [None]:
# get_transcriptions() parses the CSV files of the *current working directory*,
# so step into each subset's WebMAUS output folder before building its table.
# The pickles are written to the directory this cell was started from.
notebook_dir = os.getcwd()

for setname, csv_dir in (('it_train', '/my_csv_directory_train/'),
                         ('it_test', '/my_csv_directory_test/'),
                         ('it_val', '/my_csv_directory_val/')):
    os.chdir(csv_dir)
    try:
        transcriptions_df(setname, notebook_dir + os.sep)
    finally:
        os.chdir(notebook_dir)

# Load the tables under the names used by the transcription-cleaning cells.
IT_train_df = pd.read_pickle('it_traindf_transcriptions.pkl')
IT_test_df = pd.read_pickle('it_testdf_transcriptions.pkl')
IT_val_df = pd.read_pickle('it_valdf_transcriptions.pkl')

### Transcriptions cleaning

Acronyms, abbreviations, non-alphabetic characters and numbers written as digits are problematic because their written form does not correspond directly to what is actually pronounced. Foreign words, proper nouns, toponyms, and onomatopoeias are also a source of issues: they may be composed of grapheme sequences that are unusual for a given language and, in addition, some graphemes may have no counterpart in the phoneme inventory used to define the transcription rules implemented in the software.

In [None]:
def transcrCleanerMltLng(string, lang = 'es'):

  """Normalise a transcription string.

  Steps, in order:
    1. remove the '<usb>' and '<p:>' markers;
    2. turn '_' into a space, then remove any remaining '#', '_', '<', '>' and '?';
    3. rewrite 'q' as 'k' and 'ù' as 'u';
    4. language-specific clean-up:
       * 'it': 'ch' becomes 'k'; every run of characters other than A-Z/a-z
         becomes a single space; the result is stripped;
       * 'fr': 'r' becomes 'R'; every run of characters other than A-Z/a-z,
         '2', '9', '@' and '~' becomes a single space; the result is stripped;
       * any other value (including the default 'es'): nothing more is done,
         so whitespace is left as it is.

  Returns the cleaned string.
  """

  cleaned = re.sub(r'<usb>', '', string)
  cleaned = re.sub(r'<p:>', '', cleaned)
  cleaned = re.sub('_', ' ', cleaned)
  cleaned = re.sub('[#_<>]', '', cleaned)

  # str.replace is a no-op when the substring is absent, no membership test needed
  cleaned = cleaned.replace('?', '').replace('q', 'k').replace('ù', 'u')

  if lang == 'it':
    cleaned = cleaned.replace('ch', 'k')
    cleaned = re.sub(r'[^a-zA-Z]+', ' ', cleaned).strip()

  elif lang == 'fr':
    # (the debugging print that fired for every string containing 'r' was removed)
    cleaned = cleaned.replace('r', 'R')
    cleaned = re.sub(r'[^a-zA-Z29@~]+', ' ', cleaned).strip()

  return cleaned


def find_acronyms(batch):

  """Locate suspicious words in the phonological transcriptions.

  A word is reported when it is longer than one character and contains none of
  the symbols in the global ``nucleus_scw``. Returns a list of
  ``[index, words]`` pairs, where ``index`` is the position of the example in
  ``batch['phonl_tr']``, so the sentence can be retrieved and checked in context.
  """

  flagged = [ ]
  for idx, transcription in enumerate(batch['phonl_tr']):
    suspicious = [
        token for token in transcription.split()
        if len(token) > 1 and not any(v in token for v in nucleus_scw)
    ]
    if suspicious:
      flagged.append([idx, suspicious])

  return flagged

def most_frequent(List):

    """Return the most frequent item of ``List`` (e.g. the most common acronym).

    Items are counted in a single pass. When several items share the highest
    count, the one that appears first in ``List`` is returned, so the result no
    longer depends on set iteration order.

    Raises:
        ValueError: if ``List`` is empty.
    """

    if len(List) == 0:
        raise ValueError("most_frequent() arg is an empty sequence")

    counts = {}
    for item in List:
        counts[item] = counts.get(item, 0) + 1
    # max() keeps the first maximal key; dicts iterate in insertion order
    return max(counts, key=counts.get)

def affricates(str_chrs):
  """Re-join affricate symbols that were split into space-separated characters.

  The replacements are applied one after the other in the order listed, with
  the geminate forms before the corresponding simple ones.
  """
  ordered_replacements = (
      ("d d z", 'ddz'),
      ("d d Z", 'ddZ'),
      ("d z", 'dz'),
      ("d Z", 'dZ'),
      ("t t s", 'tts'),
      ("t t S", 'ttS'),
      ("t s", 'ts'),
      ("t S", 'tS'),
  )

  x = str_chrs
  for spaced, joined in ordered_replacements:
    x = x.replace(spaced, joined)

  return x

def affricates_nasV(str_chrs):
  """Re-join affricates and nasal vowels that were split into space-separated characters.

  The replacements are applied one after the other in the order listed:
  affricates first (geminate forms before simple ones), then the nasal vowels.
  """
  ordered_replacements = (
      ("d d z", 'ddz'),
      ("d d Z", 'ddZ'),
      ("d z", 'dz'),
      ("d Z", 'dZ'),
      ("t t s", 'tts'),
      ("t t S", 'ttS'),
      ("t s", 'ts'),
      ("t S", 'tS'),
      # nasals
      ("e ~", 'e~'),
      ("a ~", 'a~'),
      ("o ~", 'o~'),
      ("9 ~", '9~'),
  )

  x = str_chrs
  for spaced, joined in ordered_replacements:
    x = x.replace(spaced, joined)

  return x


def fix_transcriptions(sent, acr, abbr, oth):

  """Replace the wrong word transcriptions of a sentence.

  Args:
      sent: transcription string; words are separated by whitespace.
      acr: language-dependent spelling dictionary (symbol -> spelled-out form).
      abbr: abbreviation dictionary (word -> replacement).
      oth: custom dictionary of other inconsistencies (word -> replacement).

  For each word, the first rule that applies is used:
    1. the word is a key of ``oth``  -> its ``oth`` value;
    2. the word is a key of ``abbr`` -> its ``abbr`` value;
    3. the word is longer than one character and contains none of the symbols
       in the global ``nucleus_scw`` -> it is split into symbols (affricates
       and nasal vowels kept together by affricates_nasV) and each symbol is
       replaced by its ``acr`` value; the pieces are concatenated;
    4. otherwise the word is kept as it is.

  Returns:
      The fixed words joined by single spaces.

  Raises:
      KeyError: if a symbol handled by rule 3 is not a key of ``acr``.
  """

  fixed_words = [ ]

  for word in sent.split():
    if word in oth:
      # direct lookup: the word is data, not a regular-expression pattern
      fixed_words.append(oth[word])
    elif word in abbr:
      fixed_words.append(abbr[word])
    elif len(word) > 1 and not any(v in word for v in nucleus_scw):
      # separate the characters, then re-join multi-character symbols
      symbols = affricates_nasV(" ".join(word)).split()
      fixed_words.append(''.join(acr[symbol] for symbol in symbols))
    else:
      fixed_words.append(word)

  return ' '.join(fixed_words)

#### Italian

In [None]:
# cleaning IT transcriptions

# train
IT_train_df['phonl_tr'] = IT_train_df['phonl_tr'].apply(transcrCleanerMltLng, args=('it',))
IT_train_df['phont_tr'] = IT_train_df['phont_tr'].apply(transcrCleanerMltLng, args=('it',))

#test
IT_test_df['phonl_tr'] = IT_test_df['phonl_tr'].apply(transcrCleanerMltLng, args=('it',))
IT_test_df['phont_tr'] = IT_test_df['phont_tr'].apply(transcrCleanerMltLng, args=('it',))

#val
IT_val_df['phont_tr'] = IT_val_df['phont_tr'].apply(transcrCleanerMltLng, args=('it',))
IT_val_df['phonl_tr'] = IT_val_df['phonl_tr'].apply(transcrCleanerMltLng, args=('it',))

# finding problematic word transcriptions

ITacr_train = find_acronyms(IT_train_df) #219
ITacr_test = find_acronyms(IT_test_df) #55
ITacr_val = find_acronyms(IT_val_df) #50

# set of italian problematic transcriptions

# NOTE(review): `all_probs_flat` was not defined in any earlier cell; it is rebuilt
# here as the flat list of words reported for the three subsets - confirm this is
# the intended content.
all_probs_flat = [
    word
    for flagged in (ITacr_train, ITacr_test, ITacr_val)
    for _, words in flagged
    for word in words
]

all_probs_s = list(set(all_probs_flat)) # 89 items
all_probs_s

# checking for a specific transcription on the corpus by index
# NOTE(review): the two lines below use different indices (14 and 22) - confirm.

print(IT_train_df['phonl_tr'][14])
print(IT_train_df['filename'][22])

In [None]:
# language-dependent vocabularies to fix the issues

# Italian spelling table: transcription symbol -> transcription of the letter name.
# Presumably passed as the `acr` argument of fix_transcriptions() - verify against the caller.
IT_spelling = {
    'b': 'bi',
    'tS': 'tSi',
    'ttS': 'tSi',
    'd': 'di',
    'f': 'effe',
    'dZ': 'dZi',
    'dz': 'dzi',
    'g': 'dZi',
    'h': 'akka',
    'j': 'dZei',
    'k': 'kappa',
    'l': 'elle',
    'm': 'emme',
    'n': 'enne',
    'p': 'pi',
    'q': 'ku',
    'r': 'erre',
    's': 'esse',
    't': 'ti',
    'v': 'vu',
    'x': 'iks',
    'y': 'ipsilon',
    'z': 'zeta',
    'w': 'vu',
}

# Italian abbreviation table: transcription produced for an abbreviation -> corrected transcription.
# Presumably passed as the `abbr` argument of fix_transcriptions() - verify against the caller.
IT_abbr = {
    'mr': 'mister',
    'tv': 'tivu',
    'dr': 'dottor',
    'ktml': 'akkatiemmelle',
    'tStSk': 'tSitSiakka',
    'st': 'seint',
    'vs': 'versus',
    'mrs': 'missis',
    'jr': 'dZunior',
    'dj': 'didZei',
    'km': 'kilometri',
    'ms': 'miss',
    'dzld': 'old',
}

           

IT_others = {'skdZps' : 'skajp',
             'diellecistoria' : 'dielletSistorja',
             'btdZps' : 'bitajp',
             'ktdZps' : 'tSitajp',
             'sdZbersorElle' : 'sajbersorElle',
             'sdZbErbullizmo': 'sajbErbullizmo',
             'sdZkes' : 'sajkes',
             'sdZrano' : 'sirano',
             'sdZberterrorizmo' : 'sajberterrorizmo',
             'sdZberpunk' : 'sajberpunk',
             'sdZber' : 'sajber',
             'sdZamalan' : 'Samalan',
             'kk' : 'kejkej',
             'kmr' : 'kmer',
             'pk' : 'pikei',
             'kz' : 'kozo',
             'nsjk' : 'ansik',
             'dzrt' : 'Sort',
             'kl' : 'akkaelle',
             'spdZksr' : 'spaiker',
             'cfr' : 'confronta',
             'mk' : 'emmekappa',
             'gmbk' : 'dZiembieitS',
             'nOn' : 'non',
             'kmmevui' : 'kiwi',
             'krtsisttof' : 'kristof',
             'pdZrmont' : 'pirmont',
             'wbleifozberi' : 'uebleifosberi',
             'dbus' : 'dibus',
             'dbre' : 'debre',
             'dzrta' : 'orta',
             'bdZpass' : 'bajpass',
             'stSubert' : 'Subert',
             'stSule': 'Sule',
             'stSulttts' : 'Sults',
             'stSulte' : 'Sulte',
             'stSumaker' : 'Sumaker',
             'stSults' : 'Sults',
             'stSultse' : 'Sultse',
             'stSuman' : 'Suman',
             'krger' : 'kriger',
             'tjvkezburi' : 'tiwkezberi',
             'ksmen' : 'iksmen',
             'mnkeis' : 'monkeis',
             'mnkees' : 'monkis',
             'mnkei' : 'monkei',
             'sknidsr' : 'Snaider',
             'sknabEl' : 'SnabEl',
             'skneider' : 'Snaider',
             'sknitsler' : 'Snitsler',
             'skneeklOtks' : 'Sneeklots',
             'mtSintdZrs' : 'mekintajrs',
             'skmitts' : 'Smitts',
             'skmitt' : 'Smitt',
             'kkuaua' : 'tSiwawa',
             'bmmevua' : 'biemmevu',
             'mtSkkinlei' : 'mekkinlei',
             'mtSkknigt' : 'mekknigt',
             'mtSkke' : 'mekki',
             'mtSkkenna' : 'mekkenna',
             'mtSkkagan' : 'mekkagan',
             'mtSkkentsje' : 'mekkentsje',
             'mtSkkinnon' : 'mekkinnon',
             'mtSilrOi' : 'mekilrOi',
             'mtSintoS' : 'mekintoS',
             'ksttsjbit' : 'eksibit',
             'sttsabo' : 'dZabo',
             'dlkstOrja' : 'diellecistoria',
             'gstabilita' : 'astabilita',
             'mskif' : 'miseSif',
             'stdaddZer' : 'stritdager',
             'dzbiJJiju' : 'dZibiJiev',
             'vnka' : 'wonka',
             'gdmi' : 'akkadiemmei',
             'llswelin' : 'liwelin',
             'dzdzEdd' : 'dZed',
             'bdZrne' : 'bairne',
             'dvjer' : 'dajer',
             'dvOak' : 'dwoak',
             'dvan' : 'dwan',
             'dvigt' : 'dwaigt',
             'dvains' : 'dwains',
            #  'llobregat' : 'lobregat',
            #  'llorEnte' : 'lorEnte',
             'dllEden' : 'dellEden',
            #  'llano' : 'lano',
            #  'llOid' : 'lOid',
            #  'lleiton' : 'leiton'
             'tkenji' : 'tokendZi',
             'mbour' : 'embour',
             'mkbain' : 'mekbain',
             'mkbEal' : 'mekbEal',
             'gfunk' : 'dZifunk',
             'mri' : 'emme erre i',
             'mmevuenti' : 'emmeventi',
             'dEllnksa ' : 'dEllenne tSi esse a',
             'jvuvjoooo' : 'evovo',
             'njvpOrt' : 'njuport',
             'njvman' : 'njuman',
             'njvgate' : 'njugeit',
             'njvburg' : 'njuburg',
             'gjpsi' : 'gipsi',
             'lga' : 'ellegia',
             'jjuels' : 'dZuels',
             'jjuell' : 'dZuell', 
             'jjaOttsi' : 'jaOttsi',
             'jjang' :'jang',
             'njvmarket' : 'njumarket',
             'dZjparana' : 'dZiparana',
             'dZperjon' : 'ajperjon',
             'pltsen' : 'psen',
             'rvanda' : 'rwanda',
             'prva' : 'prova',
             'kdzrda' : 'korda',
             'kdanSja' : 'kodanSa',
             'mkrae' : 'mekrae',
             'vbrake' : 'vibrejk',
             'tdZpe' : 'tajpe',
             'rdZerson' : 'rajerson',
             'sjrus' : 'sirus',
             'ndZberkssundtrizil' : 'najberkssundtrizil',
             'mjron' : 'miron',
             'bside' : 'abside',
             'ksssoffsset' : 'iksoffset',
             'mjju' : 'miu',
             'mjjako' : 'miako',
             'mjjadZi' : 'miadZi',
             'sjprus' : 'siprus',
             'ljra' : 'lira',
             'fbi' : 'efbiai',
             'mmmevuatalli' : 'muvatalli',
             'srjuzburi' : 'Srjuzburi',
             'dddZe' : 'dodZe',
             'mkbain' : 'mekbain',
             'kbab' : 'kebab',
             'jjfua' : 'ipua',
             'tminus' : 'timinus',
             'gssu' : 'su',
             'rjbakk' : 'ribakk',
             'rmotorspOrt' : 'ErmotorspOrt',
             'gwjbrus' : 'gajbruS',
             'ldZons' : 'lajons',
             'jjujitsu' : 'dZudZitsu',
             'dkelp' : 'Elp',
             'ksbOks' : 'iksbOks',
             'kjjoSSi' : 'kjoSSi',
             'fjrstenbErg' : 'firstenbErg',
             'gveda' : 'rigveda',
             'tjrrEll' : 'tirrEll',
             'tjra' : 'tjra',
             'skglesindZer' : 'SlesiNger',
             'kssenuja' : 'ksenudZa',
             'skvabintvest' : 'Svabingvest',
             'skvartseneddZer' : 'SvartseneddZer',
             'skvartsenbErg' : 'SvartsenbErg',
             'skvarman' : 'Svarman',
             'skverin' : 'Sverin',
             'skvab' : 'Svab',
             'bjron' : 'bajron',
             'ndimensjonali' : 'enne dimensjonali',
             'ndimensjonale' : 'enne dimensjonale',
             'lnezimo' : 'lennezimo',
             'dleagwe' : 'dilig',
             'stpanek' : 'stepanek',
             'dZkjad' : 'dZiad',
             'kttserolungo' : 'kappa dzero breve',
             'kttserobreve' : 'kappa dzero luNgo',
             'ktan' : 'katan',
             'mkaulei' : 'mekaulei',
             'mkfErson' : 'mekfErson',
             'trble' : 'trebol',
             'spdZem' : 'spajem',
             'ksfiles' : 'iksfiles',
             'ksfaktor' : 'iksfaktor',
             'gdinja' : 'gidinja',
             'tsssOlt' : 'tsOlt',
             'tjbreak' : 'tibrejk',
             'mstizlav' : 'mstizlav',
             'dzli' : 'oli',
             'ktsestokowa' : 'tSestokova',
             'tnkeani' : 'tonkeani',
             'tnkean' : 'tonkean',
             'bkar' : 'biar',
             'mliss' : 'emliss',
             'tjrrEll' : 'tirrEll',
             'mdZers' : 'majers',
             'flnom' : 'pnom',
             'kfukuji' : 'kofukuji',
             'dzrinski' : 'dZarinski',
             'tdZke' : 'tajk',
             'ppErelakaise' : 'pErelakaise',
             'ppEre' : 'pEre',
             'lleiton': 'leiton',
              'lluis': 'luis',
              'llOid': 'lOid',
              'llorEnte': 'lorEnte',
              'llanOs': 'lanOs',
              'llobregat': 'lobregat',
              'llano': 'lano',
              'ssukamoto': 'sukamoto',
              'ssujoSSi': 'sujoSSi',
              'sserdZo': 'serdZo',
              'ssebastjen': 'sebastjen',
              'ssukaza': 'sukaza',
              'ssjEnko': 'sjEnko',
              'ssunami': 'sunami',
              'sserje': 'serje',
              'mmevuain': 'mevuain',
              'mmevuitking': 'mevuitking',
              'mmevuaddZer': 'mevuaddZer',
              'mmevuinkle': 'mevuinkle',
              'mmevuins': 'mevuins',
              'mmevuerks': 'mevuerks',
              'mmevuanson': 'mevuanson',
              'mmevuimming': 'mevuimming',
              'mmevuin': 'mevuin',
              'mmevueet': 'mevueet',
              'mmevuente': 'mevuente',
              'mmevangkol': 'mevangkol',
              'mmevuelve': 'mevuelve',
              'mmevue': 'mevue',
              'mmevuaimir': 'mevuaimir',
              'mmevuitter': 'mevuitter',
              'mmevuedenborg': 'mevuedenborg',
              'mmevuartts': 'mevuartts',
              'mmevuista': 'mevuista',
              'mmevuitts': 'mevuitts',
              'mmevuisted': 'mevuisted',
              'mmevuim': 'mevuim',
              'mmevang': 'mevang',
              'mmevuing': 'mevuing',
              'mmevuarr': 'mevuarr',
              'mmevuattsiland': 'mevuattsiland',
              'mmevuan': 'mevuan',
              'mmevuiss': 'mevuiss',
              'mmevuate': 'mevuate',
              'mmevuifttOjOta': 'mevuifttOjOta',
              'mmevinedd': 'mevinedd',
              'mmevuentjEt': 'mevuentjEt',
              'mmevuifmmeven': 'mevuifmmeven',
              'mmevuo': 'mevuo',
              'mmevuentjone': 'mevuentjone',
              'mmeva': 'meva',
              'mmevuajili': 'mevuajili',
              'mmevuings': 'mevuings',
              'mmevuitk': 'mevuitk',
              'mmevuein': 'mevuein',
              'mmevai': 'mevai',
              'mmeveetveetvee': 'meveetveetvee',
              'mmevuittano': 'mevuittano',
              'mmevingli': 'mevingli',
              'mmevuOrd': 'mevuOrd',
              'mmevueep': 'mevueep',
              'SSiimittsu': 'Siimittsu',
              'SSiirlei': 'Siirlei',
              'SSiinano': 'Siinano',
              'SSiiddzuka': 'Siiddzuka',
              'SSiield': 'Siield',
              'SSiiniki': 'Siiniki',
              'SSiittswoka': 'Siittswoka',
              'SSiippingkompani': 'Siippingkompani',
              'SSiigure': 'Siigure',
              'SSiia': 'Siia',
              'SSiitwok': 'Siitwok',
              'mis@Sif' : 'misESif',
              'SSiin': 'Siin',
              'SSiields': 'Siields',
              'SSiirai': 'Siirai',
              'SSiipton': 'Siipton',
              'SSiip': 'Siip',
              'SSiifu': 'Siifu',
              'SSiira': 'Siira',
              'SSiinkansen': 'Siinkansen',
              'SSiining': 'Siining',
              'SSiikari': 'Siikari',
              'SSiiki': 'Siiki',
              'SSiimattsu': 'Siimattsu',
              'SSiiva': 'Siiva',
              'SSiinee': 'Siinee',
              'SSiindzon': 'Siindzon',
              'SSiiina': 'Siiina',
              'SSiibuja': 'Siibuja',
              'SSiimsaiki': 'Siimsaiki',
              'SSiire': 'Siire',
              'SSiipit': 'Siipit',
              'SSiiro': 'Siiro',
              'SSiinagava': 'Siinagava',
              'SSiir': 'Siir',
              'SSiips': 'Siips',
              'SSiif': 'Siif',
              'SSiinbun': 'Siinbun',
              'SSiindZen': 'SiindZen',
              'SSiirakavaiki': 'Siirakavaiki',
              'SSiirats': 'Siirats',
              'SSiiujuku': 'Siiujuku'
                          }

In [None]:
# Run fix_transcriptions over both transcription columns of every Italian
# split, handing it the three Italian vocabularies as extra arguments.
# Processing order: train, test, val; within each split 'phonl_tr' then 'phont_tr'.

for _it_split in (IT_train_df, IT_test_df, IT_val_df):
  for _it_col in ('phonl_tr', 'phont_tr'):
    _it_split[_it_col] = _it_split[_it_col].apply(
        fix_transcriptions,
        args=[IT_spelling, IT_abbr, IT_others],
    )

In [None]:
# Stack the three Italian splits (train, test, val) into a single frame;
# ignore_index=True discards the per-split indices and renumbers the rows.
ITdataset = pd.concat(
    [IT_train_df, IT_test_df, IT_val_df],
    ignore_index=True,
)

# Cell output: column dtypes of the merged frame
ITdataset.dtypes

In [None]:
# saving fixed transcriptions
%cd /mydir

pkl_name = f"fix_itCV_df.pkl"
with open(pkl_name, 'wb') as file:
  # A new file will be created
  pkl.dump([IT_train_df, IT_test_df, IT_val_df], file)

#### Spanish

In [None]:
# Clean both transcription columns of every Spanish split with
# transcrCleanerMltLng (language id 'es'); order: train, test, val.
for _es_split in (ES_train_df, ES_test_df, ES_val_df):
  for _es_col in ('phonl_tr', 'phont_tr'):
    _es_split[_es_col] = _es_split[_es_col].apply(transcrCleanerMltLng, args=['es'])


# Run find_acronyms on each cleaned split (trailing numbers are the author's notes)
ESacr_train = find_acronyms(ES_train_df) #195
ESacr_test = find_acronyms(ES_test_df) #55
ESacr_val = find_acronyms(ES_val_df) #37


# list of ES problematic transcriptions:
# take element [1] of every find_acronyms result (test, then train, then val) ...
ESall_probs = []
for _es_hit in ESacr_test + ESacr_train + ESacr_val:
  ESall_probs.append(_es_hit[1])

# ... and flatten those groups into one list
ESall_probs_flat = []
for _es_group in ESall_probs:
  ESall_probs_flat.extend(_es_group)


# set of ES problematic transcriptions (duplicates removed, order not guaranteed)

ESall_probs_s = [*set(ESall_probs_flat)]
ESall_probs_s # 63 items

In [None]:
# language-dependent vocabularies to fix the issues

# Spanish spelling vocabulary: single transcription symbol -> replacement string.
# Passed as the first vocabulary to fix_transcriptions in the Spanish cells.
# NOTE(review): values look like spelled-out Spanish letter names - confirm.
ES_spelling = {
  "b": "be",
  "B": "Be",
  "tS": "tSe",
  "ts": "tse",
  "ttS": "tSe",
  "d": "de",
  "D": "de",
  "f": "efe",
  "dZ": "dZe",
  "dz": "dze",
  "g": "xe",
  "G": "Ge",
  "h": "atSe",
  "j": "xota",
  "k": "ka",
  "l": "ele",
  "m": "eme",
  "n": "ene",
  "p": "pe",
  "q": "ku",
  "r": "ere",
  "s": "ese",
  "t": "te",
  "T": "Te",
  "v": "uve",
  "x": "ekis",
  "z": "seta",
  "w": "udoBle",
}

# Spanish abbreviation vocabulary: abbreviated transcription -> replacement string.
# Passed as the second vocabulary to fix_transcriptions in the Spanish cells.
ES_abbr = {
  "mr": "mister",
  "tv": "teve",
  "dr": "doktor",
  "bs": "boliBares",
  "st": "seint",
  "vs": "versus",
  "mrs": "missis",
  "xr": "dZunior",  ### VARIATION!
  "dj": "didZei",
  "ms": "miss",
}

           

# Spanish word-level replacement table: faulty transcription string -> replacement.
# Passed as the third vocabulary to fix_transcriptions in the Spanish cells.
# NOTE(review): several keys occur more than once in this literal. Python keeps
# only the LAST value given for a repeated key, so the earlier entries are dead.
# Each repeated key is flagged at its later occurrence below; where the two
# values differ, confirm which one is intended.
ES_others = {'ls' : 'las',
             'mkxee ' : 'makii',  # NOTE(review): key ends with a space, so it is distinct from 'mkxee' further down - confirm it can ever match
             'fljjn' : 'flin',
             'prt' : 'part',
             'ljjn' : 'lin',
             'ljjNk' : 'link',
             'mtn' : 'mawne',
             'iron' : 'airon',
             'bjjrDs' : 'birds',
             'tx' : 'tojo',
             'kf' : 'seefe',
             'kn' : 'kaene',
             'bxrk' : 'bSork',
             'mn' : 'm',
             'rrjj' : 'rii',
             'rrjjs' : 'riis',
             'rrjjtn' : 'ritn',
             'rrjjkjj' : 'riki',
             'gf' : 'gief',
             'mjjst' : 'miist',
             'mGs' : 'emdZies',
             'ft' : 'fit',
             'pjjn' : 'pin',
             'bljjt' : 'blit',
             'kT' : 'seDe',
             'gjjpzjj' : 'dZispi',
             'xjjBskjjl' : 'xiBaskila',
             'kl' : 'sei',
             'jjs' : 'is',
             'lk' : 'ele iGual a ka' ,
             'kDx' : 'seDexe',
             'tjjk' : 'tik',
             'gjjr' : 'gior',
             'ms' : 'mas',  # NOTE(review): 'ms' is also a key of ES_abbr ('miss'); which one applies depends on fix_transcriptions
             'bjjrT' : 'birT',
             'jjldT' : 'dZildZis',
             'bksx' : 'vaxo',
             'tkl' : 'te se ele',
             'jjt' : 'ite',
             'tG' : 'toGo',
             'gwjjnet' : 'gwinet',
             'ntra' : 'nwestra',
             'pjjLis' : 'fillis',
             'tSteaw' : 'tSatow',
             'tSteawDks' : 'tSatow deks',
             'gnostiTizmo' : 'agnostiTizmo',
             'gnekko' : 'nekko',
             'xnjo' : 'dZanjo',
             'TwaTwa' : 'suasua',
             'mkpee' : 'mekpi',
             'wjjman' : 'wiman',
             'ljjl' : 'lajl',
             'stSsistSe' : 'SaSeSwetS',
             'mburukujja' : 'burukujja',
             'mGo' : 'maGo',
             'mNgk' : 'moNke',
             'rrtjjpe' : 'erretaip',
             'tSrjjzler' : 'krizler',
             'bstra' : 'bastra',
             'gtaland' : 'gotaland',
             'kfir' : 'kafir',
             'djjnasti' : 'dinasti',
             'djjson' : 'dajson',
             'djjnamo' : 'dinamo',
             'djjke' : 'dik',
             'djjer' : 'dajer',
             'djjlan' : 'dilan',
             'kljjDe' : 'klajDe',
             'fljjiNg' : 'flaiNg',
             'bjjtekoDe' : 'baitekoDe',
             'bjjtes' : 'baites',
             'bjjnun' : 'binun',
             'bjje' : 'bai',
             'bjjrne' : 'birne',
             'bjjpas' : 'bajpas',
             'bjjron' : 'bajron',
             'bjjuNgun' : 'biuNgun',
             'ljjman' : 'liman',
             'ljjme' : 'lime',
             'kBote' : 'kote',
             'kBe' : 'koBe',
             'mkBriDe' : 'makBriDe',
             'kBa' : 'kuBa',
             'mkkinon' : 'makkinon', 
             'mkkinsej': 'makkinsei',
             'mkkorT' : 'makkorT',
             'mkkeLar' :  'makkeLar', 
             'mkkaGan': 'mekkaGan',
             'mkklajn' : 'makklajn',
             'mkkuneorejstSawer' : 'makkuneorejSawer',
             'mkkartnej' : 'mekkartnej',
             'mkkan' : 'makkan',
             'mkkaj': 'makkaj',
             'mkkoj' : 'makkoj',
             'mkkuLotS' : 'makkuLotS',
             'mkkonawGej' : 'makkonawGej', 
             'mkkuLowG' : 'makkuLowG', 
             'mkkormikk' : 'makkormikk', 
             'mkkandles' : 'makkandles',
             'mkkarti' : 'makkarti',
             'rrjjan' : 'rrajan', 
             'rrjju' : 'rriu', 
             'rrjjo' : 'rrio', 
             'rrjjoko' : 'rrooko', 
             'rrjjwitSi' : 'rriuitSi', 
             'rrjjDBerx': 'rruiDBerx',
             'rrjjle' : 'rrile' ,
              'rrjjoej' : 'rrioxej',
              'rrjjukjju' : 'rriukju',
              'rrjjoNgae' : 'rrioNgae', 
             'rrjjota' : 'rriota', 
             'rrjjuxi' : 'rriuxi',
             'ljjle' : 'lile', 
             'jjlaNgjjlaNg' : 'ilaNgilaNg', 
             'kjjla' : 'kila', 
             'tjjler' : 'tajler', 
             'pjjle' : 'pile',
             'kjjlje' : 'kailje',
             'kjjle' : 'kaile', 
             'mjjles' : 'miles',
             'krjjloB' : 'kriloB',
             'mjjkoplazma': 'mikroplazma',
             'mjjka' : 'mika',
             'dformes' : 'deformes',
             'pjjro' : 'piro',
             'pjjsiks' : 'fisiks',
             'pjjoNgjjaNg' : 'pioNgiaNg',
             'pjjton' : 'piton',
             'pjjsikal' : 'fisikal',
             'JjjkpiNg' : 'JikopiNg',
             'mkaDoo' : 'makaDoo',
             'mklaren' : 'maklaren',
             'mkDonalds' : 'makDonalds',
             'mknamara' : 'maknamara',
             'mkBeal' : 'makBeal',
             'mkmurDo' : 'makmurDo',
             'mkGoBern' : 'makGoBern',
             'mkmaon' : 'makmaon',
             'mkDoneL' : 'makDoneL',
             'mkGee' : 'makGee',
             'mklowGlin' : 'maklowGlin',
             'mkBriDe' : 'makBriDe',  # NOTE(review): repeated key (same value as the earlier 'mkBriDe' entry)
             'mkDermot' : 'makDermot',
             'mkGreGor' : 'makGreGor',
             'mkrae'  : 'makrae',
             'mkmanus' : 'makmanus',
             'mkDowGaL' : 'makDowGaL',
             'mklean' : 'maklean',
             'mkDonald' : 'makDonald',
             'mkpee' : 'makpi',  # NOTE(review): repeated key; this value replaces the earlier 'mekpi'
             'mkxee' : 'magii',
             'mkaLen' : 'makaLen',
             'djjke' : 'dike',  # NOTE(review): repeated key; this value replaces the earlier 'dik'
             'kjjril' : 'siril',
             'tSwa' : 'tSua',
             'tSwan' : 'tSuan',
             'kjjprus' : 'siprus',
             'kjjBorx' : 'sajBor',
             'kjjkliNg' : 'sikliNg',
             'kjjmru' : 'simru',
             'kjjklokros' : 'siklokros',
             'kjjklone' : 'siklone',
             'kjjuSu' : 'kjuSu',
             'kjjoko' : 'kioko',
             'kjjBertron' : 'siBertron',
             'kjjGnus' : 'siGnus',
             'dpto' : 'departamento',
             'brjjTe' : 'briTe',
             'fxorDane' : 'fjorDein',
             'ktSi' : 'kotSi',
             'Jjjmburk' : 'Jimburk',
             'jjriGojjen' : 'iriGojen',
             'mjjrtle' : 'mirtle',
             'gjjrGi' : 'gjorgi',
             'ljjra' : 'lira',
             'tjjrjon' : 'tirjon',
             'kjjril' : 'kiril',  # NOTE(review): repeated key; this value replaces the earlier 'siril'
             'mjjrmeTiTa' : 'mirmeTiTa',
             'nrro' : 'numero',
             'mNkeNglaDBatS' : 'moNxeNglaBax',
             'jjNgxik' : 'joNgSi',
             'jjpres' : 'ipres',
             'jjko' : 'iko',
             'tjjkoon' : 'tikun',
             'mjjka' : 'mika',  # NOTE(review): repeated key (same value as the earlier 'mjjka' entry)
             'rtel' : 'xertel',
             'gDriBe' : 'dZiDraiBe',
             'bSork' : 'bSork',  # key and value are identical
             'pTja' : 'provinsia',
             'kTerni' : 'serni',
             'krjjTek' : 'krisek',
             'mnet' : 'emnet',
             'brjjant' : 'brajant',
             'brjjTe' : 'briTe',  # NOTE(review): repeated key (same value as the earlier 'brjjTe' entry)
             'rrjjle' : 'rrile',  # NOTE(review): repeated key (same value as the earlier 'rrjjle' entry)
             'bxrkman' : 'borkman',
             'lselektina' : 'elselektina',
             'wwwsfGatekon' : 'dobleve dobleve dobleve punto ese efe geit punto kom',
             'krjjstal' : 'kristal',
             'tBjjnoBelas' : 'teve i noBelas',
             'mxolnir' : 'mjolnir',
             'krjjtek' : 'kritek',
             'dnis' : 'denis',
             'ljjDja' : 'liDja',
             'tkon' : 'tekom',
             'lnder' : 'linder',
             'jjutaka' : 'iutaka',
              'jjojjo' : 'iojo',
              'jjuNg' : 'iuNg',
              'jji' : 'ji',
              'jjat' : 'iat', 
             'jjiNg' : 'jiNg',
             'jjaropolk' : 'iaropolk', 
             'jjunta' : 'iunta',
             'jjukari' : 'iukari', 
             'jjari' : 'iari',
             'jjoSiiro' : 'ioSiiro', 
             'jjarakuj' : 'iarakuj', 
             'jjaGwe' : 'iaGwe', 
             'jjeLowxakket' : 'jeLowxakket', 
             'jjon' : 'ion', 
             'jjamaSita' : 'jamaSita', 
             'jjelos' : 'ielos', 
             'jjwiNgs' : 'dZiwiNgs',
             'jjamawtSi' : 'iamawtSi',
             'jjerma' : 'ierma', 
             'jjuka' : 'iuka',
             'jjarozlaB' : 'iarozlaB',
             'jjankee' : 'janki', 
             'jjel' : 'iel', 
             'jjaTen' : 'jaTen', 
             'jjates' : 'jates',
             'jjuko' : 'juko', 
             'jjaGo' : 'jaGo', 
             'jjap' : 'jap', 
             'jjowtuBe' : 'jowtuBe',
             'jjamuna' : 'jamuna', 
             'jjoSijjuki' : 'joSijuki', 
             'jjupaNki' : 'jupaNki', 
             'jjonkers' : 'jonkers', 
             'jjori' : 'jori',
             'jjerro' : 'jerro', 
             'jjamaa' : 'jamaa', 
             'jjusej' : 'jusei', 
             'jjaTimjento' : 'jaTimjento', 
             'jjaTe' : 'jaTe',
             'jjman' : 'iman',
             'jjaTiDies' : 'jaTiDies', 
             'jjandel' : 'jandel', 
             'jjuDiStira' : 'juDiStira',
             'jjokasta' : 'jokasta',
             'jjunsoo' : 'junsoo', 
             'jjoo' : 'joo',
             'jjamane' : 'jamane',
             'jjukje' : 'jukje', 
             'jjukje' : 'jukje',  # NOTE(review): repeated key (same value as the line above)
             'jjeBxeni' : 'jeBxeni', 
             'jjuGozlaBo' : 'juGozlaBo',
             'jjasuSi' : 'jasuSi',
             'jjear' : 'iar', 
             'jje' : 'je', 
             'jjera' : 'jera', 
             'jjerra' : 'jerra', 
             'jjarozlaBl' : 'jarozlaBl',
             'jjorkk' : 'jorkk',
             'jjaBe' : 'jaBe',
             'jjale' : 'jale',
             'jjuGozlaBa' : 'juGozlaBa', 
             'jjes' : 'jes', 
             'jjeso' : 'jeso', 
             'jjo' : 'jo', 
             'jja' : 'ja',
             'jjowrself' : 'iowrself', 
             'jjazmine' : 'jazmine',
             'jjamatanoorotSi' : 'jamatanorotSi',
             'jjBete' : 'jBete', 
             'jjeLow' : 'jeLow', 
             'jjBriT' : 'jBriT', 
             'jjolanda' : 'jolanda', 
             'jjakoBleB' : 'jakoBleB', 
             'jjoSimitsu' : 'joSimitsu', 
             'jjajja' : 'jajja', 
             'jjuDo' : 'juDo',
             'jjenes' : 'jenes', 
             'jjani' : 'jani',
             'jjaNki' : 'jaNki', 
             'jjakarta' : 'dZakarta', 
             'jjenas' : 'jenas',
             'jjandeks' : 'jandeks', 
             'jjwan' : 'iwan', 
             'jjena' : 'jena', 
             'jjamaTaki' : 'jamaTaki', 
             'jjekaterina' : 'jekaterina',
             'jjiaT' : 'jiaT', 
             'jjoGi' : 'joGi', 
             'jjelo' : 'ielo',
             'jju' : 'iu', 
             'jjasukuni' : 'jasukuni',
             'jjukatan' : 'jukatan',
             'jjonrrog' : 'jonrrog', 
             'jjanes': 'janes', 
             'jjemeni' : 'jemeni',
             'jjkosan': 'ikosan',
             'jjeoNgu' : 'jeoNgu',
             'jjeats' : 'jeats',
             'jjuNxin': 'juNxin', 
             'jjajTa' : 'jajTa', 
             'jjuki' : 'juki',
             'jjelmo' : 'jelmo',
             'jjeBra': 'jeBra', 
             'jjoruBa' : 'joruBa',
             'jjuGozlaBja' : 'juGozlaBja', 
             'jjarTa' : 'jarTa', 
             'jjow' : 'jow', 
             'jjorkSire' : 'jorkSire', 
             'jjerBas' : 'jerBas',
             'jjowtuBers' : 'jowtuBers',
             'jjone' : 'jone', 
             'jjurijjeBpolski' : 'jurijjeBpolski', 
             'jjaGisijjan' : 'jaGisijjan', 
             'jjpe' : 'ipe', 
             'jjuTni' : 'juTni', 
             'jjuste' : 'juste',
             'jjamato' : 'jamato', 
             'jjoSiki' : 'joSiki', 
             'jjawri' : 'jawri',
             'jjeti' : 'jeti',
             'jjuGozlaBos' : 'juGozlaBos', 
             'jjaGami' : 'jaGami', 
             'jjerGe' : 'jerGe', 
             'jjaTimjentos' : 'dZaTimjentos', 
             'jjunan' : 'junan',
             'jjankees' : 'jankees',
             'jjork':'jork', 
             'jjak': 'jak',
             'jjamila' : 'jamila', 
             'jjet' : 'jet',
             'jjeserias' : 'jeserias',
             'jjenin' : 'jenin',
             'jjorker' : 'jorker', 
             'jjunus' : 'junus',
             'jjajjo' : 'jajjo', 
             'jjere' : 'jjre', 
             'jjowNger':'jowNger',
             'jjoSiDa' : 'joSiDa',
             'jjaser' : 'jaser', 
             'jjaoo' : 'jau', 
             'jjeas' : 'jeas', 
             'jjusuke' : 'jusuke',
             'jjowNg' : 'jowNg', 
             'jjuGo' : 'juGo', 
             'jjamatoe' : 'jamatoe',
             'jjema' : 'jema', 
             'jjuw' : 'juw', 
             'jjeGwas' : 'jeGwas', 
             'jjea' : 'jea', 
             'jjoGur' : 'joGur',
             'jjermos': 'jermos',
             'jjakuTa' : 'jakuTa',
             'jjamaDa' : 'jamaDa',
             'jjBone' : 'iBone', 
             'jjao' : 'jao',
             'jjeroJjjmus' : 'jeroJjjmus',
             'jjamal' : 'jamal', 
             'jjee' : 'jee', 
             'jjstejn' : 'istejn',
             'jjowt' : 'jowt', 
             'jjpres' : 'ipres',  # NOTE(review): repeated key (same value as the earlier 'jjpres' entry)
             'jjerBen' : 'jerBen', 
             'jjuNgaj' : 'juNgaj', 
             'jjerBe' : 'jerBe',
             'jjwi' : 'iwi',
             'jjukoaTteka' :'jukoaTteka',
             'jjoGa' : 'joGa', 
             'jjriGojjen' : 'iriGojjen',  # NOTE(review): repeated key; this value replaces the earlier 'iriGojen'
             'jjukimura' :'jukimura', 
             'jjuna' : 'juna',
             'jjakis' : 'jakis', 
             'jjuNxoNg' : 'juNxoNg',
             'jjoDo' : 'joDo',
             'jjamaGutSi' : 'jamaGutSi', 
             'jjuxjo' : 'juxjo', 
             'jjuma' : 'juma', 
             'jjorDan' : 'jorDan', 
             'jjamena' : 'jamena', 
             'jjan' : 'jan',
             'jjDro': 'iDro', 
             'jjepes' : 'jepes', 
             'jjazmin' : 'dZazmin', 
             'jjuri' : 'juri', 
             'jjDe': 'iDe', 
             'jjoko' : 'joko',
             'jjamaSiro': 'jamaSiro', 
             'jjoma' : 'joma', 
             'jjNgxik' : 'iNgxik',  # NOTE(review): repeated key; this value replaces the earlier 'joNgSi'
             'jjaNgtSe' : 'jaNgtSe', 
             'jjarT' : 'jarT', 
             'jjeoBil' : 'jeoBil', 
             'jjemas': 'jemas', 
             'jjiSuB' : 'iSuB', 
             'jjomo' : 'jomo', 
             'jjanan' : 'janan', 
             'jjule' : 'jule', 
             'jjuriDja' : 'juriDja',
             'jjonson' : 'dZonson',
             'jjokoama' : 'jokoama',
             'jjukatekas' : 'jukatekas',
             'jjiButi': 'iButi', 
             'jjeoNxeGu' : 'jeoNxeGu',
             'jjendo' : 'jendo',
             'jjerBa' : 'jerBa', 
             'jjoNgle' : 'joNgle', 
             'jjokoTuna' : 'jokoTuna', 
             'jjate' : 'jate', 
             'jjoDuro':'joDuro', 
             'jjowr' : 'jowr',
             'ltDa' : 'eletedea',
             'dBarBara': 'BarBara',
             'dBin' : 'deBin',
             'dBina' : 'deBina',
             'dBoak' : 'deBoak',
             'Llamaron' : 'Lamaron',
             'mkGoBern' : 'mekGoBern',  # NOTE(review): repeated key; this value replaces the earlier 'makGoBern'
             'krTjjsTtof' : 'krisTtof',
             'gsta' : 'gosta',
             'gstaaT' : 'gostaaT',
             'lkDo' : 'elkaDo',
             'TBiGnjew' : 'zpikNw',
             'mjjsterjon' : 'misterjon',
              'mjjanmar' : 'mianmar', 
             'mjjsterjous' : 'misterjous',
             'mjjsterjo' : 'misterjo', 
             'mjjuNgwun' : 'mjuNgwun', 
             'mjjers' : 'miers', 
             'mjjseartSkomaw' : 'miseartSkomaw', 
             'mjjsteri' : 'misteri',
             'mjjstik' : 'mistik', 
             'mjjstike' : 'mistike',
             'wjjne' : 'wine',
             'rrjjDBerx' : 'rriDBerx',  # NOTE(review): repeated key; this value replaces the earlier 'rruiDBerx'
             'rTte' : 'artSe',
             'mlnik' : 'melnik',
             'dto' : 'deteo',
             'pBro' : 'prezbitero',
             'brtjanu' : 'bratjanu',
             'Ngema' :'Nengema',
             'ljjndon' : 'lindon',
             'JjjBorx' : 'sajBorx',
             'xmin' : 'eksmin',
             'kjjGnus' : 'signus',  # NOTE(review): repeated key; this value replaces the earlier 'siGnus'
             'wjjomiNg' : 'wjomiNg',
             'xrok' : 'xotarok',
             'xruStSoB' : 'kruStSoB',
             'mtlej' : 'motlej',
             'wrokaw' : 'Brokav',
             'wriGt' : 'rait',
             'wrajt' : 'rajt',
             'wren' : 'ren',
             'wwweleNgaNkees' : 'doble Be doble Be doble Be punto eleNgaNtSe punto es',
             'wwwfreesoftorx' : 'udoble udoble udoble frisoftorks',
             'wwwTurGajkon' : 'doble Be doble Be doble Be punto surGaj punto kom',
             'tjjsen' : 'tisen',
             'tjjsembornemisTa' : 'tisembornemisTa',
             'mknamara' : 'maknamara',  # NOTE(review): repeated key (same value as the earlier 'mknamara' entry)
             'Sxi' : 'Si',
             'Sxo' : 'So',
             'rrmoT' : 'erremoD',
             'rrtrut' : 'ertrut',
             'kta' : 'kota',
             'brseNkowrjer' : 'borseNkowrjer',
             'xnkp' : 'dZonkopinG',
             'Nxal' : 'Nial',
             'xxer' : 'xer',
             'rrTa' : 'erreTea',
             'jjDro' : 'iDro',  # NOTE(review): repeated key (same value as the earlier 'jjDro' entry)
             'ljjon' : 'ljon',
             'ljjonsajntpawl' : 'ljonsajntpawl',
             'ljjeL' : 'ljeL',
             'ljjNkBurx' : 'ljNkBurx',
             'TjjGmunt': 'TiGmunt',
             'Jjjon' : 'Jjon',
             'Jjjman' : 'Jjman',
             'Jjje' : 'Jje', 
             'JjjBorx' : 'JjBorx',  # NOTE(review): repeated key; this value replaces the earlier 'sajBorx'
             'tSristoper' : 'kristoper', 
             'tSrome' : 'krome',
             'tSristos' : 'kristos', 
             'tSrist' : 'krist', 
             'tSristjan' : 'kristjan',
             'tSrisje' : 'krisje', 
             'tSronikles' : 'kronikles',
             'tSristofer' : 'kristofer', 
             'tSronikle' : 'kronikle', 
             'tSristjane' : 'kristjane',
             'tSrjjzler' : 'krizler',  # NOTE(review): repeated key (same value as the earlier 'tSrjjzler' entry)
             'tSristmas' : 'kristmas', 
             'tSristje' : 'kristje', 
             'tSristina' : 'kristina', 
             'tSrister' : 'krister', 
             'tSristensen' : 'kristensen', 
             'tSris' : 'kris', 
             'tSristine' : 'kristine', 
             'tSromjun' : 'kromjun',
             'tSristop' : 'kristop', 
             'tSristi' : 'kristi',
             'Sperja' : 'eSperja',
             'TrDaf' : 'serDaf',
             'tjjBurn' : 'tiBurn',
             'dJa' : 'deJa',
             'btamer' : 'botamer',
             'kjjlj' : 'kilj',
             'dwjj' : 'dwier',
             'gman' : 'geman',
             'gmina' : 'gemina',
             'gjjmnasjun' : 'gimnasjun',
             'mLe' : 'madmosel',
             'tmoBile' : 'temoBile',
             'kmara' : 'kamara',
             'kmart' : 'komart',
             'kmer' : 'komer',
             'tjjne' : 'tine',
             'tSlons' : 'klons',
             'tSloe' : 'kloe',
             'tBer' : 'teBer',
             'tBjjnoBelas' : 'teBinoBelas',  # NOTE(review): repeated key; this value replaces the earlier 'teve i noBelas'
             'xnkpiNg' : 'dZonkopiN',
             'gjjeoNgsaNg' : 'gjeoNgsaNg',
             'lxuBlxana' : 'ljuBljana',
             'xpop' : 'keipop',
             'krjjpton' : 'kripton', 
             'krjjptonjanos' : 'kriptonjanos', 
             'krjjpto': 'kripto',
             'mtro' : 'metro',
             'lnder' : 'linder',  # NOTE(review): repeated key (same value as the earlier 'lnder' entry)
             'lnai' : 'lonai',
             'sjjtSeDelik' : 'sitSeDelik',
             'sjjtSo' : 'sitSo',
             'dxaNgo' : 'dZaNgo'
             
             } 

In [None]:
# Run fix_transcriptions over both transcription columns of every Spanish
# split, handing it the three Spanish vocabularies as extra arguments.
# Processing order: train, test, val; within each split 'phonl_tr' then 'phont_tr'.
for _es_fix_split in (ES_train_df, ES_test_df, ES_val_df):
  for _es_fix_col in ('phonl_tr', 'phont_tr'):
    _es_fix_split[_es_fix_col] = _es_fix_split[_es_fix_col].apply(
        fix_transcriptions,
        args=[ES_spelling, ES_abbr, ES_others],
    )

In [None]:
# testing - lists should be empty
# Re-run find_acronyms on the fixed splits (train, test, val) and print each
# result; the trailing numbers in the original cell (#195, #55, #37) are the
# author's notes carried over from the first pass.

ESacr_train, ESacr_test, ESacr_val = [
    find_acronyms(_es_df) for _es_df in (ES_train_df, ES_test_df, ES_val_df)
]


for _es_leftover in (ESacr_train, ESacr_test, ESacr_val):
  print(_es_leftover)

In [None]:
# Save the fixed Spanish corpus.
# (The original first line of this cell was bare text, which is a SyntaxError
# in a code cell; it is now a comment.)
# The three splits are pickled together as one list: [train, test, val].
# NOTE(review): `pkl` is not imported in the visible part of the notebook -
# presumably `import pickle as pkl`; confirm it is bound before this cell runs.

pkl_name = "fix_esCV_df.pkl"
with open(pkl_name, 'wb') as file:
  # 'wb' creates the file, or overwrites it if it already exists
  pkl.dump([ES_train_df, ES_test_df, ES_val_df], file)

#### French

In [None]:
# Clean both transcription columns of every French split with
# transcrCleanerMltLng (language id 'fr'); order: train, test, val,
# and within each split 'phonl_tr' then 'phont_tr'.
for _fr_split in (FR_train_df, FR_test_df, FR_val_df):
  for _fr_col in ('phonl_tr', 'phont_tr'):
    _fr_split[_fr_col] = _fr_split[_fr_col].apply(transcrCleanerMltLng, args=['fr'])

In [None]:
# Scan each cleaned French split with find_acronyms.
# (Author's notes on the original lines: 280 / 75 / 81 for train / test / val -
#  presumably the number of flagged items; TODO confirm.)
FRacr_train, FRacr_test, FRacr_val = [
  find_acronyms(fr_split_df)
  for fr_split_df in (FR_train_df, FR_test_df, FR_val_df)
]

In [None]:
# List of FR problematic transcriptions: element [1] of every find_acronyms item,
# taken from the test, train and val results (in that order).
FRall_probs = []
for fr_flagged in FRacr_test + FRacr_train + FRacr_val:
  FRall_probs.append(fr_flagged[1])

# Flatten one level: each collected element is itself iterated.
FRall_probs_flat = []
for fr_group in FRall_probs:
  FRall_probs_flat.extend(fr_group)

# Unique FR problematic transcriptions (order is whatever the set yields).
FRall_probs_s = list(set(FRall_probs_flat))
# FRall_probs_s # 144 items

In [None]:
# language-dependent vocabularies to fix the issues

# First of the three French tables handed to fix_transcriptions.
# Presumably maps an isolated symbol to the transcription of the letter's
# spelled-out name - TODO confirm against fix_transcriptions.
FR_spelling = {
  'b': 'be',
  'B': 'Be',
  'tS': 'tSe',
  'ts': 'tse',
  'ttS': 'tSe',
  'd': 'de',
  'h': 'aS',
  'f': 'ef',
  'dZ': 'dZe',
  'Z': 'Zi',
  'S': 'eS',
  'dz': 'dze',
  'g': 'Ze',
  'j': 'Zi',
  'k': 'ka',
  'l': 'el',
  'm': 'em',
  'n': 'en',
  'p': 'pe',
  'q': 'ku',
  'R': 'ER',
  's': 'es',
  't': 'te',
  'v': 've',
  'w': 'dubleve',
  'x': 'iks',
  'y': 'igRek',
  'z': 'zed',
}

# Second of the three French tables handed to fix_transcriptions.
# Presumably maps the transcription of an abbreviation to the transcription of
# its expanded form - TODO confirm against fix_transcriptions.
FR_abbr = {
  'mr': 'm@sj2',
  'ZR': 'dZunior',
  'dr': 'dOktOr',
  'vs': 'vese',
  'st': 'se~',
  'pp': 'peaSpe',
  'sf': 'seef',
  'gR': 'gRam',
  'msr': 'mo~sEJ9R',
  'lt': 'lj9t@na~',
  'gd': 'gra~',
  'fR': 'fRa~',
  'ps': 'pese',
  'kg': 'kilogRam',
  'mt': 'mo~',
  'sgt': 'sERZ@',
  'mtR': 'metR',
}

           

# Third of the three French tables handed to fix_transcriptions: a hand-written
# string -> string table (presumably faulty transcription -> corrected
# transcription; how it is applied is defined in fix_transcriptions).
#
# NOTE(review): several keys are written more than once in this literal. Python
# keeps only the LAST value of a repeated key (at the position of its first
# occurrence), so the earlier values are dead. Examples:
#   'tR'      : 'tRu'      then 'tRent'
#   'dmHe~Ek' : 'demHe~Ek' then 'dEmHe~Ek'
#   'bRge'    : 'bRogeR'   then 'boRgeR'
#   'dkSaRZ'  : 'dESaRZ'   then 'deSaRZe' (twice)
# NOTE(review): some keys carry surrounding spaces ('klk ', 'vksk ', 'pst ',
# 'snsd ', 'dkst ', ' ksE') and some values a leading space (' endoRoemboli',
# ' peta') - TODO confirm these are intentional.
# NOTE(review): a few entries map a string to itself (e.g. 'pskOv', 'ndiEj',
# 'pti') - presumably to mark them as already correct; verify.
FR_others = {'kt' : 'kE',
             'klk ' : 'kelke',
             'gf' : 'gof',
             'lm' : 'aSelem',
             'kRt' : 'kres',
             'vksk ' : 'v2ty',
             'lRm' : 'lerm',
             'pR' : 'prE', 
             'kRn' : 'kRaon',
             'pst ' : 'aSpeeste',
             'snsd ' : 'seensedeaS',
             'SJ' : 'SEJ@',
             'bl' : 'bol@',
             'tR' : 'tRu',
             'kgpm' : 'seZepeem',
             'dsS' : 'd2tS',
             'fg' : 'fu',
             'kw' : 'kawen',
             'blk' : 'blOS',
             'pksk' : 'p2ty',
             'vlks' : 'volks',
             'gfR' : 'gofR',
             'ks' : 'iks',
             'ft' : 'fest',
             'njk' : 'nik',
             'Zz' : 'Zani',
             'pk' : 'pek',
             'lw' : 'lo' ,
             'dw' : 'dO',
             'kld' : 'klaud',
             'cefa' : 'sefa',
             'cjedei' : 'sjedei',
             'bZZR' : 'bjoRn',
             'dZs' : 'deZees',
             'tR' : 'tRent',  # repeated key: this value replaces 'tRu' given above
             'stRw' : 'stRon',
             'tm' : 'tEm',
             'kn' : 'iksnee',
             'dkst ' : 'duksy',
             'sfb' : 'seefbe',
             'gmb' : 'ZeembeaS',
             'Rv' : 'R@v@',
             'tt' : 'tEt@',
             'sk' : 'sek',
             'sdg' : 'sedeZe',
             'Sw' : 'So~',
             'bRdt' : 'bRo~nt',
             'fw' : 'fo~n',
             'gRv' : 'gr@v@',
             'mm' : 'mEm@',
             'wm' : 'woRms',
             'nn' : 'nu',
             'zn':'zane',
             'tng':'to~',
             'mlv' : 'mel@v',
             'dpk' : 'dEpEs@',
             'lbwE9~' : 'lOb@',
             'dRnEk' : 'dREnEk',
             'spjktOR' : 'spektOR',
             'gRdplas' : 'gRa~dplas',
             'bRnif' : 'bRanif',
             'bRna~' : 'bRenan',
             'bRnERtR' : 'bRynERtRot',
             'tblEz' : 'se~blEz@',
             'gRdpRe' : 'gR9~dpRe',
             'gRdpiER' : 'gRa~dpiER',
             'gRdpREsiJi' : 'gR9~dpREsiJi',
             'gRdpER' : 'gRa~dpER',
             'gRdp9R' : 'gR9~dp9R',
             'gRdpaRadi' : 'gRa~dpaRadi',
             'gRdpRi' : 'gR9~dpRi',
             'dmoSygdo~gR2b': 'demSygdo~gR2b',
             'dmoRaliz' : 'demoRaliz',
             'dmHe~Ek' : 'demHe~Ek',
             'mkRiEZ' : 'mEkRiEZ',
             'RzitR' : 'R@tR',
             'tRzik' : 'tRik',
             'gmina' : 'gimina',
             'gmini' : 'gimini',
             'pRzityli' : 'pRSityli',
             'gRabouo' : 'gRabovo',
             'gRaZiwo' : 'kRajewo',
             'Radzilo~e' : 'RatSilov',
             'ekzykze~' : 'tSutSin',
             'gdR9~' : 'gydR9~',
             'dksEvR' : 'd9sEvR',
             'dksia~se~ka~t' : 'dysa~se~ka~t',
             'dkste' : 'd9kstje',
             'dksia~' : 'd9ksja~',
             'klna~bERg' : 'klejna~bERg',
             'sZaak' : 'sjaak',
             'mtOn' : 'metOn',
             'mti' : 'meti',
             'mtE' : 'metE',
             'mtskle' : 'motkle',
             'gfRe' : 'gofRe',
             'fnap' : 'efenape',
             'msz@RES' : 'medz@RES',
             'szoZuR' : 'sa~ZuR',
             'sziZ@ti' : 'zeg@ti',
             'szymik' : 'symik',
             'sztd2' : 'sa~td2',
             'szy' : 'sy',
             'dnag' : 'denaZe',
             'dnwaje' : "denwaje",
             'dna~toSi' : 'dena~toSi',
             'dnipRop@tROvsk' : 'dnipRop@tROvsk',
             'dniat' : 'deniat',
             'Znlui' : 'Za~lwi',
             'RZuisE' : 'ReZwisE',
             'njwpOR' : 'njupORt',
             'Zsom' : 'esom',
             'pRsizjo~' : 'pRa~sizjo~',
             'Rsi' : 'R@si',
             'RpRima~d' : 'R@pRima~d',
             'bRge' : 'bRogeR',
             'kSuma~' : 'ESuma~',
             'kSaRp' : 'ESaRp',
             'pkS@Ri':'pESRi',
             'kSEl' : 'ESEl',
             'kSa~' : 'k2Sinsa~',
             'dkSaRZ' : 'dESaRZ',
             'nt9~dip' : 'nyt9~dip',
             'nseh' : 'ense',
             'tvERski' : 'tSvERski',
             'tva' : 'tevea',
             'tviJod' : 'diviJo~',
             'gRdfRER' : 'gR9~dfRER',
             'dmHe~Ek' : 'dEmHe~Ek',  # repeated key: this value replaces 'demHe~Ek' given above
             'RuvRoi' : 'RuvRwa',
             'blse~Em' : 'bolzenaim',
             'dtaj' : 'd@taj',
             'dt@ate' : 'det@ate',
             'dtuRn' : 'detuRne',
             'RvERi' : 'REvERi',
             'kRkovnika' : 'kR@ kRov nika',
             'gRdgijom' : 'gR9~dgijom',
             'dgaZe' : 'degaZe',
             'dgRada~' : 'degRada~',
             'SJ2' : 'SEJ2',
             'ZZe' : 'gegeR',
             'ZZOn' : 'dZOn',
             'ZZo~' : 'dZo~',
             'mkZij' : 'mekgil',
             'mkZin' : 'makgin',
             'sksdmokORsE'  : 'skEdmokORsEt',
             'skZEtE' : 'sZEtE',
             'gfRe' : 'gofRe',
             'gfE' : 'Sfe',
             'ndoRo~boli' : ' endoRoemboli',
             'ndiEj' : 'ndiEj',
             'ndenu' : 'endenu',
             'ndRiana' : 'EndRiana',
             'dliag' : 'delig',
             'dle~dis' : 'aldis',
             'wt9~' : 'otin',
             'jRviN' : 'iRviN',
             'jRvin' : 'iRvin',
             'jRo~g' : 'jERong',
             'mjRwo2o' : 'mirvo',
             'gdinia' : 'gdinja',
             'gd9~' : 'gyda~',
             'klmso~' : 'klemso~',
             'sktRua~pf' : 'StRumpf',
             'vlRi' : 'v9l@Ri',
             'stZa~' : 'se~Za~',
             'bORdkse~stZa~' : 'bORdokse~stZa~',
             'ksStREl' : 'keStREl',
             'swv9~k' : 'so~ntvik',
             'fltSe' : 'flEtSe',
             'lta~baseERtsEl' : 'l9ta~baXtsEl',
             'ltSimi' : 'letSimi',
             'RwgRa~' : 'RwagRa~',
             'jld@S@Em' : 'ild@S@Em',
             'zkRokaEli' : 'aizkRaukleR',
             'gZalaRORn' : 'dZalaROR',
             'gZo' : 'gyZo',
             'ngZi' : 'engugi',
             'gZa~mEstRa' : 'gyZa~mEstRa',
             'gZe~Em': 'gugenaim',
             'kRHiz@ko~tROl' : 'kRuzko~tROl',
             'fpkodnbRouse' : 'peaSpekodbRuze',
             'Rplika' : 'Replika',
             'pskOv' : 'pskOv',
             'tSjkOv' : 'tSikOv',
             'tSjkova' : 'tSikova',
             'bks@R' : 'byksjeR',
             'bks@Z@R' : 'bofR@R@',
             'bky' : 'beky',
             'slno' : 's2lno',
             'slnRi' : 's2lnRi',
             'bmna' : 'beenema',
             'mbuvm9~' : 'embuvma~', 
             'mbRiER' : 'bR9iER', 
             'mbEj' : 'embE',       
             'mby' : 'embE2', 
             'mbOzomufu' : 'mybOzomufu', 
             'mbi' : 'mind', 
             'mbua~ba' :'mbumba', 
             'mbOzofulb' : 'mebOzofulb@', 
             'mbuiuZu' : 'mbuiuZu', 
             'mbatabyby' : 'embatabu',
             'ZzdiN' : 'janedig',
             'Zza~E' : 'ZanE',
             'mkle~' : 'meklein' ,
             'nsi@Ryzwij' : 'njed@Rzuwil',
             'nsi@RERg@Em' : 'nid@RaeRgEm',
             'nsiERmORsoSwiR' : 'nidERmORSwiR',
             'nsEdaR' : 'niEdaR',
             'Zdo' : 'Zedo~',
             'gsle~gem@RinN' : 'geslingem@Rin',
             'gRdpRe' : 'gRa~dpRe',
             'mZa' : 'muntSak',
             'Rn@k9~' : 'Ryn@k9~',
             'Rna~ka~f' : 'Ryna~ka~f',
             'fdsoSa~ko' : 'fedtSa~ko',
             'bgOR' : 'bjORn',
             'bgEja' : 'bEgEja',
             'lgRma~' :  'leZeRma~',
             'tsle' : 'EtsleR',
             'RvRa~' : 'R@vRa~',
             'tpa' : 'pa',
             'sS@Em' : 'sySEm',
             'ksSi' : 'kekSi',
             'sSon@bEk' : 'Son@bEk',
             'kbi' : 'kybits',
             'tHSya' : 'tHeSa',
             'pdy' : 'pedey',
             'jvvSEn' : 'jevEn',
             'm@lnika' : 'm@lniks',
             'Rvolysjo~' : 'REvolysjo~',
             'Rvlasjo~' : 'REvlasjo~',
             'RvE' : 'REvE',
             'gRtSEn' : 'gRetSEn',
             'Rty' : 'eRtey',
             'vRtZgba' : 'vERtojba',
             'RtaS@te' : 'RtaS@te',
             'Rte' : 'R9teR',
             'wwolEbdodyva~dR@dikOm' : 'dubleve dubleve dubleve pwa~ lEbdodyva~dR@di pwa~ kOm',
             'Swwal' : 'S9wal',
             'wwyoS' : 'weitbRyS',
             'Rgyz' : 'Regyse~',
             'bRge' : 'boRgeR',
             'mRw@t2@e' : 'm@sj2 w@teR',
             'vga~' : "woga~n",
             'kRme' : 'kR9me',
             'svRma~' : 'sevRma~',
             'gRlonz' : "gRelen",
             'pRmiOm' : 'pREmiOm',
             'pRvie~dREtil' : 'pRevie~dREtil',
             'pRvot' : 'pRevo',
             'ZnR@ne' : 'Zo~R@ne',
             'ZnR@no' : 'Zo~nR@no',
             'tkRiva~' : 't9kRiva~',
             'ngliZama~' : 'negliZama~',
             'Stkyti' : 'tSEkyti',
             'gRdZa~' : 'gR9dZa~',
             'Swwal' : 'Sawal',
             'bRlEks' : 'bRo~',
             'kfa' : 'cefa',
             'kf@e' : 'kf@R',
             'mjdwa~o' : 'midwest',
             'tzz@a~dikobORda' : "ekstzeandikoboRda",
             'Zma~liN' : 'gemaling',
             'dZminamigoSi' : 'dodZominamigotSi',
             'lgon' : 'l9gons',
             'kstytEti' : 'kestytEkis',
             'dkste' : 'd2ktSjeR',
             'kstRayy' : 'ksaRaky',
             'vkskRme' : 'v9kSoRme',
             'mpRomE' : 'pRomE',
             'mpala' : 'empala',
             'pkS@Ri' : 'pES@Ri',
             'sjkigaaRa' : 'sikigaaRa',
             'kRwknORma~' : 'kRwanORma~',
             'sRmoni' : 'tseRmoni',
             'sRi2z' : 'seRi2z',
             'sRinivaza~' : 'seRinivaza~',
             'sRmski' : 'sRemski',
             'sR@botnik' : 'zR@botnik',
             'sRi2' : 'seRi2',
             'sRi2zma~' : 'seRi2zma~',
             'mdogo' : 'emdogo',
             'md@s9~' : 'med@s9~',
             'wma~' : 'uma~',
             'stmoRis' : 'se~moRis',
             'fRmis' : 'fRemis',
             'SSEko' : 'SEko',
             'dkSaRZ' : 'deSaRZe',
             'sksdmokORsE' : 'sk2dmokORsE',
             'bne' : 'bnej',
             'Rpta' : 'Repeta',
             'RptE' : 'RepetE',
             'ksia~gZiaba' : 'zja~giaba',
             'Sa~ge' : 'Sa~gaj',
             'ksERtsi' : 'zERtiJi',
             'ksinga~' : 'tSia~',
             'ksio' : 'kzio',
             'ksio~EJy' : 'kzjo~gy',
             'kso~E' : 'kzjo~E',
             'ksit' : 'gzEts',
             'ksiiRi' : 'dizHtjEm',
             'ksamE' : 'samaks',
             'ksAE' : 'zw2~',
             'Engi' : 'EngujEn',
             'ksStREl' : 'keStR',
             'kso' : 'se es o',
             'ksEmm' : 've~gtjem',
             'ksOzo~' : 'ksOzo~',
             '@RiksOzo~' : '@Rikso~',
             'ksOzo~' : 'akesson',
             'ksavie' : 'eksavie',
             'kstytEti' : 'kestytEtis',
             'kskktOR' : 'iksfaktOR',
             'ke@slo' : 'tSe@slo',
             'ksipa' : 'zipas',
             'ksiv' : 'kezjem',
             ' ksE' : 'iksmEn',
             'ksERv@niak' : 'SERv@niak',
             'ksaR' : 'ksaR',
             'zja~' : 'zja~',
             'ksaZ' : 'zave',
             'ksila~g' : 'dZiJo~nge',
             'ksakiadaki' : 'ksalkiadakis',
             'ksp@nOl' : 'kop@nOl',
             'ks@Rnptlo~Zme' : 'z@Ruptlo~Zme',
             'ksijEm' : 'dikseptjEm',
             'kstRayy' : 'iksRay',
             'sdaRmstadete' : 'esdaRmstat',
             'bks@R' : 'byksjeR',
             'gRdmER' : 'gRa~dmER',
             'gRdesi' : 'gRa~desi',
             'gRdvikER' : 'gRa~dvikER',
             'gRdSa~' : 'gRa~dSa~',
             'gRdpREsiJi' : 'gRa~dpREsiJi',
             'gRdmEzo~' : 'gRa~dmEzo~',
             'gRdpRi' : 'gRa~dpRi',
             'gRdmaitR' : 'gRa~dmaitR',
             'gRdZa~' : 'gRa~dZa~',
             'gRdSoz' : 'gRa~dSoz',
             'gRdvEl' : 'gRa~dvEl',
             'gRdmEs' : 'gRa~dmEs',
             'gRdgijom' : 'gRa~dgijom',
             'gRdval' : 'gRa~dval',
             'gRdSav9~' : 'gRa~dSav9~',
             'dsEv' :'djuvs',
             'kdsa~t@n2f' : 'd9ksa~t@n2f',
             'ds@go' : 'djEgo',
             'dsd@Rika' : 'dj2d@Rik',
             'gbEj' : 'ZbEj',
             'ttky' : 'tajtoky',
             'fpyni' : 'paSpeyni',
             'vSikyl': 'veikl',
             'zzbRid' : 'zbRid',
             'pta' : ' peta',
             'ptSoRa' : 'petSoRa',
             'pteRobRa~S' : 'pteRobRa~S',
             'ptSa~ga' : 'petSa~ga',
             'pti' : 'pti',
             'mJa~' : 'miJa~',
             'fpkodnbRouse' : 'peSpek9dbRouse',
             'ZSalysin' : 'Zalysin',
             'ZSinEsykgE' : 'ZaSin Esykoge~',
             'ZSazitE' : 'Zezata~',
             'ZSazit' : 'Zezit',
             'ZSabijE' : 'ZabijE',
             'ZSabit' : 'Zabit',
             'bmna' : 'be emena',
             'mno' : 'emeno',
             'bZaRk':'bjaRk',
             'nkwa' : 'nikoi',
             'kta' : 'kekta',
             'kti' : 'setei',
             'kt9R' : 'st9R',
             'sktRua~pf' : 'StRumpf',
             'pljlist' : 'plilist',
             'dfa~dR' : 'defa~dR',
             'dfEt' : 'defEt',
             'mlia' : 'mia mlja',
             'pomla~Z@v9~' : 'pola~Z@v9~',
             'mla~Z' : 'mela~Z',
             'mla~kai' : 'mela~kai',
             'mlila' : 'emlila',
             'mlok' : 'emlok',
             'lvOv' : 'elvOv',
             'lviv' : 'elviv',
             'gzilofaZ' : 'zilofaZ',
             'gzou' : 'zou',
             'gza~zy' : 'zia~zy',
             'gzim' : 'igzim',
             'gzii' : 'tRezje',
             'gz@n' : 'Xim@nes',
             'mlz9~k' : 'melnik',
             'mke~' : 'meke~',
             'mkakasi' : 'kakatsi',
             'mkoaRti' : 'mekoaRti',
             'mkoaRti' : 'mekoaRti',
             'mkZij' : 'makZij',
             'mkylk' : 'makylk',
             'mkuR' : 'mekuR',
             'mku' : 'emku',
             'mkgR@gOR' : 'makgR@gOR',
             'mkle~' : 'mekle~',
             'mkZin' : 'ma gin',
             'mkolga~' : 'makolga~',
             'mka~n' : 'meka~n',
             'mkRiEZ' : 'makRiEZ',
             'mkasj' : 'makasj',
             'mkgRu' : 'mekgRu',
             'Rpo~dizZ' : 'Repo~dizZ',
             'Rpo~diR' : 'Repo~diR',
             'Rpo~dRe' : 'Repo~dRe',
             'Rpo~' : 'Repo~',
             'Rpo~ditil' : 'Repo~ditil',
             'Rpo~dE' : 'Repo~dE',
             'Rpo~di' : 'Repo~di',
             'Rpo~s' : 'Repo~s',
             'RpRima~d' : 'RepRima~d',
             'RpaR' : 'RepaR',
             'Rpo~dy' : 'Repo~dy',
             'fdZEld' : 'fjEld',
             'tle~Zi'  :'tingits',
             'tlaba' : 'telaba',
             'tlaSyak' : 'telyak',
             'tlepolEm' : 'tlepolEm',
             'gvis' : 'gwis',
             'gva' : 'gwa',
             'gve' : 'gwe~',
             'kvat@Rnik' : 'ekvat@Rnik',
             'kviaskOvski' : 'ekviaskOvski',
             'kvinikideE' : 'kvinikideE',
             'ngZi' : 'engugi',
             'gmaj' : 'dZimaj',
             'Rklam@REtil' : 'Reklam@REtil',
             'Rkolt' : 'Rekolt',
             'mjlwoks' : 'milwoks',
             'tsta~' : 'tseta~',
             'dklaR' : 'deklaR',
             'dkavj' : 'd9vil',
             'dkate~tkwatR' : 'd9ksviZ',
             'dkSaRZ' : 'deSaRZe',
             'vRZaka' : 'vZaRka',
             'mtskle' : 'm2tskle',
             'dbatE' : 'debatE',
             'dbaRas' : 'debaRas',
             'dgRada~' : 'degRada~',
             'ngRam' : 'engRam',
             'googl' : "gugol",
             'ngR@la' : 'negR@la',
             'vkskROR': 'v9skROR',
             'SwtataR' : 'S9StataR',
             'tsgyk@maR' : 'dZyk@maR',
             'bpifRa~s' : 'bepifRa~s',
             'bpi' : 'bepi',
             'kRwksonaR' : 'kRwadonoR',
             'kzpi' : 'Sopi',
             'bwtie' : 'bwatje',
             'kda~Sa' : "kode~Sa",
             'kdsa~t@n2f' : 'tRo~t@n2f',
             'kdi' : 'cjedei',
             'pRsizma~': 'pResizjo~',
             'ZtynEmaR' : 'ZotynEmaR',
             'tk@S@laSvili' : 'tek@S@laSvili',
             'tki' : 'toki',
             'tkEd' : 'tokEd',
             'ZRalma~' : 'ZeRalma~',
             'ZRal' : 'Ze~Ral',
             'ZR@vEl' : 'ZeR@vEl',
             'ZRi' : 'ZeRi',
             'SksftuR' : 'S2fuR',
             'dv9~' : 'dov9~',
             'dv@lOp' : 'dev@lOp',
             'lREt' : 'l2REt',
             'vnie' : 'vinje',
             'nguabi' : 'enguabi',
             'ngaRikORskm' : 'gaRikORsum',
             'ngazidZa' : "engazitSa",
             'ngliZama~' : 'engliZama~',
             'ngekou' : 'engekou',
             'ngliZa~s' : 'negliZa~s',
             'nnaZEn' : 'ennaZEn',
             'tbilisi' : 'tbelisi',
             'stfanik' : 'stefanik',
             'bziktaz' : 'beZitas',
             'tmwe~' : 'temo~',
             'bfR@mo~' : 'bofR@mo~',
             'kRwkRtuZ' : 'kRwaRuZ',
             'mbRiER' : 'bRejER',
             'vzliN' : 'vazliN',
             'fmin9~' : 'femin9~',
             'ljkaEno' : 'lekno',
             'btyl' : 'betyl',
             'bt@sla' : 'betola',
             'bta~do~' : 'beta~do~',
             'bta~kuR' : 'beta~kuR',
             'npRE' : 'nepRE',
             'Znbatist' : 'Za~batist',
             'ZnbatistESaRl' : 'Za~batistESaRl',
             'JnagboduJou@' : 'gnagboduJou@',
             'Sjlw@tHSap' : 'Silw@tHSap',
             'nko~' : 'nko~',
             'mssig' : 'metsik'}

In [None]:
# Apply fix_transcriptions, parameterised with the three French tables, to both
# transcription columns of every French split (train, test, val - in that order).
for fr_split_df in (FR_train_df, FR_test_df, FR_val_df):
  for tr_col in ('phonl_tr', 'phont_tr'):
    fr_split_df[tr_col] = fr_split_df[tr_col].apply(
      fix_transcriptions, args=[FR_spelling, FR_abbr, FR_others])

In [None]:
# Sanity check: run find_acronyms again on the corrected French splits.
# Every printed list is expected to be empty.
# (The original trailing notes "#195 / #55 / #37" are not repeated here: they do
#  not match the "should be empty" expectation - presumably leftovers; TODO confirm.)
FRacr_train, FRacr_test, FRacr_val = [
  find_acronyms(fr_split_df)
  for fr_split_df in (FR_train_df, FR_test_df, FR_val_df)
]

for fr_leftover in (FRacr_train, FRacr_test, FRacr_val):
  print(fr_leftover)

In [None]:
# Persist the corrected French tables: one pickle holding [train, test, val].

pkl_name = "fix_frCV_df.pkl"
fr_frames = [FR_train_df, FR_test_df, FR_val_df]
# Binary write mode: the file is created, or overwritten if it already exists.
with open(pkl_name, 'wb') as fr_out:
  pkl.dump(fr_frames, fr_out)

### Add transcriptions to the dataset

In [None]:
def grxphoMAUS(dataset, transcr_df):
  """Collect the 'phonl_tr' transcription of every item in `dataset`.

  For each item, the directory and the extension are stripped from
  item['path']; the resulting file name is looked up in
  transcr_df['filename'] and the matching 'phonl_tr' value is appended to the
  result. The returned list therefore follows the iteration order of
  `dataset`.

  Args:
    dataset: iterable of mappings that expose a 'path' entry (the audio file
      path).
    transcr_df: DataFrame with a 'filename' column (no directory, no
      extension) and a 'phonl_tr' column.

  Returns:
    A list with one 'phonl_tr' value per dataset item. When a file name occurs
    in several rows of `transcr_df`, the first row wins.

  Raises:
    IndexError: if an item's file name has no row in `transcr_df`. The type is
      the one the previous implementation raised; the message now names the
      missing file.
  """
  # Build the filename -> transcription lookup once, instead of scanning the
  # whole DataFrame for every dataset item. setdefault keeps the FIRST row of a
  # repeated filename, matching the previous `matches[0]` behaviour.
  first_by_filename = {}
  for name, transcription in zip(transcr_df['filename'], transcr_df['phonl_tr']):
    first_by_filename.setdefault(name, transcription)

  all_transcribed_s = []
  for batch in dataset:
    filen = os.path.split(batch['path'])[-1]  # getting rid of path
    filen_ok = os.path.splitext(filen)[0]     # getting rid of ext
    try:
      all_transcribed_s.append(first_by_filename[filen_ok])
    except KeyError:
      raise IndexError(
        f"no 'phonl_tr' transcription found for file '{filen_ok}' "
        f"(path: {batch['path']!r})") from None

  return all_transcribed_s

In [None]:
print('*-------- starting IT transcriptions --------*')

# Load the three Italian DataFrames (train, test, val) from their pickle.
with open('fix_itCV_df.pkl', 'rb') as it_pickle:
  IT_train_df, IT_test_df, IT_val_df = pkl.load(it_pickle)

# One transcription list per split, built by grxphoMAUS from the split and its DataFrame.
transcriptions_trainIT, transcriptions_testIT, transcriptions_valIT = [
  grxphoMAUS(it_split, it_df)
  for it_split, it_df in (
    (common_voice_trainIT, IT_train_df),
    (common_voice_testIT, IT_test_df),
    (common_voice_validationIT, IT_val_df),
  )
]

# Attach each list to its split as a new "phonl_tr" column.
common_voice_trainIT, common_voice_testIT, common_voice_validationIT = [
  it_split.add_column("phonl_tr", it_transcriptions)
  for it_split, it_transcriptions in (
    (common_voice_trainIT, transcriptions_trainIT),
    (common_voice_testIT, transcriptions_testIT),
    (common_voice_validationIT, transcriptions_valIT),
  )
]

print('*------- sentence transcriptions done -------*')

## ---------------- DOWNSAMPLING

print('*------- downsampling 48kHz -> 16kHz -------*')

# Re-cast the "audio" column of every split to a 16 kHz Audio feature.
common_voice_trainIT, common_voice_testIT, common_voice_validationIT = [
  it_split.cast_column("audio", Audio(sampling_rate=16_000))
  for it_split in (common_voice_trainIT, common_voice_testIT, common_voice_validationIT)
]

# Print one example per split as a visual check.
for it_split, it_example_idx in (
    (common_voice_trainIT, 45),
    (common_voice_testIT, 36),
    (common_voice_validationIT, 23)):
  print(it_split[it_example_idx])

print('*------- IT dataset ready! -------*')

In [None]:
print('*-------- starting ES transcriptions --------*')

# Load the three Spanish DataFrames (train, test, val) from their pickle.
with open('fix_esCV_df.pkl', 'rb') as es_pickle:
  ES_train_df, ES_test_df, ES_val_df = pkl.load(es_pickle)

# One transcription list per split, built by grxphoMAUS from the split and its DataFrame.
transcriptions_trainES, transcriptions_testES, transcriptions_valES = [
  grxphoMAUS(es_split, es_df)
  for es_split, es_df in (
    (common_voice_trainES, ES_train_df),
    (common_voice_testES, ES_test_df),
    (common_voice_validationES, ES_val_df),
  )
]

# Attach each list to its split as a new "phonl_tr" column.
common_voice_trainES, common_voice_testES, common_voice_validationES = [
  es_split.add_column("phonl_tr", es_transcriptions)
  for es_split, es_transcriptions in (
    (common_voice_trainES, transcriptions_trainES),
    (common_voice_testES, transcriptions_testES),
    (common_voice_validationES, transcriptions_valES),
  )
]

print('*------- sentence transcriptions done -------*')

## ---------------- DOWNSAMPLING

print('*------- downsampling 48kHz -> 16kHz -------*')

# Re-cast the "audio" column of every split to a 16 kHz Audio feature.
common_voice_trainES, common_voice_testES, common_voice_validationES = [
  es_split.cast_column("audio", Audio(sampling_rate=16_000))
  for es_split in (common_voice_trainES, common_voice_testES, common_voice_validationES)
]

# Print one example per split as a visual check.
for es_split, es_example_idx in (
    (common_voice_trainES, -1),
    (common_voice_testES, -1),
    (common_voice_validationES, -2)):
  print(es_split[es_example_idx])

print('*------- ES dataset ready! -------*')

In [None]:
print('*-------- starting FR transcriptions --------*')

# Load the three French DataFrames (train, test, val) from their pickle.
with open('fix_frCV_df.pkl', 'rb') as fr_pickle:
  FR_train_df, FR_test_df, FR_val_df = pkl.load(fr_pickle)

# One transcription list per split, built by grxphoMAUS from the split and its DataFrame.
transcriptions_trainFR, transcriptions_testFR, transcriptions_valFR = [
  grxphoMAUS(fr_split, fr_df)
  for fr_split, fr_df in (
    (common_voice_trainFR, FR_train_df),
    (common_voice_testFR, FR_test_df),
    (common_voice_validationFR, FR_val_df),
  )
]

# Attach each list to its split as a new "phonl_tr" column.
common_voice_trainFR, common_voice_testFR, common_voice_validationFR = [
  fr_split.add_column("phonl_tr", fr_transcriptions)
  for fr_split, fr_transcriptions in (
    (common_voice_trainFR, transcriptions_trainFR),
    (common_voice_testFR, transcriptions_testFR),
    (common_voice_validationFR, transcriptions_valFR),
  )
]

print('*------- sentence transcriptions done -------*')

## ---------------- DOWNSAMPLING

print('*------- downsampling 48kHz -> 16kHz -------*')

# Re-cast the "audio" column of every split to a 16 kHz Audio feature.
common_voice_trainFR, common_voice_testFR, common_voice_validationFR = [
  fr_split.cast_column("audio", Audio(sampling_rate=16_000))
  for fr_split in (common_voice_trainFR, common_voice_testFR, common_voice_validationFR)
]

# Print one example per split as a visual check.
for fr_split, fr_example_idx in (
    (common_voice_trainFR, -1),
    (common_voice_testFR, -1),
    (common_voice_validationFR, -2)):
  print(fr_split[fr_example_idx])

print('*------- FR dataset ready! -------*')

### Create the multilingual corpus

In [None]:
# Build the multilingual (IT + ES + FR) splits by concatenating the per-language ones.
# NOTE(review): `concatenate_datasets` and `pkl` are not among the names imported in
# the notebook's first import cell - confirm that `from datasets import
# concatenate_datasets` and `import pickle as pkl` appear in an earlier cell.
common_voice_trainMLitesfr = concatenate_datasets([common_voice_trainIT, common_voice_trainES, common_voice_trainFR])
common_voice_testMLitesfr = concatenate_datasets([common_voice_testIT, common_voice_testES, common_voice_testFR])
common_voice_validationMLitesfr = concatenate_datasets([common_voice_validationIT, common_voice_validationES, common_voice_validationFR])

print(f'number of files: TRAIN: {len(common_voice_trainMLitesfr)}, TEST:  {len(common_voice_testMLitesfr)}, VAL: {len(common_voice_validationMLitesfr)}')


# Total audio duration of each split, in seconds (see computeTotLen).
MLlen_post_TR = computeTotLen(common_voice_trainMLitesfr)
MLlen_post_TST = computeTotLen(common_voice_testMLitesfr)
MLlen_post_VAL = computeTotLen(common_voice_validationMLitesfr)

# Report the durations computed above. The previous version interpolated the
# dataset objects themselves, so the computed lengths were never shown.
print(f'ML dataset len in sec post filter TRAIN: {MLlen_post_TR} - TEST: {MLlen_post_TST} - VAL {MLlen_post_VAL}')

# save the multilingual corpus in a pkl

pkl_name = "MLitesfrCVdataset7_transcribed20.pkl"
with open(pkl_name, 'wb') as file:
  # 'wb' creates the file, or overwrites it if it already exists.
  # The three splits are stored as one list: [train, test, validation].
  pkl.dump([common_voice_trainMLitesfr, common_voice_testMLitesfr, common_voice_validationMLitesfr], file)