### Imports

In [2]:
import pandas as pd
import random
import re
import json
from IPython.display import display, HTML
import os
import numpy as np
import csv
from pprint import pprint
import pickle as pkl

We have to work on the transcriptions, so we can start either from the pkl files where the phonetic and phonological transcriptions are stored (see Notebook multilingual_corpis.ipynb) or from the pkl file of the multilingual corpus.

In [7]:
# Load the pickled transcription splits for Italian, Spanish and French.
# Each pickle is unpacked into three objects named train / test / val
# (presumably pandas DataFrames, since they are passed to pd.concat below).
# NOTE(review): pickle.load can execute arbitrary code; only load files you trust.

# Set the working directory so the relative file names below resolve.
# NOTE(review): '/mypath/transcriptions' is a placeholder path; adjust before running.

%cd /mypath/transcriptions

# IT: Italian splits
with open('fix_itCV_df.pkl', 'rb') as file:
  IT_train_df, IT_test_df, IT_val_df = pkl.load(file)

# ES: Spanish splits
with open('fix_esCV_df.pkl', 'rb') as file:
  ES_train_df, ES_test_df, ES_val_df = pkl.load(file)

# FR: French splits
with open('fix_frCV_df.pkl', 'rb') as file:
  FR_train_df, FR_test_df, FR_val_df = pkl.load(file)

In [25]:
# Concatenate datasets

# Merge the train / test / validation splits of each language into a single
# frame with a fresh 0..n-1 index.
ITdataset = pd.concat([IT_train_df, IT_test_df, IT_val_df], ignore_index=True)

ESdataset = pd.concat([ES_train_df, ES_test_df, ES_val_df], ignore_index=True)

FRdataset = pd.concat([FR_train_df, FR_test_df, FR_val_df], ignore_index=True)

# Stack the three languages into the multilingual corpus. ignore_index=True is
# used here as well: without it every per-language index (0..n-1) is kept, so
# the same label would appear up to three times and label-based access such as
# MLdataset.loc[0] would return rows from several languages.
MLdataset = pd.concat([ITdataset, ESdataset, FRdataset], ignore_index=True)

Extract the list of single phonemes

In [10]:
# SINGLE PHONEMES
# Each dictionary maps a segment symbol to a numeric class ID. In this cell only
# the keys are used, to build the per-language and multilingual inventories.

ITphoClasses = {
    'a': 0, 'e': 0, 'o': 0, 'O': 0, 'E': 0, 'i': 0, 'u': 0,      # vowels
    'j': 1, 'w': 1,                                              # glides
    'r': 2,                                                      # rhotic
    'l': 3, 'L': 3,                                              # laterals
    'm': 4, 'n': 4, 'J': 4, 'N': 4,                              # nasals
    's': 5, 'z': 5,                                              # fricatives I (sibilants)
    'v': 6, 'f': 6,                                              # fricatives II
    'Z': 7, 'S': 7,                                              # fricatives III
    'dz': 8, 'dZ': 8, 'tts': 8, 'ttS': 8,                        # affricates + geminate affricates
    'ts': 8, 'tS': 8, 'ddz': 8, 'ddZ': 8,
    'b': 9, 'd': 9, 'g': 9, 'k': 9, 'p': 9, 't': 9,              # plosives
}

ESphoClasses = {
    'e': 0, 'i': 0, 'o': 0, 'u': 0, 'a': 0,
    'j': 1, 'w': 1,
    'r': 2,
    'l': 3, 'L': 3,
    'm': 4, 'n': 4, 'J': 4, 'N': 4,
    's': 5,
    'v': 6, 'B': 6, 'D': 6, 'G': 6, 'Z': 6, 'S': 6, 'T': 6, 'x': 6,
    'f': 7,
    'ddZ': 8, 'dZ': 8, 'tS': 8, 'ttS': 8,
    'b': 9, 'd': 9, 'g': 9, 'k': 9, 'p': 9, 't': 9,
}

FRphoClasses = {
    'e': 0, 'i': 0, 'o': 0, 'u': 0, 'a': 0, '@': 0, 'E': 0, 'O': 0, 'y': 0, '2': 0, '9': 0,
    'A': 0, 'e~': 0, 'a~': 0, 'o~': 0, '9~': 0,
    'j': 1, 'w': 1, 'H': 1,
    'R': 2,
    'l': 3,
    'm': 4, 'n': 4, 'J': 4, 'N': 4,
    's': 5,
    'v': 6, 'f': 6, 'z': 6, 'X': 6, 'Z': 6, 'S': 6,
    'ddz': 8, 'dz': 8, 'ts': 8, 'tts': 8, 'ddZ': 8, 'dZ': 8, 'tS': 8, 'ttS': 8,
    'b': 9, 'd': 9, 'g': 9, 'k': 9, 'p': 9, 't': 9,
}

# Per-language inventories (dictionary keys, in declaration order) and their sizes.
segmIT_list = list(ITphoClasses)
segmES_list = list(ESphoClasses)
segmFR_list = list(FRphoClasses)
for inventory in (segmIT_list, segmES_list, segmFR_list):
    print(len(inventory))

# Multilingual inventory: union of the three lists (set order is arbitrary).
segmMLlist = list(set(segmIT_list + segmES_list + segmFR_list))
print('size mlt phonemes', len(segmMLlist))
print(segmMLlist)


# Multilingual class table: the union above plus the nasalisation mark '~'.
MLphoClasses = {
    'e': 0, 'i': 0, 'o': 0, 'u': 0, 'a': 0, '@': 0, 'E': 0, 'O': 0, 'y': 0, '2': 0, '9': 0,  # vowels
    'A': 0, 'e~': 0, 'a~': 0, 'o~': 0, '9~': 0, '~': 11,
    'j': 1, 'w': 1, 'H': 1,
    'r': 2, 'R': 2,
    'l': 3, 'L': 3,
    'm': 4, 'n': 4, 'J': 4, 'N': 4,
    's': 5, 'z': 5,
    'v': 6, 'f': 6,
    'B': 7, 'D': 7, 'G': 7, 'Z': 7, 'S': 7, 'T': 7, 'x': 7, 'X': 7,
    'ddz': 8, 'dz': 8, 'ts': 8, 'tts': 8, 'ddZ': 8, 'dZ': 8, 'tS': 8, 'ttS': 8,
    'b': 9, 'd': 9, 'g': 9, 'k': 9, 'p': 9, 't': 9,
}

36
34
46
size mlt phonemes 53
['l', 'A', 'n', 'b', 'R', 'g', 'tts', 'ddz', 'p', 'T', 'x', 'o~', '9', 'G', '2', '@', 'z', 'a', 'y', 'e~', 'e', 't', 'u', 'Z', 'B', 'dz', 'ts', 'r', 'ttS', 'H', 'k', 'D', 'L', 'S', 'a~', 'm', 'j', 'J', 'N', 'dZ', 'X', 's', '9~', 'w', 'v', 'o', 'tS', 'O', 'i', 'd', 'ddZ', 'E', 'f']


### Multilingual syllabifier

In [20]:
# Vowel symbols that may act as a syllable nucleus. The scanner below works one
# character at a time, so the two-character nasal vowels only matter for the
# "does this syllable contain a nucleus" test; the mark '~' is handled separately.
_ML_NUCLEI = ('a', 'e', 'o', 'i', 'u', '@', 'E', 'O', 'y', '2', '9', 'A', 'e~', 'a~', 'o~', '9~')

# Sonority class ID of every segment symbol (0 = vowels ... 9 = plosives, 11 = '~').
_ML_SONORITY = {
    'e': 0, 'i': 0, 'o': 0, 'u': 0, 'a': 0, '@': 0, 'E': 0, 'O': 0, 'y': 0, '2': 0, '9': 0,  # vowels
    'A': 0, 'e~': 0, 'a~': 0, 'o~': 0, '9~': 0, '~': 11,
    'j': 1, 'w': 1, 'H': 1,
    'r': 2, 'R': 2,
    'l': 3, 'L': 3,
    'm': 4, 'n': 4, 'J': 4, 'N': 4,
    's': 5, 'z': 5,
    'v': 6, 'f': 6,
    'B': 7, 'D': 7, 'G': 7, 'Z': 7, 'S': 7, 'T': 7, 'x': 7, 'X': 7,
    'ddz': 8, 'dz': 8, 'ts': 8, 'tts': 8, 'ddZ': 8, 'dZ': 8, 'tS': 8, 'ttS': 8,
    'b': 9, 'd': 9, 'g': 9, 'k': 9, 'p': 9, 't': 9,
}

# Class-ID sequences accepted by the SSP (Sonority Sequencing Principle) check.
# Stored as a set of tuples so that the membership test is a single hash lookup.
_ML_LEGAL_ONSETS = frozenset([
    (1, 0), (2, 0), (3, 0), (4, 0), (5, 0), (6, 0), (7, 0), (8, 0), (9, 0),      # CV
    (1, 0, 11), (2, 0, 11), (3, 0, 11), (4, 0, 11), (5, 0, 11), (6, 0, 11),      # CV~
    (7, 0, 11), (8, 0, 11), (9, 0, 11), (5, 1, 0, 11), (5, 1, 0),                # CV~
    (2, 2), (3, 3), (4, 4), (5, 5), (6, 6), (7, 7), (8, 8), (9, 9),              # geminates
    (9, 9, 3), (9, 9, 2),                                                        # geminates + liquids
    (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1),              # anything + approximant
    (5, 2), (6, 2), (7, 2), (9, 2),                                              # fricatives, plosives + /r/
    (5, 3), (6, 3), (7, 3), (9, 3),                                              # fricatives, plosives + /l/
    (5, 4), (7, 4), (9, 4),        # + nasals; plosives have exceptions, (8, 4) removed (cf. "pneumatico")
    (9, 5),                        # + /s/ /z/; plosives have exceptions (cf. "psicologo")
    (7, 6), (7, 9, 2), (8, 6),     # + /S/ /Z/; plosives have exceptions
    (5, 9), (5, 6),                                                              # "impure s" (s + consonant)
    (5, 6, 1), (5, 6, 2), (5, 6, 3), (5, 9, 1), (5, 9, 2), (5, 9, 3),            # impure s + CC
    (5, 6, 2, 1), (5, 6, 3, 1), (5, 9, 2, 1), (5, 9, 3, 1),                      # impure s + CC + glide
    (9, 9, 5), (9, 9, 6), (9, 9, 5, 1), (9, 9, 6, 1),
    (9, 5, 9), (6, 9),                                                           # to handle affricates
])


def _split_after_nuclei(word, delimiter):
  """Rough maximal-onset split: cut the word after every vowel (or vowel + '~')."""
  pieces = ""
  for seg in word:
    if seg in _ML_NUCLEI:
      pieces += seg + delimiter
    elif seg == '~':
      # Glue the nasalisation mark to the vowel just emitted. Only the trailing
      # delimiter is removed; a stray '~' after a consonant no longer eats it.
      if pieces.endswith(delimiter):
        pieces = pieces[:-1]
      pieces += '~' + delimiter
    else:
      pieces += seg
  return pieces.strip().split(delimiter)


def _onset_is_legal(chunk):
  """Return True when everything before the chunk's last character is an accepted cluster."""
  onset = chunk[:-1]
  if len(onset) <= 1:
    return True  # an empty or single-segment onset is always accepted
  return tuple(_ML_SONORITY[ch] for ch in onset) in _ML_LEGAL_ONSETS


def _repair_onsets(chunks):
  """Move the first segment of each illegal onset into the coda of the previous syllable."""
  syllables = []
  for chunk in chunks:
    if _onset_is_legal(chunk) or not syllables:
      # Legal onset, or word-initial cluster with no previous syllable to host it.
      syllables.append(chunk)
    else:
      syllables[-1] += chunk[0]
      # Drop only the segment that was moved. The previous str.replace() call
      # removed every occurrence of it (e.g. 'RtR' -> 't', losing the final R).
      syllables.append(chunk[1:])
  return syllables


def _attach_stranded_coda(syllables):
  """Merge a vowel-less final chunk into the preceding syllable (modifies the list in place)."""
  last = syllables[-1]
  if len(last) == 1 and last not in _ML_NUCLEI:
    if len(syllables) > 1:
      syllables[-2:] = [syllables[-2] + last]
    elif last not in ('d', 'l'):
      # A lone consonant word gets an 'e' appended; bare 'd' and 'l' are kept as they are.
      syllables[-1] = last + 'e'

  last = syllables[-1]
  # Final cluster of several consonants: attach it to the previous syllable.
  # Single-chunk words (bare 'd'/'l', vowel-less words such as 'mm') are left
  # untouched instead of raising IndexError.
  if len(syllables) > 1 and not any(nucleus in last for nucleus in _ML_NUCLEI):
    syllables[-2:] = [syllables[-2] + last]
  return syllables


def SyllabifierML(text):

  """Takes a transcribed sentence as an input and returns it syllabified.

  Words in the input are separated by single spaces. In the output, syllables
  within a word are separated by single spaces and every word, including the
  last one, is followed by the delimiter ' | '.

  Example: 'tSi sono troppE' -> 'tSi | so no | tro ppE | '

  Each word is first cut after every vowel; the onset of every resulting chunk
  is then checked against the accepted sonority-class sequences, and the first
  segment of an illegal onset is moved to the coda of the previous syllable.
  A vowel-less final chunk is merged into the preceding syllable.

  Raises:
    KeyError: if an onset of two or more characters contains a symbol that is
      not in the sonority table.
  """

  phone_delimiter_token = " "
  word_delimiter_token = " | "

  syllabified_words = []

  # Words are processed one by one so that no syllable crosses a word boundary.
  for word in text.split(phone_delimiter_token):
    chunks = _split_after_nuclei(word, phone_delimiter_token)
    syllables = _attach_stranded_coda(_repair_onsets(chunks))
    syllabified_words.append(phone_delimiter_token.join(syllables) + word_delimiter_token)

  return ''.join(syllabified_words)


In [21]:
# Examples: one (label, transcribed sentence) pair per line

examples = [
    ('it example:  ', 'tSi sono troppE koze da dZatSinto'),
    ('fr example:  ', 'o no~ dy guvERn@ma~ Z@ ma~ R@mE do~k oZuRdHi a la saZEs d@ l asa~ble'),
    ('es example:  ', 'una BoT femenina le Grita DezDe lexos'),
    ('fr example:  ', 'm@sj2 pOl a REzo~ il fo k@ s@la sERv d Egza~pl'),
]

for label, sentence in examples:
  print(label, SyllabifierML(sentence))

it example:   tSi | so no | tro ppE | ko ze | da | dZat Sin to | 
fr example:   o | no~ | dy | gu vER n@ ma~ | Z@ | ma~ | R@ mE | do~k | o ZuR dHi | a | la | sa ZEs | d@ | l | a sa~ ble | 
es example:   u na | BoT | fe me ni na | le | Gri ta | Dez De | le xos | 
fr example:   m@ sj2 | pOl | a | RE zo~ | il | fo | k@ | s@ la | sERv | d | Eg za~pl | 


### Build the syllable-based vocabulary

Our vocabulary will contain the most frequent syllables in the multilingual corpus and all the single phonemes, which are used to compose the less frequent syllables. This way we avoid data sparsity, a problem that may occur because we are working with a limited amount of data.

In [24]:
def syll_in_list(sent):
  """Split a syllabified sentence on whitespace and return the tokens as a list.

  Word delimiters ('|') are ordinary tokens here and stay in the list.
  """
  return sent.split()


def syl_count(list_syl, min_count=3500):
  """Count syllable occurrences and list the syllables below a frequency threshold.

  Args:
    list_syl: iterable of syllables (hashable items), one entry per occurrence.
    min_count: syllables seen fewer than this many times are reported as
      "least frequent". Raising it shrinks the set of frequent syllables and
      therefore the vocabulary size. Defaults to 3500.

  Returns:
    A tuple (counts, least_frequentSY): counts maps each syllable to its number
    of occurrences; least_frequentSY lists the syllables with count < min_count,
    in order of first appearance.
  """
  counts = {}
  for syl in list_syl:
    counts[syl] = counts.get(syl, 0) + 1

  least_frequentSY = [syl for syl, freq in counts.items() if freq < min_count]
  return counts, least_frequentSY

Extract the most frequent syllables and set the vocabulary size through the frequency threshold

In [26]:
# (1)+(2) syllabify each sentence, then turn each row into a list of tokens
# NOTE: the 'phonl_tr' column is overwritten in place, so this cell can only be
# run once per freshly built MLdataset.
MLdataset['phonl_tr'] = MLdataset['phonl_tr'].apply(SyllabifierML).apply(syll_in_list)

# (3) flat list of every token in the corpus, then the same list without pipes
ML_syl_lists = [token for row in MLdataset['phonl_tr'] for token in row]
MLallSyl = [token for token in ML_syl_lists if token != '|']

# (4) syllable frequency count + list of the less frequent syllables
MLsylCount, ML_leastFrSyl = syl_count(MLallSyl)
print(len(ML_leastFrSyl))

# (5) set of distinct tokens (built from the list that still contains '|')
ML_syl_list = list(set(ML_syl_lists))
print(len(ML_syl_list))

# (6) most frequent syllables = all syllables minus the less frequent ones
MLmostFsyl = list(set(MLallSyl) - set(ML_leastFrSyl))
print(len(MLmostFsyl))

15391
15593
201


Build the vocabulary

In [27]:
# Segment -> class table; only its keys are used here, as the single-phoneme
# part of the vocabulary.
MLphoClasses = {
    'e': 0, 'i': 0, 'o': 0, 'u': 0, 'a': 0, '@': 0, 'E': 0, 'O': 0, 'y': 0, '2': 0, '9': 0,  # vowels
    'A': 0, 'e~': 0, 'a~': 0, 'o~': 0, '9~': 0, '~': 11,
    'j': 1, 'w': 1, 'H': 1,
    'r': 2, 'R': 2,
    'l': 3, 'L': 3,
    'm': 4, 'n': 4, 'J': 4, 'N': 4,
    's': 5, 'z': 5,
    'v': 6, 'f': 6,
    'B': 7, 'D': 7, 'G': 7, 'Z': 7, 'S': 7, 'T': 7, 'x': 7, 'X': 7,
    'ddz': 8, 'dz': 8, 'ts': 8, 'tts': 8, 'ddZ': 8, 'dZ': 8, 'tS': 8, 'ttS': 8,
    'b': 9, 'd': 9, 'g': 9, 'k': 9, 'p': 9, 't': 9,
}

segmML_list = list(MLphoClasses)  # single phonemes
sylMLT_list = MLmostFsyl          # most frequent syllables

# Phonemes + frequent syllables; the two lists can overlap (e.g. a vowel that
# is also a frequent syllable), hence the size before and after de-duplication.
MLTsylphoVocab = segmML_list + sylMLT_list
print(len(MLTsylphoVocab))

setlist = list(set(MLTsylphoVocab))
print(len(setlist))

# Token -> integer ID, assigned in sorted order of the tokens.
vocab_dict = {token: idx for idx, token in enumerate(sorted(setlist))}

# Special tokens, each taking the next free ID in this order:
#   "|"      word delimiter, a more visible stand-in for " "
#   "[UNK]"  unknown token, for symbols not encountered in the training set
#   "[PAD]"  padding token (CTC blank token)
for special in ("|", "[UNK]", "[PAD]"):
    vocab_dict[special] = len(vocab_dict)

vocab_dict

255
243


{'2': 0,
 '9': 1,
 '9~': 2,
 '@': 3,
 'A': 4,
 'B': 5,
 'Ba': 6,
 'Be': 7,
 'Bi': 8,
 'Bo': 9,
 'D': 10,
 'Da': 11,
 'DaT': 12,
 'De': 13,
 'Di': 14,
 'Do': 15,
 'Dos': 16,
 'E': 17,
 'El': 18,
 'G': 19,
 'Ga': 20,
 'Go': 21,
 'H': 22,
 'J': 23,
 'L': 24,
 'N': 25,
 'O': 26,
 'R': 27,
 'R@': 28,
 'RE': 29,
 'Ra': 30,
 'Ra~': 31,
 'Re': 32,
 'Ri': 33,
 'Ro': 34,
 'Ry': 35,
 'S': 36,
 'Sa': 37,
 'Se': 38,
 'Si': 39,
 'T': 40,
 'Ta': 41,
 'Te': 42,
 'Ti': 43,
 'Tja': 44,
 'Tjo': 45,
 'Tjon': 46,
 'X': 47,
 'Z': 48,
 'Z@': 49,
 'Ze': 50,
 'Zi': 51,
 'Zo': 52,
 'a': 53,
 'al': 54,
 'an': 55,
 'ar': 56,
 'a~': 57,
 'b': 58,
 'ba': 59,
 'be': 60,
 'bi': 61,
 'bo': 62,
 'd': 63,
 'd@': 64,
 'dZ': 65,
 'da': 66,
 'da~': 67,
 'ddZ': 68,
 'ddz': 69,
 'de': 70,
 'del': 71,
 'di': 72,
 'do': 73,
 'du': 74,
 'dy': 75,
 'dz': 76,
 'e': 77,
 'el': 78,
 'en': 79,
 'es': 80,
 'e~': 81,
 'f': 82,
 'fa': 83,
 'fe': 84,
 'fi': 85,
 'for': 86,
 'fu': 87,
 'fwe': 88,
 'g': 89,
 'ga': 90,
 'go': 91,
 'i': 92,

Save the vocabulary in a json file

In [36]:
# Components of the output file name: language tag, unit type, vocabulary size.
lang = 'ML'
units = 'PhoSyl'
u_count = len(vocab_dict)

# Serialise the vocabulary to a JSON file in the current directory.
vocab_name = f'vocab{lang}_{units}{u_count}.json'
with open(vocab_name, 'w') as vocab_file:
    vocab_file.write(json.dumps(vocab_dict))

Create and save the tokenizer folder based on the vocabulary we just created and on the custom tokenizer that works according to the syllabification rules.

In [44]:
from CustomML_ITESFRPhoSylCTCTokenizer import *

In [45]:

# Build the custom tokenizer from the vocabulary file written above, then save
# it to its own folder; the cell output is the value returned by save_pretrained.
vocab_path = f'./vocab{lang}_{units}{u_count}.json'
tokenizer = HybridML_ITESFRPhoSylCTCTokenizer(
    vocab_path,
    unk_token="[UNK]",
    pad_token="[PAD]",
    word_delimiter_token=" | ",
)
tokenizer.save_pretrained(f"./tokenizer{lang}_hyb{units}{u_count}")

('./tokenizerML_hybPhoSyl246\\tokenizer_config.json',
 './tokenizerML_hybPhoSyl246\\special_tokens_map.json',
 './tokenizerML_hybPhoSyl246\\vocab.json',
 './tokenizerML_hybPhoSyl246\\added_tokens.json')