 Notebook to generate Finnish phoneme representation of English words, for generating training data for sIAk model. 
The code is written intending to be a script. and a notebook based on the need.

- *Inputs*: 
    - Wordlist, text file with 1 word/sentence per line.
    - English Dict, word, phoneme representation form. *[hard coded for en_uk].
    - Finnish dict,
    - Eng to Global map. Pickle of a dictionary with {phoneme -> phoneme rep}
    - Finnish to Global Map.
    - Global phone distances.
    

- *Outputs*:
- *HyperParameters*:

In [1]:
import os
import numpy as np
import pandas as pd
from pprint import pprint


 Code to Run the below file as a script to generate english words to generate based on different distance 
 metrics and english and finnihs dictionaries. 

In [3]:

import argparse

parser = argparse.ArgumentParser(description='Outputs the Finnish Phoneme representation of the english words')

parser.add_argument("word_list",help="Path to the text file to be converted to Finnish phonetic symbols",type=str)
parser.add_argument("-ed","--english_dict",default="dict/en_uk_dict.txt")
parser.add_argument('--eng_to_global_map', default='../mappings/en_uk_ph_dist_phones_map.pkl',
                    help="Path to the english to global phoneme dictionary mapping", type=str)
parser.add_argument('--fin_to_global_map', default='../mappings/fin_2_global_phones_map.pkl',
                    help="Path to the finnish to global phoneme dictionary mapping", type=str)
parser.add_argument('--global_phone_distances', default='../mappings/global_phone_distances.pkl',
                    help="Path to the english to global phoneme dictionary mapping", type=str)
args = parser.parse_args()

In [2]:
def map_word_2_us_en_rep(sentence):
    '''
    inputs:
    sentence: (str) Sentence which needs to be represented in phones.
    outputs:
    transcripts (str)['ph ph ph'] Phoneme representation of sentence.
    '''
    
    # Save transcript for each word in the sentence in a list transcripts.
    words = sentence.split(' ')    
    transcripts = []
    for word in words:
        if word not in en_uk_dict.keys():
            return ''
        else:
            transcripts.append(en_uk_dict[word])
    
    #Take the transcripts in the transcript list and make a phoneme representation.
    # Taking care if it's a single word or a sentence.
    
    transcript = ''
    if len(transcripts) == 1:
        transcript += transcripts[0]
    else:
        for i in transcripts:
            transcript += i 
            transcript += " sil "

    return transcript.rstrip()

In [3]:
def eng_ph_2_global_ph(english_ph_transcript):
    '''
    input: 
    english_ph_transcript - English phone transcripts from mapping file. 'ph ph ph ...'
    output: 
    global_ph_transcript: 'ph ph ph ...'
    Hyperparameter: eng_to_global_map.
    '''
    global_ph_transcript = ''
    for phone in english_ph_transcript.split(" "):
        global_ph_transcript += eng_to_global_map[phone]
        global_ph_transcript += ' '
    
    return global_ph_transcript.rstrip()

In [4]:
def glob_transcript_2_fin_nearest(transcript):
    '''Returns the nearest finnish phoneme to each of the global transcript
       Input: Global phoneme representation seperated by a space
       Output: Finnish phoneme representation seperated by space.
    '''
    fin_transcript = ''
    for phone in transcript.split(" "):
         
        if type(global_2_fin_map[phone]) is list:
            fin_transcript += global_2_fin_map[phone][0]
        else:
            fin_transcript += global_2_fin_map[phone]       
        
        fin_transcript += ' '
    return fin_transcript.rstrip()

In [7]:
# Adding the words/sentences to be translated to a **mapping** file, where the further mappings can be representated.
mapping = pd.read_csv('eval_text/words.txt',header=None,names=['sentence'])

In [8]:
# Reading the english dictionary. 
# English Dictionary should be a text file with each line representeda s follows
# language_dialect_word [\t tab] phone[space]phone[space]...

en_uk_dict = pd.read_csv('dict/en_uk_dict.txt',header=None,names=['word','en_rep'],sep='\t')#,index_col=['word'])

#Removing 'language_dialect_' part
word_after_remov_en_uk = en_uk_dict.word.apply(lambda en_uk_word: en_uk_word.split('_')[-1])
en_uk_dict.set_index(word_after_remov_en_uk,inplace=True)
en_uk_dict.drop('word',axis=1,inplace=True)
pprint('English Phoneme Representation samples:')
pprint(en_uk_dict.head())

en_uk_dict = en_uk_dict.to_dict()['en_rep']
#Add the English phoneme representation to the mapping dataframe            
mapping =mapping.assign(eng_transcript= mapping.sentence.apply(map_word_2_us_en_rep))
mapping = mapping.assign(no_transcript_flag=(mapping.eng_transcript==''))
pprint('The Dataframe after mapping english words to phonemes:')
pprint(mapping.head())

'English Phoneme Representation samples:'
          en_rep
word            
'cause    kʰ ɒ z
'cause    kʰ ə z
'dominee     spn
'em          ə m
'n            n̩
'The Dataframe after mapping english words to phonemes:'
  sentence     eng_transcript  no_transcript_flag
0     girl             g ɜː ɫ               False
1    hello           h ɛ l əʊ               False
2     book              b ʊ k               False
3    learn             l ɜː n               False
4  bye bye  b aɪ sil b aɪ sil               False


In [9]:
# Read the English to Global rep and make the mapping in the file.


'''Reading the pprint english to global map dictionary
{'': '',
 'ɒ': 'ɒ',
 .
 .
 .
 }
'''

eng_to_global_map = pd.read_pickle('mappings/en_uk_ph_dist_phones_map.pkl')
#eng_to_global_map = pd.read_pickle(args.eng_to_global_map)

#Writing the map back to file for reference.
'''
with open('mappings/eng_to_global_map.txt', 'w') as f:
    for key, value in eng_to_global_map.items():
        f.write(key)
        f.write('    ')
        f.write(value)
        f.write("\n")
'''

mapping = mapping.assign(global_transcript = mapping.eng_transcript.apply(eng_ph_2_global_ph))

In [22]:
global_phone_2_text = mapping.set_index('sentence')
global_phone_2_text = global_phone_2_text[['global_transcript']]

for speaker in [555,585,600]:
    global_phone_2_text = global_phone_2_text.assign(speaker_id = len(global_phone_2_text)*[speaker])
    global_phone_2_text.to_csv('eval_text/eng_game_words_phone_rep_{}.txt'.format(speaker), sep='|', header=False)

In [9]:
# Read the Fin to Global Map and compute Global to Finnish Map.

fin_to_global_map = pd.read_pickle('mappings/fin_2_global_phones_map.pkl')
#fin_to_global_map = pd.read_pickle(args.fin_to_global_map)
global_2_fin_map = dict([[value,key] for key,value in fin_to_global_map.items()])

global_phone_dist = pd.read_pickle('mappings/phone_distances.pickle')

global_ph = global_phone_dist['phones']
global_ph_dist = global_phone_dist['phone_distances']

global_ph_dist = pd.DataFrame(data=global_ph_dist,index=global_ph.values(),columns=global_ph.values())

In [11]:
distance_2_fin = global_ph_dist.loc[global_2_fin_map.keys()]

for ph in global_ph.values():
    if ph not in global_2_fin_map.keys():
        three_nearest_phones = list(distance_2_fin[ph].sort_values()[:3].index)
        global_2_fin_map[ph] = three_nearest_phones
global_2_fin_map['w'] = 'v'
global_2_fin_map['z'] = 's'

mapping = mapping.assign(fin_transcript=mapping.global_transcript.apply(glob_transcript_2_fin_nearest))
pprint(mapping.head)

<bound method NDFrame.head of          sentence             eng_transcript  no_transcript_flag  \
0            girl                     g ɜː ɫ               False   
1           hello                   h ɛ l əʊ               False   
2            book                      b ʊ k               False   
3           learn                     l ɜː n               False   
4         bye bye          b aɪ sil b aɪ sil               False   
5         come on          k ʌ m sil ɒ n sil               False   
6            leaf                     l iː f               False   
7           plant                 p l ɑː n t               False   
8            tree                     t ɹ iː               False   
9            rock                      ɹ ɒ k               False   
10           life                     l aɪ f               False   
11          wrong                      ɹ ɒ ŋ               False   
12          round                   ɹ aʊ n d               False   
13            wet 

In [13]:
mapping.head()

Unnamed: 0,sentence,eng_transcript,no_transcript_flag,global_transcript,fin_transcript
0,girl,g ɜː ɫ,False,g ɜ ɫ,gː i l
1,hello,h ɛ l əʊ,False,h ɛ l əʊ,h i l y
2,book,b ʊ k,False,b ʊ k,b y k
3,learn,l ɜː n,False,l ɜ n,l i n
4,bye bye,b aɪ sil b aɪ sil,False,b aɪ sil b aɪ sil,b æe b æe


In [16]:
distance_2_fin['z'].sort_values()[:3].index

Index(['d', 'l', 'n'], dtype='object')

In [15]:
# Code looking only at the global symbols which are used in representing english pronunciation
# Checking if all the mappings make sense.

global_phones_of_interest = list(eng_to_global_map.values())
#for ph in global_phones_of_interest:
glob_2_fin_of_interest = {ph:global_2_fin_map[ph] for ph in global_phones_of_interest}
#pprint(glob_2_fin_of_interest)


mapping[(mapping.global_transcript.apply(lambda x: ' ʃ ' in x))]

# Loading the finnish dictionary to verify the mapping is fine.
fin_dict = pd.read_csv('dict/fi_child_ipa_dict.txt',header=None,names=['word','fin_rep'],sep='\t',index_col=['word'])
#fin_dict[fin_dict.fin_rep.apply(lambda rep: ' v ' in rep)]
#fin_dict = fin_dict.to_dict()['fin_rep']

In [49]:
def phone_sentences(sentence):
    sentence = sentence.replace("  ","<sil>")
    sentence = sentence.replace(" ","")
    sentence = sentence.replace("<sil>"," ")
    return sentence

In [54]:
def mappings_2_text(index='all',speakers=[600], name='eng_game_words'):
    '''
    Input : 
    Index - Index of the the mapping file to be written into a evaluation file.
                Default : all the words
    speakers - List of speakers the be generated by tacotron.
    name - Name of the output text file
    
    Output:
    None, Saves the file in the eval_text/folder. 
    '''
    if index == 'all':
        transcript_2_text = mapping[['sentence','fin_transcript']]
    else:
        transcript_2_text = mapping.iloc[index][['sentence','fin_transcript']]

    transcript_2_text.set_index('sentence',inplace=True)
    transcript_2_text = transcript_2_text.assign(
                                fin_transcript=transcript_2_text.fin_transcript.apply(phone_sentences))
    for speaker in speakers:
        transcript_2_text = transcript_2_text.assign(speaker_id = len(transcript_2_text)*[speaker])
        
        from pathlib import Path
        my_file = Path('eval_text/{}_{}.txt'.format(name,speaker))
        
        if my_file.is_file():
            print('eval_text/{}_{}.txt already exists; not writing the file'.format(name,speaker))
        else:    
            transcript_2_text.to_csv('eval_text/{}_{}.txt'.format(name,speaker), sep='|', header=False)
    
    return 0

In [293]:
mapping.to_excel('mappings/word_wmappings_new.xlsx')

In [148]:
'''
with open('mappings/global_to_fin_distance_map.txt', 'w') as f:
    for key, value in global_2_fin_map.items():
        f.write(key)
        f.write('    ')
        f.write(str(value))
        f.write("\n")

with open('mappings/global_to_fin_map.txt', 'w') as f:
    for key, value in global_2_fin_map.items():
        f.write(key)
        f.write('    ')
        f.write(value)
        f.write("\n")
'''

In [55]:
#Savings words to genrate which have same phonemic representation in Finnish and English.

indexes = mapping[mapping.eng_transcript == mapping.fin_transcript].index
np.random.seed(2)
speakers = np.random.randint(low = 550, high =601, size=10)
mappings_2_text(index=indexes, speakers=speakers,name='eng_Fin_same_rep')

eval_text/eng_Fin_same_rep_590.txt already exists; not writing the file


0

In [56]:
#Savings words to genrate which have same phonemic representation in Finnish and English.

indexes = mapping[mapping.eng_transcript == mapping.fin_transcript].index
np.random.seed(2)
speakers = np.random.randint(low = 1, high =550, size=10)

mappings_2_text(index=indexes, speakers=speakers,name='adult_eng_Fin_same_rep')

0

In [53]:
d = mapping.head()

a = d.fin_transcript
a.aplly()