In [197]:
from wiktionaryparser import WiktionaryParser
import re
import pandas as pd
from tqdm import auto as tqdm
import json

In [2]:
# Single shared parser instance; get_ipa_for_word() calls parser.fetch() on it.
parser = WiktionaryParser()

In [3]:
# Raw strings: sequences such as "\s", "\(" and "\/" are invalid escapes in a
# normal string literal and trigger a DeprecationWarning/SyntaxWarning.
# Splits a pronunciation line into the optional text before the literal "IPA:"
# marker (group `context`) and everything after it (group `ipa`).
pronunciation_regexp = re.compile(r"(?P<context>.*\s+)?IPA:\s+(?P<ipa>.*)")
# Finds each /.../ transcription, together with an optional parenthesised
# qualifier (plus trailing whitespace) placed directly before it.
ipa_regexp = re.compile(r"(\(.*\)\s+)?\/(.*?)\/")

def clean_pronunciations(pronunciations):
    """Yield the cleaned records for every pronunciation string in turn.

    Each element of ``pronunciations`` is passed to ``clean_pronunciation``
    and whatever that generator produces is re-yielded, in order.
    """
    for entry in pronunciations:
        yield from clean_pronunciation(entry)

def clean_pronunciation(pronunciation):
    """Yield one ``{"context", "ipa"}`` dict per /.../ transcription found.

    ``pronunciation_regexp`` splits the string around the "IPA:" marker;
    ``ipa_regexp`` then pulls every slash-delimited transcription (and its
    optional parenthesised qualifier) out of the part after the marker.
    The yielded context is "<text before IPA:> | <qualifier>", stripped.
    """
    for line_match in pronunciation_regexp.finditer(pronunciation):
        # An optional group that did not take part in the match is None here;
        # normalise it to "" so the formatted context has no "None" in it.
        leading = line_match.group("context") or ""
        transcriptions = line_match.group("ipa")
        for qualifier, transcription in ipa_regexp.findall(transcriptions):
            yield {
                "context": f"{leading} | {qualifier}".strip(),
                "ipa": transcription.strip(),
            }
            
def get_ipa_for_word(word, language):
    """Yield cleaned IPA records for every entry ``parser.fetch`` returns.

    For each fetched entry, the strings under ``['pronunciations']['text']``
    are run through ``clean_pronunciations`` and the results re-yielded.
    """
    # Use a distinct loop name so the `word` argument is not shadowed.
    for entry in parser.fetch(word, language):
        pronunciation_lines = entry['pronunciations']['text']
        yield from clean_pronunciations(pronunciation_lines)

def get_dataset(words, language):
    """Build a DataFrame of pronunciation records for ``words``.

    Every record produced by ``get_ipa_for_word`` is copied and tagged with
    the word it belongs to (key ``"word"``). Progress over ``words`` is shown
    with a tqdm bar.
    """
    records = []
    for word in tqdm.tqdm(words):
        for item in get_ipa_for_word(word, language):
            # Copy the record and attach the originating word.
            records.append({**item, "word": word})
    return pd.DataFrame(records)

In [4]:
# Load the word list; the cells visible here only read its `word` column.
words = pd.read_csv("oxford-5k.csv")

In [5]:
# Plain Python list of the `word` column (the whole column, despite the name).
words_sample = words["word"].tolist()

In [6]:
# Build the pronunciation table for every entry in words_sample.
# get_dataset ends up calling parser.fetch once per word — presumably the
# slow, network-bound step of the notebook (TODO confirm).
dataset_sample = get_dataset(words_sample, "english")

  0%|          | 0/5948 [00:00<?, ?it/s]

In [7]:
# Show the collected frame (pandas elides the middle rows of a long frame).
dataset_sample

Unnamed: 0,context,ipa,word
0,"(UK, US) |",eɪ,a
1,(General Australian) |,æɪ,a
2,(phoneme) |,æ,a
3,(phoneme) |,ɑː,a
4,(phoneme) |,eɪ,a
...,...,...,...
16359,"(Received Pronunciation, General Australian, N...",ˈzɪə.ɹəʊ,zero
16360,"(General American) enPR: zîrʹō('), zēʹrō('), |",ˈzɪɚ(ˌ)oʊ,zero
16361,"(General American) enPR: zîrʹō('), zēʹrō('), |",ˈzi(ˌ)ɹoʊ,zero
16362,"(General American) enPR: zōn, |",zoʊn,zone


In [8]:
# Print a readable peek at up to 10 random rows.
# - random_state pins the draw so Restart & Run All reproduces the same rows.
# - n is capped at the frame length because sample() raises ValueError when
#   asked for more rows than exist (without replacement).
for row in dataset_sample.sample(n=min(10, len(dataset_sample)), random_state=0).itertuples():
    print(f"{row.word}, given the context {row.context} is pronounced like /{row.ipa}/")

list, given the context enPR: lĭst,  | is pronounced like /lɪst/
music, given the context (US)  | is pronounced like /ˈmjuzɪk/
tell, given the context (UK, US) enPR: tĕl,  | is pronounced like /tʰɛɫ/
slash, given the context (US)  | is pronounced like /slæʃ/
secular, given the context (UK)  | is pronounced like /ˈsɛkjʊlə/
survey, given the context (UK)  | is pronounced like /sɜːˈveɪ/
home, given the context (US) enPR: hōm,  | is pronounced like /hoʊm/
real, given the context enPR: rēəl, riəl, rēl,  | is pronounced like /ˈɹiːəl/
call, given the context (US, cot–caught merger)  | is pronounced like /kɑl/
work, given the context (Liverpool)  | is pronounced like /wɛːk/


In [50]:
# Drop exact duplicate rows, then renumber the index from 0.
dataset = (
    dataset_sample
    .drop_duplicates()
    .reset_index(drop=True)
)

In [174]:
# Raw strings avoid the invalid "\(" / "\W" escapes of a normal literal.
# Parenthesised labels such as "(UK, US)".
context_regexp = re.compile(r"\((.*?)\)")
# An enPR respelling segment: everything after "enPR:" up to the "|" separator
# that clean_pronunciation puts between the two context halves (or to the end
# of the string). Stopping at "|" keeps the capture from swallowing the
# separator and whatever follows it.
enpr_regexp = re.compile(r"enPR:([^|]*)")

def clean_context(context):
    """Condense a raw context string into a comma-separated summary.

    Parameters
    ----------
    context : str
        Raw context as produced by ``clean_pronunciation``, e.g.
        ``"(General American) enPR: zōn, |"``.

    Returns
    -------
    str
        The parenthesised labels in order of appearance, followed by one
        ``"enPR: <respelling>"`` item per enPR segment, joined with ``", "``.
        Returns ``""`` when neither is present.
    """
    # Look for labels only outside the enPR segments: respellings can contain
    # parentheses of their own (e.g. "zîrʹō(')") that are not labels.
    labels = context_regexp.findall(enpr_regexp.sub("", context))
    for raw in enpr_regexp.findall(context):
        # Trim the surrounding whitespace and the trailing list comma.
        respelling = raw.strip(" \t\r\n,")
        if respelling:
            labels.append(f"enPR: {respelling}")
    return ", ".join(labels)

# Add a `context_proc` column holding clean_context() of each raw context.
dataset = dataset.assign(
    context_proc=lambda frame: frame.context.apply(clean_context)
)

In [175]:
# Persist the processed table as CSV.
# NOTE(review): to_csv writes the index as an extra unnamed first column by
# default; pass index=False if consumers do not expect it — confirm first.
dataset.to_csv("dataset.csv")

In [176]:
# Persist the same table as Parquet.
# NOTE(review): the output recorded for this cell is
# "NameError: name 'dataset' is not defined", i.e. it was last run in a kernel
# where `dataset` had not been created; run the cells above first.
dataset.to_parquet("dataset.parquet")

NameError: name 'dataset' is not defined