## Tables

Brief bit of code for converting the database info into a README or latex table.



In [1]:
import datasets
import os
import pandas as pd
import json
import numpy as np

config_file = '../childes_processor/phonemizer_config.json'
childes_folder = '../CHILDES-dataset'

collection_map = {
    'basque' : 'Other/Basque',
    'dutch' : 'DutchAfricaans/Dutch',
    'englishNA' : 'Eng-NA',
    'englishUK' : 'Eng-UK',
    'indonesian' : 'EastAsian/Indonesian',
    'mandarin' : 'Chinese/Mandarin',
    'serbian' : 'Slavic/Serbian',
    'estonian' : 'Other/Estonian',
    'cantonese' : 'Chinese/Cantonese',
    'polish' : 'Slavic/Polish',
    'swedish' : 'Scandinavian/Swedish',
    'portuguesept' : 'Romance/Portuguese',
    'portuguesebr' : 'Romance/Portuguese',
    'korean' : 'EastAsian/Korean',
    'italian' : 'Romance/Italian',
    'catalan' : 'Romance/Catalan',
    'croatian' : 'Slavic/Croatian',
    'welsh' : 'Celtic/Welsh',
    'icelandic' : 'Scandinavian/Icelandic',
    'danish' : 'Scandinavian/Danish',
    'norwegian' : 'Scandinavian/Norwegian',
    'hungarian' : 'Other/Hungarian',
    'romaninian' : 'Other/Romanian',
    'irish' : 'Celtic/Irish',
    'turkish' : 'Other/Turkish',
    'quechua' : 'Other/Quechua',
    'farsi' : 'Other/Farsi',
}

PHONEME_SETS = {
    'basque' : 2161,
    'cantonese' : 2309,
    'catalan' : 2555,
    'croatian' : 1139,
    'danish' : 2265,
    'dutch' : 2405,
    'englishna' : 2175,
    'englishuk' : 2252,
    'estonian' : 2181,
    'farsi' : 516,
    'french' : 2269,
    'german' : 2398,
    'hungarian' : 2191,
    'icelandic' : 2568,
    'indonesian' : 1690,
    'italian' : 1145,
    'irish' : 2521,
    'japanese' : 2196,
    'korean' : 423,
    'mandarin' : 2457,
    'norwegian' : 499,
    'polish' : 1046,
    'romanian' : 2443,
    'serbian' : 2499,
    'spanish' : 164,
    'swedish' : 1150,
    'portuguesept' : 2206,
    'portuguesebr' : 2207,
    'quechua' : 104,
    'turkish' : 2217,
    'welsh' : 2406,
}

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
phoible = pd.read_csv('../../../data/phoible.csv')
phonemes = phoible.Phoneme.unique()
TONES = '˧˥˩̰˨˩˦'

def get_phoneme_set(lines):
    token_counts = {}
    for line in lines:
        # Our tool combines tone markers with the preceeding vowel, we remove tone markers in our comparison so that we don't get many "unknown phonemes" consisting of a known vowel + tone marker.
        #line = line.replace('˧˥', '').replace('˧˩̰', '').replace('˩˧', '').replace('˨', '').replace('˥', '').replace('˧', '').replace('˧˥', '').replace('˧˩̰', '').replace('˩˧','').replace('˩','').replace('˦','')
        tokens = line.strip().split()
        for token in tokens:
            if token == 'WORD_BOUNDARY':
                continue
            if token not in token_counts:
                token_counts[token] = 0
            token_counts[token] += 1
    vowels = []
    consonants = []
    other = []
    for phoneme in token_counts:
        cmp_phoneme = phoneme
        if phoneme not in phonemes:
            has_tones = False
            for tone in TONES:
                if tone in phoneme:
                    has_tones = True
                    cmp_phoneme = cmp_phoneme.replace(tone, '')
            if not has_tones or cmp_phoneme not in phonemes:
                print(f'{phoneme} not in phoible')
                other.append(phoneme)
                continue
        if phoible[phoible.Phoneme == cmp_phoneme].SegmentClass.iloc[0] == 'vowel':
            vowels.append(phoneme)
        elif phoible[phoible.Phoneme == cmp_phoneme].SegmentClass.iloc[0] == 'consonant':
            consonants.append(phoneme)
        else:
            other.append(phoneme)

    return vowels, consonants, other

  phoible = pd.read_csv('../../../data/phoible.csv')


In [3]:
columns = ['Language', 'CHILDES Collection', 'Corpora', 'Backend', 'Language Code', 'Inventory ID', 'Description',
           'Speakers', 'Utterances', 'Words', 'Phonemes',
           '% Child', 'Phoneme Types', 'Consonants', 'Vowels']

data = {column: [] for column in columns}

# load json config
config = json.load(open(config_file))

for config_name in datasets.get_dataset_config_names(childes_folder)[::-1]:
    print('\n' + config_name + '...')
    
    dataset = datasets.load_dataset('../CHILDES-dataset', config_name)['train']

    config_name = 'EnglishNA' if config_name == 'English' else config_name
    language = config_name
    config_name = config_name.lower()

    collection = collection_map[config_name] if config_name in collection_map else language
    corpora = len(set(dataset['corpus_id']))
    backend = config[config_name]['backend']
    lang_code = config[config_name]['language']
    inventory_id = PHONEME_SETS[config_name]
    num_corpora = len(set(dataset['corpus_id']))
    speakers = len(set(dataset['speaker_id']))
    total_utterances = len(dataset)
    total_words = sum([utterance.count('WORD_BOUNDARY') for utterance in list(dataset['phonemized_utterance'])])
    total_phonemes = sum([len(utterance.split()) for utterance in dataset['phonemized_utterance']]) - total_words
    percentage_child = 100 * len([c for c in dataset['is_child'] if c]) / total_utterances
    vowels, consonants, other = get_phoneme_set(dataset['phonemized_utterance'])
    n_phonemes = len(set(vowels + consonants + other))
    description = f"Taken from {num_corpora} corpora in the {collection} collection of CHILDES and phonemized using `{backend}` with language code `{lang_code}`."

    data['Language'].append(language)
    data['CHILDES Collection'].append(collection)
    data['Corpora'].append(num_corpora)
    data['Backend'].append(backend)
    data['Language Code'].append(lang_code)
    data['Inventory ID'].append(inventory_id)
    data['Description'].append(description)
    data['Speakers'].append(speakers)
    data['Utterances'].append(total_utterances)
    data['Words'].append(total_words)
    data['Phonemes'].append(total_phonemes)
    data['% Child'].append(percentage_child)
    data['Phoneme Types'].append(n_phonemes)
    data['Consonants'].append(len(consonants))
    data['Vowels'].append(len(vowels))


Polish...

Serbian...

Romanian...

PortugueseBr...

PortuguesePt...

Italian...

Catalan...

Quechua...

Norwegian...

Swedish...

Korean...

Welsh...

Irish...

Indonesian...

Icelandic...

Farsi...

Turkish...

Hungarian...

Basque...

Danish...

Croatian...

Estonian...

Cantonese...

Japanese...

Mandarin...

Dutch...

Spanish...

German...

French...

EnglishUK...
ɔːɹ not in phoible

English...


In [4]:
def create_readme_table(columns):
    text = " | ".join(columns) + "\n"
    text += "|:----" * len(columns) + "|"

    # Sort data by number of utterances
    languages = [x for _, x in sorted(zip(data['Phonemes'], data['Language']))][::-1]

    for language in languages:
        idx = data['Language'].index(language)
        text += "\n"
        for column in columns:
            if isinstance(data[column][idx], int):
                text += f"| {data[column][idx]:,}"
            elif isinstance(data[column][idx], float):
                text += f"| {data[column][idx]:.2f}"
            else:
                text += f"| {data[column][idx]}"
    return text

def create_latex_table(columns):
    text = "\\textbf{" + "} & \\textbf{".join(columns) + "} \\\\ \n\\midrule"
    # Sort data by number of utterances
    languages = [x for _, x in sorted(zip(data['Phonemes'], data['Language']))][::-1]

    for language in languages:
        idx = data['Language'].index(language)
        text += "\n"
        for i, column in enumerate(columns):
            if column == "Inventory ID":
                url = "https://phoible.org/inventories/view/" + str(data['Inventory ID'][idx])
                text += "\\href{" + url + "}{" + str(data[column][idx]) + "}"
            elif column == 'CHILDES Collection':
                url = "https://childes.talkbank.org/access/" + data['CHILDES Collection'][idx].split('/')[0].replace('lish', '-').lower()
                text += "\\href{" + url + "}{" + str(data[column][idx]) + "} (" + str(data['Corpora'][idx]) + ")"
            elif column == 'Backend':
                text += f"\\texttt{{{data[column][idx]}}}"
            elif isinstance(data[column][idx], int):
                text += f"{data[column][idx]:,}"
            elif isinstance(data[column][idx], float):
                text += f"{data[column][idx]:.2f}"
            else:
                text += f"{data[column][idx]}"
            if i < len(columns) - 1:
                text += " & "
        text += ' \\\\'
    text = text.replace('_', '\\_')
    text = text.replace('%', '\\%')
    return text

In [5]:
print(create_readme_table(['Language', 'Description', 'Speakers', 'Utterances', 'Words', 'Phonemes', '% Child']))

Language | Description | Speakers | Utterances | Words | Phonemes | % Child
|:----|:----|:----|:----|:----|:----|:----|
| EnglishNA| Taken from 49 corpora in the EnglishNA collection of CHILDES and phonemized using `phonemizer` with language code `en-us`.| 3,687| 2,564,614| 9,993,744| 30,986,218| 35.83
| EnglishUK| Taken from 16 corpora in the EnglishUK collection of CHILDES and phonemized using `phonemizer` with language code `en-gb`.| 869| 2,043,115| 7,147,541| 21,589,842| 39.00
| German| Taken from 10 corpora in the German collection of CHILDES and phonemized using `epitran` with language code `deu-Latn`.| 829| 1,525,559| 5,825,166| 21,442,576| 43.61
| Japanese| Taken from 11 corpora in the Japanese collection of CHILDES and phonemized using `phonemizer` with language code `ja`.| 489| 998,642| 2,970,674| 11,985,729| 44.20
| Indonesian| Taken from 1 corpora in the EastAsian/Indonesian collection of CHILDES and phonemized using `epitran` with language code `ind-Latn`.| 438| 813,795| 2

In [6]:
print(create_latex_table(['Language', 'CHILDES Collection', 'Backend', 'Inventory ID', 'Language Code',  'Words', 'Phonemes', '% Child']))

\textbf{Language} & \textbf{CHILDES Collection} & \textbf{Backend} & \textbf{Inventory ID} & \textbf{Language Code} & \textbf{Words} & \textbf{Phonemes} & \textbf{\% Child} \\ 
\midrule
EnglishNA & \href{https://childes.talkbank.org/access/eng-na}{EnglishNA} (49) & \texttt{phonemizer} & \href{https://phoible.org/inventories/view/2175}{2175} & en-us & 9,993,744 & 30,986,218 & 35.83 \\
EnglishUK & \href{https://childes.talkbank.org/access/eng-uk}{EnglishUK} (16) & \texttt{phonemizer} & \href{https://phoible.org/inventories/view/2252}{2252} & en-gb & 7,147,541 & 21,589,842 & 39.00 \\
German & \href{https://childes.talkbank.org/access/german}{German} (10) & \texttt{epitran} & \href{https://phoible.org/inventories/view/2398}{2398} & deu-Latn & 5,825,166 & 21,442,576 & 43.61 \\
Japanese & \href{https://childes.talkbank.org/access/japanese}{Japanese} (11) & \texttt{phonemizer} & \href{https://phoible.org/inventories/view/2196}{2196} & ja & 2,970,674 & 11,985,729 & 44.20 \\
Indonesian & \href{

In [7]:
# Sort data by number of utterances
languages = [x for _, x in sorted(zip(data['Phonemes'], data['Language']))][::-1]
data = {column: [data[column][data['Language'].index(language)] for language in languages] for column in data}

In [8]:
b = {language : [i * (1-c/100) for i,c in zip(data['Words'], data['% Child'])][j] for j, language in enumerate(languages)}

In [9]:
[language for language, i in b.items() if i > 180000]

['EnglishNA',
 'EnglishUK',
 'German',
 'Japanese',
 'Indonesian',
 'French',
 'Spanish',
 'Mandarin',
 'Dutch',
 'Polish',
 'Serbian',
 'Estonian',
 'Welsh',
 'Cantonese',
 'Swedish',
 'PortuguesePt',
 'Italian',
 'Croatian',
 'Catalan',
 'Icelandic']

In [10]:
# Get languages that have at least 700,000 tokens
b = {language : [i * (1-c/100) for i,c in zip(data['Phonemes'], data['% Child'])][j] for j, language in enumerate(languages)}
c = [language for language, i in b.items() if i > 18000000]
print(c, len(c))

['EnglishNA'] 1
