# Training the CHILDES Tokenizer

Using the phonemes in our CHILDES dataset, we build a word-level tokenizer whose vocabulary is the set of observed phonemes and which simply splits its input on whitespace.

In [1]:
import pandas as pd
import re

from datasets import load_dataset, get_dataset_config_names
from tokenizers import Tokenizer, models, normalizers, pre_tokenizers, trainers, processors, decoders
from transformers import GPT2TokenizerFast

  from .autonotebook import tqdm as notebook_tqdm
None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.


In [2]:
# Load the PHOIBLE table; build_vocabulary uses its `Phoneme` column to decide
# which tokens count as valid phonemes.
# low_memory=False makes pandas parse the file in a single pass, so each column
# gets one consistently inferred dtype rather than chunk-by-chunk guesses
# (presumably the source of the warning this cell emitted on load).
phoible = pd.read_csv('../../../data/phoible.csv', low_memory=False)
phoible_phonemes = phoible.Phoneme.unique()

  phoible = pd.read_csv('../../../data/phoible.csv')


In [3]:
# A token must occur strictly more than MIN_COUNT times to enter the vocabulary.
MIN_COUNT = 10
# One or more consecutive stress-like marks: primary stress (ˈ), secondary
# stress (ˌ), ASCII apostrophe or hyphen.
STRESS_RE = re.compile(r"[ˈˌ'-]+")

def build_vocabulary(datasets, column='phonemized_utterance', allow_non_phoible=False, allow_stressed_tokens=False, phonemes=None):
    """Count whitespace-separated tokens and build a ``token -> id`` vocabulary.

    Args:
        datasets: Iterable of datasets; ``dataset[column]`` must yield strings.
        column: Name of the column holding the space-separated utterances.
        allow_non_phoible: If True, keep every sufficiently frequent token
            without consulting the phoneme inventory at all.
        allow_stressed_tokens: If True, a token that is not itself in the
            inventory is still kept when removing its stress marks
            (``STRESS_RE``) yields a phoneme that is in the inventory.
        phonemes: Optional iterable of valid phonemes. Defaults to the
            notebook-level ``phoible_phonemes``.

    Returns:
        dict mapping each token to its integer id. Ids 0-3 are reserved for
        ``UNK``, ``PAD``, ``WORD_BOUNDARY`` and ``UTT_BOUNDARY``; remaining
        tokens get consecutive ids in order of first appearance.

    Side effects:
        Prints the rejected tokens (with counts), the vocabulary and its size.
    """
    vocab = {'UNK' : 0, 'PAD' : 1, 'WORD_BOUNDARY' : 2, 'UTT_BOUNDARY' : 3}
    unk_tokens = []

    # The inventory is only needed when filtering; a set gives O(1) lookups
    # instead of scanning the whole array for every candidate token.
    known_phonemes = None
    if not allow_non_phoible:
        known_phonemes = set(phoible_phonemes if phonemes is None else phonemes)

    token_counts = {}
    for dataset in datasets:
        for line in dataset[column]:
            for token in line.strip().split():
                token_counts[token] = token_counts.get(token, 0) + 1

    # Keep tokens seen more than MIN_COUNT times that are valid phonemes (or
    # all of them when allow_non_phoible is set); collect the rest for reporting.
    for token, count in token_counts.items():
        if count <= MIN_COUNT or token in vocab:
            continue
        if allow_non_phoible or token in known_phonemes:
            vocab[token] = len(vocab)
        elif allow_stressed_tokens and STRESS_RE.sub('', token) in known_phonemes:
            # Stressed variant of a known phoneme. Checking the stripped base
            # stops a stress mark from smuggling an unknown symbol past the filter.
            vocab[token] = len(vocab)
        else:
            unk_tokens.append(token)

    print('Tokens not found in phoible: ', {token: token_counts[token] for token in unk_tokens})
    print('Vocab: ', vocab)
    print('Vocab size: ', len(vocab))
    return vocab

def build_phoneme_tokenizer(vocab, add_stress_replacer=False):
    """Build a whitespace-splitting word-level tokenizer wrapped as ``GPT2TokenizerFast``.

    Args:
        vocab: Mapping from token to id, containing the ``UNK``, ``PAD``,
            ``WORD_BOUNDARY`` and ``UTT_BOUNDARY`` entries.
        add_stress_replacer: If True, every token containing a ``STRESS_RE``
            match is rewritten as ``"ˈ"`` followed by the token with all such
            marks removed. Tokens that become identical are merged, ids are
            reassigned in iteration order (the incoming ids are discarded),
            and a ``Replace`` normalizer is registered for each token whose
            spelling changed.

    Returns:
        A ``GPT2TokenizerFast`` that prepends ``UTT_BOUNDARY`` to each sequence
        and uses ``UTT_BOUNDARY`` as bos/eos, ``PAD`` as pad and ``UNK`` as unk.
    """
    normalizer_steps = []
    if add_stress_replacer:
        # Collapse all stress variants onto one primary-stress spelling.
        merged_vocab = {}
        for original in vocab:
            canonical = original
            if STRESS_RE.search(original) is not None:
                canonical = "ˈ" + STRESS_RE.sub('', original)
                if canonical != original:
                    normalizer_steps.append(normalizers.Replace(original, canonical))
            # Only the first token that maps to a given spelling claims an id.
            merged_vocab.setdefault(canonical, len(merged_vocab))
        vocab = merged_vocab
        print('Using only primary stress markers...')
        print('New vocab: ', vocab)
        print('New vocab size: ', len(vocab))
    # Surrounding whitespace is always stripped, after any stress rewrites.
    normalizer_steps.append(normalizers.Strip())

    tokenizer = Tokenizer(models.WordLevel(vocab=vocab, unk_token='UNK'))
    tokenizer.normalizer = normalizers.Sequence(normalizer_steps)
    tokenizer.add_special_tokens(["UNK", "PAD", "UTT_BOUNDARY", "WORD_BOUNDARY"])
    # WhitespaceSplit splits on whitespace only, so multi-character phonemes stay intact.
    tokenizer.pre_tokenizer = pre_tokenizers.WhitespaceSplit()
    utt_boundary_id = tokenizer.token_to_id("UTT_BOUNDARY")
    tokenizer.post_processor = processors.TemplateProcessing(
        single="UTT_BOUNDARY $A",
        pair="UTT_BOUNDARY $A UTT_BOUNDARY $B:1",
        special_tokens=[("UTT_BOUNDARY", utt_boundary_id)],
    )

    return GPT2TokenizerFast(
        tokenizer_object=tokenizer,
        bos_token='UTT_BOUNDARY',
        eos_token='UTT_BOUNDARY',
        pad_token='PAD',
        unk_token='UNK',
    )


# Training a Tokenizer for each language in CHILDES

We create a separate tokenizer for each language so that each vocabulary contains only that language's phonemes. For most languages we remove any tokens not found in Phoible. We do not do this for Mandarin or Cantonese because, for these languages, we merge each tone marker with its preceding vowel into a single phoneme, whereas Phoible treats tone markers as independent symbols.

In [4]:
# Each configuration of the CHILDES-stress dataset is treated as one language.
CHILDES_STRESS_REPO = 'phonemetransformers/CHILDES-stress'
languages = get_dataset_config_names(CHILDES_STRESS_REPO)
#languages = ['Mandarin', 'Cantonese']
print('Languages:', languages)
# Train split of every language, keyed by language name.
datasets = {
    config_name: load_dataset(CHILDES_STRESS_REPO, config_name, split='train')
    for config_name in languages
}

Downloading readme: 100%|██████████| 435/435 [00:00<00:00, 1.35MB/s]


Languages: ['English', 'French', 'Dutch']


Downloading data: 100%|██████████| 885M/885M [00:26<00:00, 33.9MB/s] 
Generating train split: 2564614 examples [00:12, 210110.14 examples/s]
Downloading data: 100%|██████████| 198M/198M [00:05<00:00, 38.0MB/s] 
Generating train split: 721121 examples [00:03, 232560.48 examples/s]
Downloading data: 100%|██████████| 133M/133M [00:03<00:00, 35.8MB/s] 
Generating train split: 403472 examples [00:01, 217400.59 examples/s]


In [5]:
# Build one stress-aware phoneme tokenizer per language and upload it to the Hub.
for language, dataset in datasets.items():
    print(f'\nTraining tokenizer for {language}...')
    # Mandarin and Cantonese merge tone with the preceding vowel, so their
    # tokens are kept even when they are not in Phoible.
    allow_non_phoible = language in ('Mandarin', 'Cantonese')
    vocab = build_vocabulary(
        [dataset],
        allow_non_phoible=allow_non_phoible,
        allow_stressed_tokens=True,
    )
    tokenizer = build_phoneme_tokenizer(vocab, add_stress_replacer=True)
    hub_repo = f"phonemetransformers/CHILDES-{language}-phoneme-tokenizer-stress"
    tokenizer.push_to_hub(hub_repo)
    print(f'Tokenizer for {language} pushed to the hub.')

# Disabled: a single tokenizer shared by all languages.
# print(f'\nTraining tokenizer for all languages...')
# vocab = build_vocabulary(datasets.values())
# tokenizer = build_phoneme_tokenizer(vocab)
# tokenizer.push_to_hub("phonemetransformers/CHILDES-phoneme-tokenizer")
# print('Done.')



Training tokenizer for English...
Tokens not found in phoible:  {}
Vocab:  {'UNK': 0, 'PAD': 1, 'WORD_BOUNDARY': 2, 'UTT_BOUNDARY': 3, 'd̠ʒ': 4, 'ˈʌ': 5, 's': 6, 't': 7, 'l': 8, 'ˈaɪ': 9, 'k': 10, 'j': 11, 'ʊ': 12, 'ɹ': 13, 'b': 14, 'ˈʊ': 15, 'æ': 16, 'h': 17, 'ˈoʊ': 18, 'm': 19, 'ˈiː': 20, 'ð': 21, 'ɛ': 22, 'z': 23, 'ʌ': 24, 'f': 25, 'ˈeɪ': 26, 'w': 27, 'ɪ': 28, 'ɡ': 29, 'ˈæ': 30, 'ˈɑ': 31, 'ə': 32, 'p': 33, 'ˈuː': 34, 'ˈɛ': 35, 'i': 36, 'ˌuː': 37, 'ɑ': 38, 'θ': 39, 'ˈɪ': 40, 'ŋ': 41, 'iː': 42, 'uː': 43, 'ɔ': 44, 'aɪ': 45, 'ˈɔɪ': 46, 'n': 47, 'd': 48, 'ˈɔ': 49, 'ˈaʊ': 50, 'v': 51, 'ˈɜː': 52, 'ˌʌ': 53, 't̠ʃ': 54, 'ˌɔ': 55, 'oʊ': 56, 'ˌoʊ': 57, 'ˌʊ': 58, 'ˌeɪ': 59, 'ʃ': 60, 'ˌɛ': 61, 'ɜː': 62, 'ˌɑ': 63, 'ˌaʊ': 64, 'ˌaɪ': 65, 'ˌə': 66, 'ˌiː': 67, 'ˌɪ': 68, 'eɪ': 69, 'iə': 70, 'ˈiə': 71, 'ˌæ': 72, 'ˌɜː': 73, 'aʊ': 74, 'ˌɔɪ': 75, 'ɔɪ': 76, 'ˌiə': 77, 'ʒ': 78, 'ˈə': 79, 'x': 80}
Vocab size:  81
Using only primary stress markers...
New vocab:  {'UNK': 0, 'PAD': 1, 'WORD_BOUNDARY': 2, 'UTT_B

No files have been modified since last commit. Skipping to prevent empty commit.


Tokenizer for French pushed to the hub.

Training tokenizer for Dutch...
Tokens not found in phoible:  {}
Vocab:  {'UNK': 0, 'PAD': 1, 'WORD_BOUNDARY': 2, 'UTT_BOUNDARY': 3, 'z': 4, 'ˈoː': 5, 'j': 6, 'ˈãː': 7, 'ɦ': 8, 'ɾ': 9, 'd': 10, 'i': 11, 'ɛ': 12, 'p': 13, 'ɪ': 14, 'k': 15, 'ˈɑ': 16, 'l': 17, 'ˈɛː': 18, 'n': 19, 's': 20, 'v': 21, 'ə': 22, 'ˈɛi': 23, 'ʋ': 24, 'ˈɛ': 25, 't': 26, 'm': 27, 'ɣ': 28, 'ˈʏ': 29, 'ãː': 30, 'oː': 31, 'ˈɔ': 32, 'x': 33, 'ɑ': 34, 'ˈu': 35, 'f': 36, 'ŋ': 37, 'ˈøː': 38, 'ɔ': 39, 'ˈi': 40, 'b': 41, 'ˌãː': 42, 'ɔː': 43, 'ˈɪ': 44, 'ˌi': 45, 'ɛː': 46, 'ˈʌu': 47, 'ɛi': 48, 'ˌɛi': 49, 'ˈy': 50, 'ˌʌu': 51, 'ʏ': 52, 'ˈœy': 53, 'tʲ': 54, 'ˌɛ': 55, 'ˌɑ': 56, 'ʌu': 57, 'u': 58, 'ˌʏ': 59, 'ˈɔː': 60, 'œy': 61, 'ˌɛː': 62, 'w': 63, 'ˌu': 64, 'y': 65, 'ˌɪ': 66, 'ˌoː': 67, 'ˌə': 68, 'ˌøː': 69, 'ˌɔ': 70, 'ʃ': 71, 'ˈə': 72, 't̠ʃ': 73, 'ɲ': 74, 'ˌy': 75, 'ʒ': 76, 'ˌœy': 77, 'ˌɔː': 78, 'ˈiː': 79, 'ɡ': 80, 'øː': 81, 'd̠ʒ': 82, 'ã': 83}
Vocab size:  84
Using only primary stress m

In [6]:
from transformers import AutoTokenizer

def check_tokenizer(tokenizer):
    """Return True if every vocabulary entry encodes back to exactly its own id.

    Each token in ``tokenizer.vocab`` is encoded on its own; the first id of
    the result is dropped and the remainder must equal ``[token_id]``.

    Background: the ``Whitespace`` pre-tokenizer does not treat tone symbols as
    part of a word, so for the Cantonese and Mandarin tokenizers it split
    phonemes like 'a˥' in two and produced two UNK tokens. The
    ``WhitespaceSplit`` pre-tokenizer, which behaves like ``str.split()``,
    avoids this.
    """
    return all(
        tokenizer.encode(token)[1:] == [token_id]
        for token, token_id in tokenizer.vocab.items()
    )

# Re-download each tokenizer and verify it round-trips its own vocabulary.
# The repo id carries the "-stress" suffix so that the tokenizers checked here
# are the ones this notebook pushes, not the separately published
# non-stress tokenizers.
for language in datasets.keys():
    t = AutoTokenizer.from_pretrained(f'phonemetransformers/CHILDES-{language}-phoneme-tokenizer-stress')
    is_ok = check_tokenizer(t)
    print(f'{language} tokenizer is ok: {is_ok}')

English tokenizer is ok: True
French tokenizer is ok: True
Dutch tokenizer is ok: True


# BPE Tokenizers for CHILDES

In [7]:
# Train a byte-level BPE tokenizer on the orthographic English glosses.
dataset = load_dataset('phonemetransformers/CHILDES', 'English', split='train')

tokenizer = Tokenizer(models.BPE())
# Normalisation, applied in this order: NFD decomposition, lowercasing,
# whitespace stripping, accent stripping.
gloss_normalizers = [
    normalizers.NFD(),
    normalizers.Lowercase(),
    normalizers.Strip(),
    normalizers.StripAccents(),
]
tokenizer.normalizer = normalizers.Sequence(gloss_normalizers)
tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=True)

trainer = trainers.BpeTrainer(
    vocab_size=8192,
    special_tokens=["UTT_BOUNDARY", "PAD", "UNK"],
)
tokenizer.train_from_iterator(dataset['processed_gloss'], trainer=trainer)

tokenizer.decoder = decoders.ByteLevel()
# Prefix every encoded sequence with the utterance-boundary token.
utt_boundary_id = tokenizer.token_to_id("UTT_BOUNDARY")
tokenizer.post_processor = processors.TemplateProcessing(
    single="UTT_BOUNDARY $A",
    pair="UTT_BOUNDARY $A UTT_BOUNDARY $B:1",
    special_tokens=[("UTT_BOUNDARY", utt_boundary_id)],
)






In [8]:
# Sanity-check the trained BPE tokenizer on a single gloss.
# Index the row first, then the column: this reads one example instead of
# materialising the entire 'processed_gloss' column to pick one element.
example = dataset[300]['processed_gloss']
encoding = tokenizer.encode(example)
print(f'Example: {example}')
print(encoding.tokens)

Example: is that what you saw?
['UTT_BOUNDARY', 'Ġis', 'Ġthat', 'Ġwhat', 'Ġyou', 'Ġsaw', '?']


In [9]:
# Wrap the trained BPE tokenizer for use with transformers and upload it to the Hub.
wrapped_tokenizer = GPT2TokenizerFast(
    tokenizer_object=tokenizer,
    pad_token='PAD',
    unk_token='UNK',
    bos_token='UTT_BOUNDARY',
    eos_token='UTT_BOUNDARY',
    add_prefix_space=True,
)
wrapped_tokenizer.push_to_hub("phonemetransformers/CHILDES-English-BPE-gloss-tokenizer")

No files have been modified since last commit. Skipping to prevent empty commit.


CommitInfo(commit_url='https://huggingface.co/phonemetransformers/CHILDES-English-BPE-gloss-tokenizer/commit/dc70201e9f3dc609aea522ae4df6cc435f07a55e', commit_message='Upload tokenizer', commit_description='', oid='dc70201e9f3dc609aea522ae4df6cc435f07a55e', pr_url=None, repo_url=RepoUrl('https://huggingface.co/phonemetransformers/CHILDES-English-BPE-gloss-tokenizer', endpoint='https://huggingface.co', repo_type='model', repo_id='phonemetransformers/CHILDES-English-BPE-gloss-tokenizer'), pr_revision=None, pr_num=None)

In [10]:
# Encode the example, padding or truncating to a fixed length of 20 ids.
tokenized = wrapped_tokenizer(
    example,
    padding='max_length',
    max_length=20,
    truncation=True,
    add_special_tokens=True,
)
tokenized

{'input_ids': [0, 115, 92, 95, 67, 781, 8, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]}

In [11]:
# Map the padded ids back to their token strings to inspect the padding.
wrapped_tokenizer.convert_ids_to_tokens(ids=tokenized['input_ids'])

['UTT_BOUNDARY',
 'Ġis',
 'Ġthat',
 'Ġwhat',
 'Ġyou',
 'Ġsaw',
 '?',
 'PAD',
 'PAD',
 'PAD',
 'PAD',
 'PAD',
 'PAD',
 'PAD',
 'PAD',
 'PAD',
 'PAD',
 'PAD',
 'PAD',
 'PAD']

In [12]:
# Encode a short unpadded sentence as a final smoke test.
wrapped_tokenizer(text='this is a test .')

{'input_ids': [0, 124, 115, 61, 3630, 45, 6], 'attention_mask': [1, 1, 1, 1, 1, 1, 1]}