# Training a BERT-based Phonetic Model

In [None]:
# !pip install datasets
# !pip install tokenizers
# !pip install transformers

## Preprocess Data

In [1]:
import numpy as np
import pandas as pd

In [2]:
df = pd.read_csv('processed_word_list.csv')

In [3]:
df.head()

Unnamed: 0.1,Unnamed: 0,original_text,clean_text,sampa,pos,ipa
0,0,Allan,Allan,aj\an,PROPN,aʝan
1,1,Aloès,Aloès,aloes,PROPN,aloès
2,2,Alpes,Alpes,alpes,PROPN,alpes
3,3,Alpes-Pirineos,AlpesPirineos,alpespi4ineos,PROPN,alpespiɾineos
4,4,Alsacia,Alsacia,alsasia,PROPN,alsasia


In [None]:
def ngramify_entry(text, n=2):
    return (' ').join([text[i:i+n] for i in range(len(text)-n+1)])

In [None]:
### This is the tokenization format that we're trying. 

df['sampa_bigram'] = [ngramify_entry(w, 2) for w in list(df['sampa'])]
df['text'] = df['sampa_bigram'] + ' ' + df['pos']
df['text'] = df['text'].str.split()
df['text'] = [(' ').join(x[::-1]) for x in df['text']]
dff = pd.DataFrame(df['text'])
dff.to_csv('sampa_bigrams_pos.csv')

In [None]:
dff['text']

0                                PROPN an \a j\ aj
1                                PROPN es oe lo al
2                                PROPN es pe lp al
3        PROPN os eo ne in 4i i4 pi sp es pe lp al
4                          PROPN ia si as sa ls al
                           ...                    
19995               ADV te nt en me am ka ik ni un
19996                              ADJ ko ik ni un
19997                           ADJ os ko ik ni un
19998                          NOUN os 4o e4 te ut
19999                           ADJ es le il ti ut
Name: text, Length: 20000, dtype: object

## Train Tokenizer

In [None]:
### Hugging Face Dataset 

from datasets import load_dataset
dataset = load_dataset('csv', data_files='sampa_bigrams_pos.csv')
dataset = dataset['train'].remove_columns('Unnamed: 0')

In [None]:
dataset[:10]

{'text': ['PROPN an \\a j\\ aj',
  'PROPN es oe lo al',
  'PROPN es pe lp al',
  'PROPN os eo ne in 4i i4 pi sp es pe lp al',
  'PROPN ia si as sa ls al',
  'PROPN om to st ls al',
  'PROPN e4 ne en te lt al',
  'PROPN 4o e4 je tj lt al',
  'PROPN es 4e a4 ba lb al',
  'PROPN e4 me im ei se ls al']}

In [None]:
batch_size = 1000

def batch_iterator():
    for i in range(0, len(dataset), batch_size):
        yield dataset[i : i + batch_size]["text"]

In [None]:
from tokenizers import decoders, models, normalizers, pre_tokenizers, processors, trainers, Tokenizer

#tokenizer = Tokenizer(models.BPE(unl_token="[UNK]"))
tokenizer = Tokenizer(models.WordPiece(unl_token="[UNK]"))

In [None]:
tokenizer.normalizer = normalizers.BertNormalizer(clean_text = False, strip_accents = False, lowercase = False )
tokenizer.pre_tokenizer = pre_tokenizers.BertPreTokenizer()

In [None]:
special_tokens = ["[UNK]", "[PAD]", "[CLS]", "[SEP]", "[MASK]"]
trainer = trainers.WordPieceTrainer(vocab_size=12000, min_frequency=0, special_tokens=special_tokens)

In [None]:
tokenizer.train_from_iterator(batch_iterator(), trainer=trainer)

In [None]:
cls_token_id = tokenizer.token_to_id("[CLS]")
sep_token_id = tokenizer.token_to_id("[SEP]")
print(cls_token_id, sep_token_id)

2 3


In [None]:
tokenizer.post_processor = processors.TemplateProcessing(
    single=f"[CLS]:0 $A:0 [SEP]:0",
    pair=f"[CLS]:0 $A:0 [SEP]:0 $B:1 [SEP]:1",
    special_tokens=[
        ("[CLS]", cls_token_id),
        ("[SEP]", sep_token_id),
    ],
)

In [None]:
print(len(tokenizer.get_vocab()))
list(tokenizer.get_vocab().items())[:10]

527


[('iw', 452),
 ('lo', 155),
 ('4p', 306),
 ('##d', 74),
 ('4l', 175),
 ('##N', 54),
 ('dg', 470),
 ('4a', 94),
 ('ls', 322),
 ('im', 157)]

In [None]:
from huggingface_hub import notebook_login

In [None]:
#notebook_login()

Login successful
Your token has been saved to /root/.huggingface/token


In [None]:
from transformers import BertTokenizerFast

new_tokenizer = BertTokenizerFast(tokenizer_object=tokenizer)

In [None]:
new_tokenizer.push_to_hub("deep-ritmo-sampa-tokenizer", use_temp_dir=True)

CommitInfo(commit_url='https://huggingface.co/caffsean/deep-ritmo-sampa-tokenizer/commit/52b9f443f93e13a6e121f8b60e0b10f5f2d7c08b', commit_message='Upload tokenizer', commit_description='', oid='52b9f443f93e13a6e121f8b60e0b10f5f2d7c08b', pr_url=None, pr_revision=None, pr_num=None)

## Pretrain BERT Base

In [None]:
from transformers import AutoTokenizer

tokenizer_checkpoint = 'caffsean/deep-ritmo-sampa-tokenizer'
tokenizer = AutoTokenizer.from_pretrained(tokenizer_checkpoint)

Downloading:   0%|          | 0.00/247 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.64k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/11.5k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/125 [00:00<?, ?B/s]

In [None]:
def tokenize_function(examples):
    return tokenizer(examples["text"])

In [None]:
from sklearn.model_selection import train_test_split

dff = pd.read_csv('sampa_bigrams_pos.csv')
train_df, test_df = train_test_split(dff['text'])

train_df.to_csv('sampa_bigrams_pos_train.csv')
test_df.to_csv('sampa_bigrams_pos_test.csv')

In [None]:
dataset = load_dataset("csv", data_files={'train':'sampa_bigrams_pos_train.csv','validation':'sampa_bigrams_pos_test.csv'})
dataset



Downloading and preparing dataset csv/default to /root/.cache/huggingface/datasets/csv/default-146a2bf39229a72b/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317...


Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

0 tables [00:00, ? tables/s]

0 tables [00:00, ? tables/s]

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/csv/default-146a2bf39229a72b/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['Unnamed: 0', 'text'],
        num_rows: 15000
    })
    validation: Dataset({
        features: ['Unnamed: 0', 'text'],
        num_rows: 5000
    })
})

In [None]:
dataset['train'][:10]

{'Unnamed: 0': [17664,
  13616,
  10165,
  15277,
  16508,
  11282,
  8475,
  719,
  19895,
  12290],
 'text': ['NOUN jo 4j a4 la al sa',
  'VERB i4 ti nt en me',
  'ADJ ba ib ti ut lu ol bo eb',
  'PROPN ma 4m o4 fo af ta at la pl',
  'VERB an ja tj at ba eb 4e',
  'NOUN te nt an wa gw',
  'NOUN as da ad ta lt ul ku ik fi if di',
  'PROPN en ne in ei le sl os lo ul ou po op ko ak 4a t4 it mi im di',
  'PROPN os mo am ba',
  'ADV te nt en me am ja 4j a4 sa es se es ne nn in']}

In [None]:

tokenized_datasets = dataset.map(tokenize_function, batched=True, num_proc=4)

In [None]:
tokenized_datasets = tokenized_datasets.remove_columns(['Unnamed: 0','text'])

In [None]:
from transformers import Trainer, TrainingArguments
from transformers import AutoConfig, AutoModelForMaskedLM

model_checkpoint = "bert-base-cased"

config = AutoConfig.from_pretrained(model_checkpoint)
model = AutoModelForMaskedLM.from_config(config)

training_args = TrainingArguments(
    "test-clm",
    evaluation_strategy = "epoch",
    learning_rate=2e-5,
    weight_decay=0.01,
    push_to_hub=True,
    push_to_hub_model_id=f"{model_checkpoint}-deep-ritmo-sampa",
)

In [None]:
from transformers import DataCollatorForLanguageModeling
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15)

In [None]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
)

In [None]:
trainer.train()

***** Running training *****
  Num examples = 15000
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 5625


Epoch,Training Loss,Validation Loss
1,3.4042,3.060982
2,2.8648,2.629833
3,2.6572,2.554962


Saving model checkpoint to test-clm/checkpoint-500
Configuration saved in test-clm/checkpoint-500/config.json
Model weights saved in test-clm/checkpoint-500/pytorch_model.bin
Several commits (2) will be pushed upstream.
Saving model checkpoint to test-clm/checkpoint-1000
Configuration saved in test-clm/checkpoint-1000/config.json
Model weights saved in test-clm/checkpoint-1000/pytorch_model.bin
Saving model checkpoint to test-clm/checkpoint-1500
Configuration saved in test-clm/checkpoint-1500/config.json
Model weights saved in test-clm/checkpoint-1500/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 5000
  Batch size = 8
Saving model checkpoint to test-clm/checkpoint-2000
Configuration saved in test-clm/checkpoint-2000/config.json
Model weights saved in test-clm/checkpoint-2000/pytorch_model.bin
Saving model checkpoint to test-clm/checkpoint-2500
Configuration saved in test-clm/checkpoint-2500/config.json
Model weights saved in test-clm/checkpoint-2500/pytorch_model.bi

TrainOutput(global_step=5625, training_loss=3.041832280815972, metrics={'train_runtime': 686.042, 'train_samples_per_second': 65.594, 'train_steps_per_second': 8.199, 'total_flos': 416485410184896.0, 'train_loss': 3.041832280815972, 'epoch': 3.0})

In [None]:
trainer.push_to_hub()

Saving model checkpoint to test-clm
Configuration saved in test-clm/config.json
Model weights saved in test-clm/pytorch_model.bin
Several commits (3) will be pushed upstream.
The progress bars may be unreliable.


Upload file pytorch_model.bin:   0%|          | 3.34k/413M [00:00<?, ?B/s]

Upload file runs/Oct28_05-25-29_147412fcac43/events.out.tfevents.1666934917.147412fcac43.76.0:  32%|###2      …

remote: Scanning LFS files for validity, may be slow...        
remote: LFS file scan complete.        
To https://huggingface.co/caffsean/bert-base-cased-deep-ritmo-sampa
   8eee22d..cb8dab1  main -> main

remote: LFS file scan complete.        
To https://huggingface.co/caffsean/bert-base-cased-deep-ritmo-sampa
   8eee22d..cb8dab1  main -> main

Dropping the following result as it does not have all the necessary fields:
{'task': {'name': 'Masked Language Modeling', 'type': 'fill-mask'}}
To https://huggingface.co/caffsean/bert-base-cased-deep-ritmo-sampa
   cb8dab1..0d6ae2a  main -> main

   cb8dab1..0d6ae2a  main -> main



'https://huggingface.co/caffsean/bert-base-cased-deep-ritmo-sampa/commit/cb8dab147c5bb805a7efd7faaed9324f87d8f80d'