In [1]:
%load_ext autoreload
%autoreload 2

# Train Roberta masked language model + tokenizer

Using the names from 220, train a Roberta masked language model and tokenizer

- train tokenizer first (ByteLevelBPETokenizer)
  - this does not have to be the same tokenizer we use when training the bi-encoder, but it could be
- then train a Roberta masked language model

In [28]:
import json
import os
import random

from datasets import load_dataset
import pandas as pd
import torch
from tokenizers import ByteLevelBPETokenizer
from tokenizers.processors import BertProcessing
from transformers import RobertaTokenizerFast, RobertaForMaskedLM, RobertaConfig, \
                         DataCollatorForLanguageModeling, \
                         Trainer, TrainingArguments, \
                         pipeline

from nama.data.filesystem import download_file_from_s3, upload_file_to_s3

In [27]:
#config
# TODO do for given and surname
given_surname = "given"
# given_surname = "surname"

name_sample_size = 10_000_000
cross_encoder_vocab_size = 265
tokenizer_min_frequency = 2
tokenizer_max_length = 32
roberta_attention_heads = 12
roberta_hidden_layers = 6

frequencies_path = f"s3://fs-nama-data/2024/familysearch-names/interim/tree-hr-{given_surname}-aggr-v2.parquet"

roberta_dir = f"../data/models/roberta-{given_surname}-{cross_encoder_vocab_size}"
# names path is the list of names used to train roberta
names_path = os.path.join(roberta_dir, "names_sample.txt")
roberta_dir_s3 = f"s3://fs-nama-data/2024/nama-data/data/models/roberta-{given_surname}-{cross_encoder_vocab_size}/"

In [4]:
if not os.path.exists(roberta_dir):
    os.makedirs(roberta_dir) 

In [5]:
torch.cuda.empty_cache()
print(torch.cuda.is_available())
print("cuda total", torch.cuda.get_device_properties(0).total_memory)
print("cuda reserved", torch.cuda.memory_reserved(0))
print("cuda allocated", torch.cuda.memory_allocated(0))

True
cuda total 8141471744
cuda reserved 0
cuda allocated 0


## Save name sample

In [9]:
path = download_file_from_s3(frequencies_path) if frequencies_path.startswith("s3://") else frequencies_path
counts_df = pd.read_parquet(path)
print(counts_df.shape)

(25541154, 10)


In [10]:
counts_df = counts_df[['alt_name', 'total_alt_name_frequency']].drop_duplicates()
print(counts_df.shape)

In [12]:
counts_df['total_alt_name_frequency'].sum()

np.int64(2906726951)

In [14]:
%%time
# get list of all names (shuffle takes a long time)
all_names = []
for row in counts_df.to_dict('records'):
    for _ in range(row['total_alt_name_frequency'] // 2):
        all_names.append(row['alt_name'])
random.shuffle(all_names)    
print(len(all_names))

1451132858
CPU times: user 20min 10s, sys: 13.9 s, total: 20min 24s
Wall time: 20min 27s


In [16]:
all_names = all_names[:name_sample_size]
print(len(all_names))

10000000


In [18]:
with open(names_path, 'w', encoding='utf-8') as f:
    for name in all_names:
        f.write(name + '\n')

## Train tokenizer

In [6]:
%%time 

# Initialize a tokenizer
tokenizer = ByteLevelBPETokenizer()

# Customize training
tokenizer.train(files=[names_path], 
                vocab_size=cross_encoder_vocab_size, 
                min_frequency=tokenizer_min_frequency, 
                special_tokens=[
    "<s>",
    "<pad>",
    "</s>",
    "<unk>",
    "<mask>",
])




CPU times: user 1min 2s, sys: 16.3 s, total: 1min 18s
Wall time: 7.58 s


### Save tokenizer

In [7]:
tokenizer.save_model(roberta_dir)

# need to manually create a json config file for the pipeline function to work
# copying https://zablo.net/blog/post/training-roberta-from-scratch-the-missing-guide-polish-language-model/
tokenizer_config = {
    "architectures": ["RobertaForMaskedLM"], 
    "max_position_embeddings": tokenizer_max_length+2, 
    "vocab_size": cross_encoder_vocab_size,
}
with open(os.path.join(roberta_dir, "config.json"), "w") as f:
    json.dump(tokenizer_config, f)

### Test tokenizer

In [8]:
def load_tokenizer(roberta_dir, tokenizer_max_length):
    tokenizer = ByteLevelBPETokenizer(
        os.path.join(roberta_dir, "vocab.json"),
        os.path.join(roberta_dir, "merges.txt"),
    )
    tokenizer._tokenizer.post_processor = BertProcessing(
        ("</s>", tokenizer.token_to_id("</s>")),
        ("<s>", tokenizer.token_to_id("<s>")),
    )
    tokenizer.enable_truncation(max_length=tokenizer_max_length) 
    return tokenizer

In [9]:
# load tokenizer
tokenizer = load_tokenizer(roberta_dir, tokenizer_max_length)

In [10]:
tokenizer.encode("richard")

Encoding(num_tokens=8, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])

In [11]:
tokenizer.encode("richard").tokens

['<s>', 'r', 'i', 'c', 'h', 'ar', 'd', '</s>']

## Train Roberta Model

In [12]:
tokenizer = RobertaTokenizerFast.from_pretrained(roberta_dir, max_len=tokenizer_max_length)

In [13]:
tokenizer.encode("richard")

[0, 86, 77, 71, 76, 261, 72, 2]

In [14]:
config = RobertaConfig(
    vocab_size=cross_encoder_vocab_size,
    max_position_embeddings=tokenizer_max_length+2,
    num_attention_heads=roberta_attention_heads,
    num_hidden_layers=roberta_hidden_layers,
    type_vocab_size=1,
)

In [15]:
model = RobertaForMaskedLM(config=config)

In [16]:
model.num_parameters()

43351561

### Create dataset and collator

In [17]:
def tokenize_function(examples):
    return tokenizer(examples["text"], truncation=True)

dataset = load_dataset("text", data_files=[names_path]).map(tokenize_function, batched=True)["train"]

In [18]:
len(dataset)

10000000

In [19]:
dataset[0]

{'text': 'emma',
 'input_ids': [0, 73, 81, 81, 69, 2],
 'attention_mask': [1, 1, 1, 1, 1, 1]}

In [20]:
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=True, mlm_probability=0.15, 
)

### Train model

In [21]:
training_args = TrainingArguments(
    output_dir=roberta_dir,
    overwrite_output_dir=True,
    num_train_epochs=1,
    per_device_train_batch_size=64,
    save_steps=len(dataset) / 20,
    save_total_limit=20,
    prediction_loss_only=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset,
)

In [22]:
%%time
trainer.train()

Step,Training Loss
500,2.3568
1000,1.5406
1500,1.2972
2000,1.1787
2500,1.085
3000,1.0321
3500,0.9789
4000,0.9441
4500,0.9314
5000,0.9135


CPU times: user 2h 19min 53s, sys: 26.2 s, total: 2h 20min 19s
Wall time: 2h 19min 43s


TrainOutput(global_step=156250, training_loss=0.5308848104980469, metrics={'train_runtime': 8382.9129, 'train_samples_per_second': 1192.903, 'train_steps_per_second': 18.639, 'total_flos': 2.9281782455170944e+16, 'train_loss': 0.5308848104980469, 'epoch': 1.0})

### Save model

In [23]:
trainer.save_model(roberta_dir)

### Upload model to S3

In [31]:
for filename in [
    'vocab.json',
    'merges.txt',
    'config.json',
    'training_args.bin',
    'model.safetensors',
    'names_sample.txt',
]:
    print(filename)
    upload_file_to_s3(os.path.join(roberta_dir, filename), roberta_dir_s3+filename)

vocab.json
merges.txt
config.json
training_args.bin
model.safetensors
names_sample.txt


### Test model

In [24]:
fill_mask = pipeline(
    "fill-mask",
    model=roberta_dir,
    tokenizer=roberta_dir
)

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


In [25]:
fill_mask("mari<mask>")

[{'score': 0.6878960728645325,
  'token': 69,
  'token_str': 'a',
  'sequence': 'maria'},
 {'score': 0.2919888496398926,
  'token': 73,
  'token_str': 'e',
  'sequence': 'marie'},
 {'score': 0.015600468032062054,
  'token': 262,
  'token_str': 'an',
  'sequence': 'marian'},
 {'score': 0.002008459297940135,
  'token': 83,
  'token_str': 'o',
  'sequence': 'mario'},
 {'score': 0.0005247094668447971,
  'token': 82,
  'token_str': 'n',
  'sequence': 'marin'}]