In [None]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported modules
# are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Train Roberta masked language model + tokenizer

Using the names from 220, train a RoBERTa tokenizer and a RoBERTa masked language model.

- Train the tokenizer first (`ByteLevelBPETokenizer`).
  - This does not have to be the same tokenizer we use when training the bi-encoder, but it could be.
- Then train a RoBERTa masked language model.

In [None]:
import json
import os

from datasets import load_dataset, Dataset
import torch
from tokenizers import ByteLevelBPETokenizer
from tokenizers.processors import BertProcessing
from transformers import RobertaTokenizerFast, RobertaForMaskedLM, RobertaConfig, \
                         PreTrainedTokenizer, DataCollatorForLanguageModeling, \
                         Trainer, TrainingArguments, \
                         pipeline
from tqdm import tqdm

In [None]:
# Name type to train on; interpolated into the data file names and the model directory below.
given_surname = 'given'

# Tokenizer settings: target vocabulary size and minimum pair frequency used when training
# the BPE tokenizer, and the maximum sequence length used for truncation.
tokenizer_vocab_size = 265
tokenizer_min_frequency = 2
tokenizer_max_length = 32
# Model architecture settings passed to the RoBERTa configuration.
roberta_attention_heads = 12
roberta_hidden_layers = 6

# Text files used to train both the tokenizer and the language model.
# The commented-out entries are alternative inputs.
name_paths = [
#     f"../data/processed/all-tree-preferred-{given_surname}-sample-1m.txt",
    f"../data/processed/all-tree-hr-{given_surname}-sample-10m.txt",
#     f"../data/processed/all-tree-preferred-{given_surname}.txt",
#     f"../data/processed/all-tree-hr-{given_surname}.txt",
]

# Output directory for the tokenizer files, checkpoints and final model.
# NOTE(review): the "10m" tag is hard-coded here; keep it in sync with the file selected in name_paths.
roberta_dir = f"../data/models/roberta-{given_surname}-10m-{tokenizer_vocab_size}"

In [None]:
# Create the output directory (including missing parents). exist_ok=True makes the cell
# safe to re-run and avoids the check-then-create race of a separate os.path.exists() test.
os.makedirs(roberta_dir, exist_ok=True)

In [None]:
# Report GPU status. Device-specific queries are only made when CUDA is present, because
# torch.cuda.get_device_properties(0) raises on a machine without a usable GPU.
cuda_available = torch.cuda.is_available()
print(cuda_available)
if cuda_available:
    # Release cached allocator blocks first so the figures below reflect a clean start.
    torch.cuda.empty_cache()
    print("cuda total", torch.cuda.get_device_properties(0).total_memory)
    print("cuda reserved", torch.cuda.memory_reserved(0))
    print("cuda allocated", torch.cuda.memory_allocated(0))

## Train tokenizer

In [None]:
%%time 

# Start from an untrained byte-level BPE tokenizer.
tokenizer = ByteLevelBPETokenizer()

# Learn the vocabulary from the name files, using the size and frequency
# settings from the configuration cell and the RoBERTa special tokens.
tokenizer.train(
    files=name_paths,
    vocab_size=tokenizer_vocab_size,
    min_frequency=tokenizer_min_frequency,
    special_tokens=["<s>", "<pad>", "</s>", "<unk>", "<mask>"],
)

### Save tokenizer

In [None]:
# Persist the trained tokenizer files into the model directory.
tokenizer.save_model(roberta_dir)

# The pipeline function needs a json config file next to the tokenizer files, so write one by hand.
# Approach copied from
# https://zablo.net/blog/post/training-roberta-from-scratch-the-missing-guide-polish-language-model/
tokenizer_config = {
    "architectures": ["RobertaForMaskedLM"],
    "max_position_embeddings": tokenizer_max_length + 2,
    "vocab_size": tokenizer_vocab_size,
}
with open(os.path.join(roberta_dir, "config.json"), "w") as config_file:
    config_file.write(json.dumps(tokenizer_config))

### Test tokenizer

In [None]:
def load_tokenizer(roberta_dir, tokenizer_max_length):
    """Load the saved byte-level BPE tokenizer from ``roberta_dir``.

    Reads ``vocab.json`` and ``merges.txt`` from the directory, installs a
    post-processor built from the ``</s>`` and ``<s>`` tokens, and enables
    truncation at ``tokenizer_max_length``.

    Args:
        roberta_dir: directory containing ``vocab.json`` and ``merges.txt``.
        tokenizer_max_length: maximum length passed to ``enable_truncation``.

    Returns:
        The configured ``ByteLevelBPETokenizer``.
    """
    vocab_path = os.path.join(roberta_dir, "vocab.json")
    merges_path = os.path.join(roberta_dir, "merges.txt")
    bpe = ByteLevelBPETokenizer(vocab_path, merges_path)

    # Look up the ids of the end and start markers and hand them to the post-processor.
    end_marker = ("</s>", bpe.token_to_id("</s>"))
    start_marker = ("<s>", bpe.token_to_id("<s>"))
    bpe._tokenizer.post_processor = BertProcessing(end_marker, start_marker)

    bpe.enable_truncation(max_length=tokenizer_max_length)
    return bpe

In [None]:
# Load the tokenizer back from the files saved in roberta_dir, with truncation at
# tokenizer_max_length; this rebinds `tokenizer`.
tokenizer = load_tokenizer(roberta_dir, tokenizer_max_length)

In [None]:
# Encode a sample name to inspect the resulting encoding object.
tokenizer.encode("richard")

In [None]:
# Show the token strings produced for the sample name.
tokenizer.encode("richard").tokens

## Train Roberta Model

In [None]:
# Reload the saved tokenizer files through the transformers tokenizer class, which is what the
# dataset tokenization and the data collator below use. `model_max_length` is the documented
# name for the length limit; `max_len` is only a legacy alias.
tokenizer = RobertaTokenizerFast.from_pretrained(roberta_dir, model_max_length=tokenizer_max_length)

In [None]:
# Sanity check: encode the same sample name with the transformers tokenizer.
tokenizer.encode("richard")

In [None]:
# Model configuration; vocabulary size and position count mirror the values written
# to config.json when the tokenizer was saved.
# NOTE(review): the +2 on max_position_embeddings presumably covers RoBERTa's position-id
# offset past the padding index — confirm.
config = RobertaConfig(
    vocab_size=tokenizer_vocab_size,
    max_position_embeddings=tokenizer_max_length+2,
    num_attention_heads=roberta_attention_heads,
    num_hidden_layers=roberta_hidden_layers,
    type_vocab_size=1,
)

In [None]:
# Build the masked-language model from the configuration (constructed directly, not via from_pretrained).
model = RobertaForMaskedLM(config=config)

In [None]:
# Display the model's parameter count.
model.num_parameters()

### Create dataset and collator

In [None]:
def tokenize_function(examples):
    """Tokenize a batch of text lines for masked-language-model training.

    Args:
        examples: a batch from the "text" dataset; ``examples["text"]`` holds the lines.

    Returns:
        The tokenizer output for the batch, truncated to ``tokenizer_max_length``.
    """
    # Pass max_length explicitly so truncation never depends on the tokenizer having
    # picked up a length limit when it was loaded; over-long inputs would otherwise
    # exceed the model's position embeddings.
    return tokenizer(examples["text"], truncation=True, max_length=tokenizer_max_length)

# Load every line of the name files as one example, tokenize in batches, and keep the train split.
dataset = (
    load_dataset("text", data_files=name_paths)
    .map(tokenize_function, batched=True)["train"]
)

In [None]:
# Number of training examples.
len(dataset)

In [None]:
# Inspect the first tokenized example.
dataset[0]

In [None]:
# Collator that prepares masked-language-model batches with the tokenizer loaded above;
# mlm_probability=0.15 is the masking rate.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=True, mlm_probability=0.15, 
)

### Train model

In [None]:
# Checkpoint schedule. `save_steps` is counted in training steps (batches), not examples, and
# has to be an integer. Dividing the example count by 20 directly gives a float that is larger
# than the number of steps in the single epoch, so no intermediate checkpoint would be written.
# Derive the interval from the batch size so roughly `num_checkpoints` are saved per epoch.
train_batch_size = 64
num_checkpoints = 20
# NOTE(review): assumes a single device; with several GPUs the effective batch is larger and
# fewer checkpoints are written.
save_steps = max(1, len(dataset) // (train_batch_size * num_checkpoints))

training_args = TrainingArguments(
    output_dir=roberta_dir,
    overwrite_output_dir=True,
    num_train_epochs=1,
    per_device_train_batch_size=train_batch_size,
    save_steps=save_steps,
    save_total_limit=num_checkpoints,
    prediction_loss_only=True,
)

# The Trainer ties together the model, the arguments above, the MLM collator and the tokenized dataset.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset,
)

In [None]:
%%time
# Run training; the %%time magic on this cell reports how long it takes.
trainer.train()

### Save model

In [None]:
# Write the trained model into roberta_dir, next to the tokenizer files saved earlier.
# NOTE(review): presumably this replaces the hand-written config.json in that directory — confirm.
trainer.save_model(roberta_dir)

### Test model

In [None]:
# Build a fill-mask pipeline that loads both the model and the tokenizer from roberta_dir.
fill_mask = pipeline(
    "fill-mask",
    model=roberta_dir,
    tokenizer=roberta_dir
)

In [None]:
# Smoke test: ask the model to fill in the masked position after the prefix "mari".
fill_mask("mari<mask>")