This notebook is based on the following source notebook: https://colab.research.google.com/github/huggingface/blog/blob/master/notebooks/01_how_to_train.ipynb#scrollTo=EIS-irI0f32P

There is another tutorial on pre-training BERT, which I have only skimmed so far: https://d2l.ai/chapter_natural-language-processing-pretraining/bert-pretraining.html

In [1]:
!wget -c https://cdn-datasets.huggingface.co/EsperBERTo/data/oscar.eo.txt

--2020-06-18 01:15:23--  https://cdn-datasets.huggingface.co/EsperBERTo/data/oscar.eo.txt
Resolving cdn-datasets.huggingface.co (cdn-datasets.huggingface.co)... 13.249.87.37, 13.249.87.74, 13.249.87.7, ...
Connecting to cdn-datasets.huggingface.co (cdn-datasets.huggingface.co)|13.249.87.37|:443... connected.
HTTP request sent, awaiting response... 416 Requested Range Not Satisfiable

    The file is already fully retrieved; nothing to do.



In [2]:
# We won't need TensorFlow here

# Install `transformers` from master

# !pip install git+https://github.com/huggingface/transformers

!pip install transformers tokenizers
!pip list | grep -E 'transformers|tokenizers'
# transformers version at notebook update --- 2.9.1
# tokenizers version at notebook update --- 0.7.0

Defaulting to user installation because normal site-packages is not writeable
You should consider upgrading via the '/usr/local/anaconda/bin/python -m pip install --upgrade pip' command.[0m
tokenizers                         0.7.0    
transformers                       2.11.0   
You should consider upgrading via the '/usr/local/anaconda/bin/python -m pip install --upgrade pip' command.[0m


In [3]:
%%time 
from pathlib import Path

from tokenizers import ByteLevelBPETokenizer

import os

# Train a byte-level BPE tokenizer, but only when its output files are missing.
# The guard looks for the saved files rather than for the directory, so a run
# that failed after the directory was created no longer blocks retraining.
TOKENIZER_DIR = 'EsperBERTo'
tokenizer_files = [
    os.path.join(TOKENIZER_DIR, name) for name in ('vocab.json', 'merges.txt')
]

if not all(os.path.exists(f) for f in tokenizer_files):
    os.makedirs(TOKENIZER_DIR, exist_ok=True)

    # Initialize a tokenizer
    tokenizer = ByteLevelBPETokenizer()

    # TODO: check whether tokenizer training can make use of a GPU.
    # Every .txt file below the working directory is used as training text,
    # except files inside the output directory (e.g. a leftover merges.txt).
    paths = [
        str(x) for x in Path(".").glob("**/*.txt")
        if TOKENIZER_DIR not in x.parts
    ]
    if not paths:
        raise FileNotFoundError(
            "No .txt training files found below the current directory."
        )

    # Customize training
    tokenizer.train(files=paths, vocab_size=52_000, min_frequency=2, special_tokens=[
        "<s>",
        "<pad>",
        "</s>",
        "<unk>",
        "<mask>",
    ])

    # Newer `tokenizers` releases write vocab.json / merges.txt through
    # `save_model(directory)`, and their `save()` expects a file path instead.
    # Older releases (such as 0.7.0) only offer `save(directory)`.
    if hasattr(tokenizer, "save_model"):
        tokenizer.save_model(TOKENIZER_DIR)
    else:
        tokenizer.save(TOKENIZER_DIR)



CPU times: user 2.7 ms, sys: 4.43 ms, total: 7.12 ms
Wall time: 5.78 ms


In [4]:
from tokenizers.implementations import ByteLevelBPETokenizer
from tokenizers.processors import BertProcessing


# Rebuild the tokenizer from the vocabulary and merges files stored on disk.
vocab_file = "./EsperBERTo/vocab.json"
merges_file = "./EsperBERTo/merges.txt"
tokenizer = ByteLevelBPETokenizer(vocab_file, merges_file)

In [5]:
# Attach a BERT-style post-processor built from the ids of "</s>" and "<s>",
# then enable truncation at 512 tokens.
end_token = ("</s>", tokenizer.token_to_id("</s>"))
start_token = ("<s>", tokenizer.token_to_id("<s>"))
tokenizer._tokenizer.post_processor = BertProcessing(end_token, start_token)
tokenizer.enable_truncation(max_length=512)

In [6]:
# !pip install -U numpy

In [7]:
import os  # imported here so this cell does not rely on an earlier cell

# Restrict the GPUs visible to this process before `torch` is imported, so the
# device mask is certainly in place by the time CUDA is initialised. Limiting
# the visible devices is an attempt to avoid out-of-memory errors.
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"   # see issue #152
os.environ["CUDA_VISIBLE_DEVICES"] = "1,2,3,4,5"

import torch

In [8]:
from transformers import RobertaConfig

# Model hyper-parameters; `vocab_size` matches the 52_000 used when the
# tokenizer was trained.
roberta_settings = dict(
    vocab_size=52_000,
    max_position_embeddings=514,
    num_attention_heads=12,
    num_hidden_layers=6,
    type_vocab_size=1,
)
config = RobertaConfig(**roberta_settings)

In [9]:
from transformers import RobertaTokenizerFast

# Load the tokenizer saved in ./EsperBERTo through the transformers wrapper.
# From this cell onwards `tokenizer` refers to the RobertaTokenizerFast instance.
load_options = {"model_max_length": 512}
tokenizer = RobertaTokenizerFast.from_pretrained("./EsperBERTo", **load_options)

In [10]:
from transformers import RobertaForMaskedLM

# The model is instantiated from `config` alone; no checkpoint is loaded here.
model = RobertaForMaskedLM(config=config)
parameter_count = model.num_parameters()
print(f'The Bert model contains {parameter_count} parameters.')

The Bert model contains 84095008 parameters.


In [11]:
%%time
from transformers import DataCollatorForLanguageModeling, LineByLineTextDataset

# Dataset read from the corpus file and encoded with `tokenizer`.
dataset = LineByLineTextDataset(tokenizer=tokenizer, file_path="./oscar.eo.txt", block_size=64)

# Collator set up for masked-language modelling with a masking probability of 0.15.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=True, mlm_probability=0.15)

CPU times: user 8min 52s, sys: 36.3 s, total: 9min 29s
Wall time: 39.9 s


## Initialize the Trainer.

In [12]:
from transformers import Trainer, TrainingArguments


# Training hyper-parameters. Output is written to ./EsperBERTo, the directory
# that also holds the tokenizer files.
training_args = TrainingArguments(
    output_dir="./EsperBERTo",
    overwrite_output_dir=True,
    num_train_epochs=1,
    per_device_train_batch_size=32,
    save_steps=10_000,
    save_total_limit=2,
)

# Trainer is a wrapper around:
#   - model:                the network to train
#   - args:                 the training hyper-parameters defined above
#   - train_dataset:        provides the individual training examples
#   - data_collator:        merges a list of examples from the dataset into one
#                           batch (this one is configured for masked-LM)
#   - prediction_loss_only: passed through to the Trainer unchanged
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset,
    prediction_loss_only=True,
)

In [14]:
%%time
# Run training; the returned TrainOutput is displayed as the cell result.
trainer.train()

HBox(children=(FloatProgress(value=0.0, description='Epoch', max=1.0, style=ProgressStyle(description_width='i…

HBox(children=(FloatProgress(value=0.0, description='Iteration', max=6091.0, style=ProgressStyle(description_w…



{"loss": 7.458342568397522, "learning_rate": 4.589558364800526e-05, "epoch": 0.08208832703989492, "step": 500}
{"loss": 6.836832889556884, "learning_rate": 4.179116729601051e-05, "epoch": 0.16417665407978985, "step": 1000}
{"loss": 6.644652894973754, "learning_rate": 3.768675094401576e-05, "epoch": 0.24626498111968478, "step": 1500}
{"loss": 6.499795407295227, "learning_rate": 3.358233459202101e-05, "epoch": 0.3283533081595797, "step": 2000}
{"loss": 6.344393545150757, "learning_rate": 2.947791824002627e-05, "epoch": 0.41044163519947463, "step": 2500}
{"loss": 6.118794407844543, "learning_rate": 2.5373501888031527e-05, "epoch": 0.49252996223936957, "step": 3000}
{"loss": 5.912473185539246, "learning_rate": 2.1269085536036776e-05, "epoch": 0.5746182892792645, "step": 3500}
{"loss": 5.7373830652236935, "learning_rate": 1.7164669184042028e-05, "epoch": 0.6567066163191594, "step": 4000}
{"loss": 5.6290211706161495, "learning_rate": 1.3060252832047284e-05, "epoch": 0.7387949433590544, "step

TrainOutput(global_step=6091, training_loss=6.128779335848668)

In [16]:
# Save the trained model into ./EsperBERTo, next to the tokenizer files.
trainer.save_model("./EsperBERTo")

In [17]:
from transformers import pipeline

# Fill-mask pipeline; model and tokenizer are both loaded from ./EsperBERTo.
fill_mask = pipeline("fill-mask", model="./EsperBERTo", tokenizer="./EsperBERTo")

In [18]:
# Prompt in English: "The sun <mask>."
# The cell displays the candidate completions returned by the pipeline.

fill_mask("La suno <mask>.")

[{'sequence': '<s> La suno estas.</s>',
  'score': 0.0185850802809,
  'token': 316},
 {'sequence': '<s> La suno estis.</s>',
  'score': 0.014196408912539482,
  'token': 394},
 {'sequence': '<s> La suno de.</s>',
  'score': 0.01179225742816925,
  'token': 274},
 {'sequence': '<s> La suno urbo.</s>',
  'score': 0.004567708354443312,
  'token': 871},
 {'sequence': '<s> La suno tago.</s>',
  'score': 0.0030458723194897175,
  'token': 1633}]

In [19]:
# Prompt in English: "This is the beginning of a beautiful <mask>."
# The cell displays the candidate completions returned by the pipeline.

fill_mask("Jen la komenco de bela <mask>.")

[{'sequence': '<s> Jen la komenco de bela mondo.</s>',
  'score': 0.009763634763658047,
  'token': 945},
 {'sequence': '<s> Jen la komenco de bela lingvo.</s>',
  'score': 0.00913243368268013,
  'token': 697},
 {'sequence': '<s> Jen la komenco de bela vivo.</s>',
  'score': 0.006484068930149078,
  'token': 1160},
 {'sequence': '<s> Jen la komenco de bela Esperanto.</s>',
  'score': 0.00600389065220952,
  'token': 540},
 {'sequence': '<s> Jen la komenco de bela lando.</s>',
  'score': 0.005966768134385347,
  'token': 1076}]