Ref:

https://colab.research.google.com/github/huggingface/blog/blob/master/notebooks/01_how_to_train.ipynb#scrollTo=LTXXutqeDzPi

In [None]:
!pip install transformers datasets

Collecting transformers
  Downloading transformers-4.13.0-py3-none-any.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 5.0 MB/s 
[?25hCollecting datasets
  Downloading datasets-1.16.1-py3-none-any.whl (298 kB)
[K     |████████████████████████████████| 298 kB 72.8 MB/s 
[?25hCollecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 64.7 MB/s 
Collecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 57.6 MB/s 
[?25hCollecting sacremoses
  Downloading sacremoses-0.0.46-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 56.4 MB/s 
[?25hCollecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.2.1-py3-none-any.whl (61 kB)
[K 

In [None]:
import os

import torch
from datasets import load_dataset, concatenate_datasets
from transformers import (
    BertTokenizer,
    BertTokenizerFast,
    BertConfig,
    BertForMaskedLM,
    Trainer,
    TrainingArguments,
    LineByLineTextDataset,
    DataCollatorForLanguageModeling,
    pipeline,
)


In [None]:
# CONFIGS
# Every tunable value of the notebook is collected in this cell.
# The "DATESET" spelling in some names is kept as-is because other cells
# refer to these exact identifiers.

# Seed for anything stochastic.
RANDOM_SEED = 37

# Upper bound on the number of training rows exported to text
# (the test export uses a quarter of this value).
DATASET_LIMIT = 300_000

# Maximum sequence length, in tokens.
MODEL_MAX_LEN = 512

# Text files produced / consumed by the notebook.
NSP_DATESET_PATH = "nsp.txt"
MLM_TRAIN_DATESET_PATH = "mlm_train.txt"
MLM_TEST_DATESET_PATH = "mlm_test.txt"

# Masked-language-modeling settings.
MLM_MASKING_PROB = 0.15
MLM_EPOCHS = 5

# Hub id of the pretrained checkpoint whose tokenizer is reused.
MODEL_NAME = "bert-base-uncased"
# MODEL_NAME = "bert-base-multilingual-uncased"

# Filesystem-friendly name derived from MODEL_NAME: dashes become underscores.
MODEL_SAVE_PATH = "_".join(MODEL_NAME.split("-")) + "_wiki"

## Load Dataset

In [None]:
# Load the "train" split of the Wikipedia dataset, config "20200501.en".
wiki = load_dataset("wikipedia", "20200501.en", split="train")

# Disabled alternative: additionally load BookCorpus, drop Wikipedia's extra
# "title" column so both datasets share the same columns, and concatenate them.
# bookcorpus = load_dataset("bookcorpus", split="train")
# print(wiki.column_names, bookcorpus.column_names)
# # ['title', 'text'] ['text']

# wiki.remove_columns_("title")
# bert_dataset = concatenate_datasets([wiki, bookcorpus])


# Disabled alternative corpus:
# dataset = load_dataset("cc_news", split="train")

# Only Wikipedia is used by the rest of the notebook.
bert_dataset = wiki

Downloading:   0%|          | 0.00/4.24k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/6.87k [00:00<?, ?B/s]

Downloading and preparing dataset wikipedia/20200501.en (download: 16.99 GiB, generated: 17.07 GiB, post-processed: Unknown size, total: 34.06 GiB) to /root/.cache/huggingface/datasets/wikipedia/20200501.en/1.0.0/009f923d9b6dd00c00c8cdc7f408f2b47f45dd4f5fb7982a21f9448f4afbe475...


Downloading:   0%|          | 0.00/14.6k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/18.3G [00:00<?, ?B/s]

Dataset wikipedia downloaded and prepared to /root/.cache/huggingface/datasets/wikipedia/20200501.en/1.0.0/009f923d9b6dd00c00c8cdc7f408f2b47f45dd4f5fb7982a21f9448f4afbe475. Subsequent calls will reuse this data.


In [None]:
# Display the dataset summary (column names and number of rows).
bert_dataset

Dataset({
    features: ['title', 'text'],
    num_rows: 6078422
})

In [None]:
# Split the dataset into training (90%) and testing (10%).
# The shuffle is seeded with RANDOM_SEED so that re-running the notebook
# produces the same split (and therefore the same exported text files).
dataset_split = bert_dataset.train_test_split(test_size=0.1, seed=RANDOM_SEED)

In [None]:
# if you want to train the tokenizer from scratch (especially if you have custom
# dataset loaded as datasets object), then run this cell to save it as files
# but if you already have your custom data as text files, there is no point using this

def dataset_to_text(dataset, output_filename="data.txt"):
    """Write the ``"text"`` entries of ``dataset`` to disk, one record per line.

    Useful for producing plain-text files for tools that read a corpus
    line by line (tokenizer training, line-based datasets).

    Args:
        dataset: Any object whose ``dataset["text"]`` yields an iterable of
            strings (for example a dict of lists).
        output_filename: Path of the text file to create or overwrite.

    Each record is flattened onto a single line. Runs of whitespace,
    including every kind of line break inside the record, are collapsed to
    one space so that words on either side of a newline are not fused
    together. The file is always written as UTF-8, independent of the
    platform's default encoding.
    """
    with open(output_filename, "w", encoding="utf-8") as f:
        for text in dataset["text"]:
            # split() with no argument breaks on any whitespace, so no stray
            # line separator can survive and start a new line in the output.
            f.write(" ".join(text.split()) + "\n")

In [None]:
# Export the first rows of each split to its text file: up to DATASET_LIMIT
# rows of the training split and a quarter of that for the test split.
for split_name, row_limit, text_path in (
    ("train", DATASET_LIMIT, MLM_TRAIN_DATESET_PATH),
    ("test", DATASET_LIMIT // 4, MLM_TEST_DATESET_PATH),
):
    dataset_to_text(dataset_split[split_name][:row_limit], text_path)

In [None]:
# with open(MLM_TRAIN_DATESET_PATH, 'r') as f:
#     text = f.read()
#     lines = text.split('\n')
#     print(lines[2])

## Tokenizer

In [None]:
# Load the pretrained fast tokenizer that belongs to MODEL_NAME.
# The maximum length comes from the config cell and is passed through the
# documented `model_max_length` argument rather than the legacy `max_len` alias.
# Slow-tokenizer alternative:
# bert_tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)

bert_tokenizer = BertTokenizerFast.from_pretrained(MODEL_NAME, model_max_length=MODEL_MAX_LEN)

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [None]:
# Sanity check: encode a sample sentence and display the resulting token ids.
bert_tokenizer.encode("This is a test.")

[101, 2023, 2003, 1037, 3231, 1012, 102]

## Build Dataset For Training

In [None]:
# Build the training dataset from the exported training text file, tokenized
# with the tokenizer loaded above.
# NOTE(review): block_size=128 is a hard-coded value that is smaller than
# MODEL_MAX_LEN (512); presumably chosen to keep training cheap — confirm.
# NOTE(review): the exported test file (MLM_TEST_DATESET_PATH) is not loaded
# anywhere in the visible notebook.
dataset = LineByLineTextDataset(
    tokenizer=bert_tokenizer,
    file_path=MLM_TRAIN_DATESET_PATH,
    block_size=128,
)



In [None]:
# Collator for masked-language-modeling batches; the masking probability is
# taken from the config cell.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=bert_tokenizer,
    mlm=True,
    mlm_probability=MLM_MASKING_PROB,
)

## Train

In [None]:
# Check that PyTorch can see a CUDA device (displays True when one is available).
torch.cuda.is_available()

True

In [None]:
# Seed torch before building the model so that the random weight
# initialisation is the same on every run.
torch.manual_seed(RANDOM_SEED)

# A reduced BERT (4 hidden layers, 4 attention heads) with freshly initialised weights.
config = BertConfig(
    # Keep the vocabulary size and position limit in step with the tokenizer
    # and the config cell instead of relying on BertConfig's defaults, so the
    # model still matches if MODEL_NAME is switched to another checkpoint.
    vocab_size=len(bert_tokenizer),
    max_position_embeddings=MODEL_MAX_LEN,
    num_hidden_layers=4,
    num_attention_heads=4,
)
model = BertForMaskedLM(config=config)

In [None]:
training_args = TrainingArguments(
    # Checkpoints go to a directory that is NOT named like the hub model id.
    # A local "bert-base-uncased" folder would be treated as a local path by
    # later from_pretrained(MODEL_NAME) calls and break a re-run of the notebook.
    output_dir=MODEL_SAVE_PATH,
    overwrite_output_dir=True,
    num_train_epochs=MLM_EPOCHS,
    # per_device_* is the current name of the deprecated per_gpu_* argument.
    per_device_train_batch_size=64,
    save_steps=10_000,
    save_total_limit=2,
    prediction_loss_only=True,
    # Seed the Trainer from the config cell so that runs are repeatable.
    seed=RANDOM_SEED,
)

# Wire the model, the masked-LM collator and the training dataset together.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset,
)

In [None]:
%%time
# Run the training loop; the %%time cell magic reports how long the cell took.
trainer.train()

In [None]:
# Save the trained model to the directory that the fill-mask pipeline loads from.
# NOTE(review): this literal differs from MODEL_SAVE_PATH ("bert_base_uncased_wiki")
# defined in the config cell — confirm which name is intended before unifying them.
trainer.save_model('bert_uncased_based_wiki')

In [None]:
# Fill-mask inference pipeline over the model directory written by
# trainer.save_model(...), reusing the tokenizer loaded earlier.
# `pipeline` is imported together with the other transformers names in the
# import cell at the top of the notebook.
fill_mask = pipeline(
    "fill-mask",
    model="./bert_uncased_based_wiki",
    tokenizer=bert_tokenizer,
)

In [None]:
# Ask the model for its most likely replacements of the [MASK] token;
# the displayed result lists each candidate with its score.

fill_mask("I want to go to the [MASK].")

[{'score': 0.026855848729610443,
  'sequence': 'i want to go to the name.',
  'token': 2171,
  'token_str': 'name'},
 {'score': 0.012670793570578098,
  'sequence': 'i want to go to the war.',
  'token': 2162,
  'token_str': 'war'},
 {'score': 0.010863966308534145,
  'sequence': 'i want to go to the time.',
  'token': 2051,
  'token_str': 'time'},
 {'score': 0.010574453510344028,
  'sequence': 'i want to go to the people.',
  'token': 2111,
  'token_str': 'people'},
 {'score': 0.008563429117202759,
  'sequence': 'i want to go to the family.',
  'token': 2155,
  'token_str': 'family'}]