# BERT model pretraining using Hugging Face Transformers

## Dataset: Wikipedia & BookCorpus

In [2]:
from datasets import concatenate_datasets, load_dataset

# Pull the training split of each corpus.
bookcorpus = load_dataset("bookcorpus", split="train")
wiki = load_dataset("wikipedia", "20220301.en", split="train")

# Keep only the 'text' column of the Wikipedia split so that both corpora
# share the same single column before they are concatenated.
non_text_columns = [name for name in wiki.column_names if name != "text"]
wiki = wiki.remove_columns(non_text_columns)

dataset = concatenate_datasets([bookcorpus, wiki])

In [3]:
# Sanity check: print the combined dataset's summary (column names and row count).
print(dataset)

Dataset({
    features: ['text'],
    num_rows: 80462898
})


In [13]:
# save dataset to text
def dataset_to_text(dataset, output_filename="data.txt"):
    """Write every entry of ``dataset['text']`` to a text file.

    Each entry is written with ``print``, i.e. followed by a newline.

    Args:
        dataset: Object whose ``'text'`` item is an iterable of strings.
        output_filename: Destination path. Missing parent directories are
            created; an existing file is overwritten.
    """
    import os  # local import so this cell does not rely on a later cell's imports

    parent_dir = os.path.dirname(output_filename)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    # Explicit UTF-8: the platform default encoding may be unable to represent
    # some characters in the corpus.
    with open(output_filename, "w", encoding="utf-8") as f:
        for t in dataset["text"]:
            print(t, file=f)

# Dump the combined corpus to a plain-text file on local disk.
dataset_to_text(dataset, "data/data.txt")

In [4]:
# Split into train and test sets (80% train, 20% test).
# A fixed seed keeps the shuffle, and therefore both subsets, identical across
# kernel restarts; without it every run produces a different split.
train_test_split = dataset.train_test_split(test_size=0.2, seed=42)

# Access the train and test sets
train_dataset = train_test_split['train']
test_dataset = train_test_split['test']

# Print dataset info
print(f"Train dataset size: {len(train_dataset)}")
print(f"Test dataset size: {len(test_dataset)}")


Train dataset size: 64370318
Test dataset size: 16092580


In [5]:
# Shared settings used by the tokenizer, the preprocessing and the model.
vocab_size = 30_522  # vocabulary size passed to the tokenizer trainer and to BertConfig
max_length = 512  # max len of input sequence
truncate_longer_samples = False  # True: truncate/pad each sample to max_length; False: tokenize untruncated and regroup into max_length chunks

In [6]:
# Sanity check: print the training split's summary (column names and row count).
print(train_dataset)

Dataset({
    features: ['text'],
    num_rows: 64370318
})


## Train tokenizer

In [19]:
from tokenizers import BertWordPieceTokenizer
import os
import json

special_tokens = [
    "[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]", "<S>", "<T>"
]
# Train on the corpus dump produced by dataset_to_text(dataset, "data/data.txt").
files = ["data/data.txt"]  # or ["train.txt", "test.txt"]

tokenizer = BertWordPieceTokenizer()

# train tokenizer
tokenizer.train(files=files, vocab_size=vocab_size, special_tokens=special_tokens)

tokenizer.enable_truncation(max_length=max_length)

# save tokenizer
# Must be the directory that the "Load tokenizer" section reads from.
model_path = "pretrained-bert"
os.makedirs(model_path, exist_ok=True)  # also succeeds if the directory already exists
tokenizer.save_model(model_path)
# save config
# The settings go to tokenizer_config.json, the file the tokenizer's
# from_pretrained() looks for; a file named config.json is the *model*
# configuration and is not read when loading a tokenizer.
with open(os.path.join(model_path, "tokenizer_config.json"), "w", encoding="utf-8") as f:
    tokenizer_cfg = {
        "do_lower_case": True,
        "unk_token": "[UNK]",
        "sep_token": "[SEP]",
        "pad_token": "[PAD]",
        "cls_token": "[CLS]",
        "mask_token": "[MASK]",
        "max_len": max_length,
        "model_max_length": max_length
    }
    json.dump(tokenizer_cfg, f)







## Load tokenizer

In [14]:
from transformers import BertTokenizerFast

# BertTokenizerFast is the fast variant of BertTokenizer. It is backed by the
# Hugging Face `tokenizers` library (written in Rust) and tokenizes much faster
# than the regular Python-based BertTokenizer, which is why it is often
# preferred for large datasets.
model_path = "pretrained-bert"
# load tokenizer
tokenizer = BertTokenizerFast.from_pretrained(model_path)

## Preprocessing dataset


In [15]:
print("truncate_longer_samples:", truncate_longer_samples)

def encode_with_truncation(examples):
    """Tokenize a batch, truncating and padding every sample to ``max_length``."""
    return tokenizer(examples["text"], truncation=True, padding="max_length", max_length=max_length, return_special_tokens_mask=True)

def encode_without_truncation(examples):
    """Tokenize a batch as-is: no truncation and no padding, so sample lengths vary."""
    return tokenizer(examples["text"], return_special_tokens_mask=True)


# Pick the encoder that matches the preprocessing mode chosen in the config cell.
encode = encode_with_truncation if truncate_longer_samples else encode_without_truncation


print("Encoding training data...")

train_dataset_encoded = train_dataset.map(encode, batched=True)
test_dataset_encoded = test_dataset.map(encode, batched=True)

if truncate_longer_samples:
    train_dataset_encoded.set_format(type="torch", columns=["input_ids", "attention_mask"])
    test_dataset_encoded.set_format(type="torch", columns=["input_ids", "attention_mask"])
else:
    # Expose the same columns on BOTH splits so the later grouping step sees
    # identical inputs for train and test (the test split was previously skipped).
    train_dataset_encoded.set_format(columns=["input_ids", "attention_mask", "special_tokens_mask"])
    test_dataset_encoded.set_format(columns=["input_ids", "attention_mask", "special_tokens_mask"])

truncate_longer_samples: False
Encoding training data...


Map: 100%|██████████| 5166936/5166936 [35:25<00:00, 2430.53 examples/s]
Map: 100%|██████████| 1291734/1291734 [08:38<00:00, 2491.37 examples/s]


In [17]:
# Sanity check: print the encoded training split's summary (column names and row count).
print(train_dataset_encoded)

Dataset({
    features: ['text', 'input_ids', 'token_type_ids', 'attention_mask', 'special_tokens_mask'],
    num_rows: 5166936
})


In [None]:
from itertools import chain

def group_texts(examples, block_size=None):
    """Concatenate every column of a batch and re-cut it into fixed-size chunks.

    Args:
        examples: Mapping from column name to a list of sequences (one per sample).
        block_size: Length of each output chunk. Defaults to the notebook-level
            ``max_length`` when omitted.

    Returns:
        A dict with the same keys, each holding a list of chunks. When the
        concatenated length is at least ``block_size``, the trailing remainder
        that does not fill a whole chunk is dropped; when it is shorter, the
        whole concatenation is returned as a single short chunk.
    """
    if block_size is None:
        block_size = max_length
    # Flatten each column: all samples of the batch become one long sequence.
    concatenated_examples = {k: list(chain.from_iterable(examples[k])) for k in examples.keys()}
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    if total_length >= block_size:
        # Round down to a multiple of block_size, discarding the remainder.
        total_length = (total_length // block_size) * block_size
    result = {
        k: [t[i: i + block_size] for i in range(0, total_length, block_size)] for k, t in concatenated_examples.items()
    }
    return result

if not truncate_longer_samples:
    train_dataset_encoded = train_dataset_encoded.map(group_texts, batched=True, desc=f'Grouping texts in chunks of size {max_length}')
    test_dataset_encoded = test_dataset_encoded.map(group_texts, batched=True, desc=f'Grouping texts in chunks of size {max_length}')

# Convert both splits to torch tensors (the test split was previously left
# unconverted because the train line was duplicated).
train_dataset_encoded.set_format(type='torch')
test_dataset_encoded.set_format(type='torch')

## Train BERT from scratch

In [None]:
from transformers import BertConfig, BertForMaskedLM, DataCollatorForLanguageModeling, TrainingArguments, Trainer

# Build a BERT masked-LM from a configuration only (no pretrained weights),
# sized to the tokenizer's vocabulary and the chosen maximum sequence length.
model_config = BertConfig(max_position_embeddings=max_length, vocab_size=vocab_size)
model = BertForMaskedLM(model_config)

# Collator for the masked-language-modelling objective (mlm_probability=0.2).
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=True,
    mlm_probability=0.2,
)

training_args = TrainingArguments(
    # Output and checkpointing
    output_dir=model_path,
    overwrite_output_dir=True,
    save_steps=1000,
    save_total_limit=3,
    # Evaluation and logging cadence
    evaluation_strategy="steps",
    logging_steps=1000,
    # Batch sizes
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    gradient_accumulation_steps=8,
    # Optimisation
    num_train_epochs=10,
    learning_rate=2e-5,
    weight_decay=0.01,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset_encoded,
    eval_dataset=test_dataset_encoded,
    data_collator=data_collator,
)

trainer.train()

## Invoke trained BERT model

In [None]:
import os

from transformers import pipeline

# Load the weights from the checkpoint sub-directory written under the output
# directory. The checkpoint name must be part of the path: passed as a second
# positional argument it would be forwarded to the model constructor instead.
# NOTE: "checkpoint-10000" must exist on disk; with save_total_limit=3 only the
# most recent checkpoints are kept, so adjust the step number if necessary.
checkpoint_path = os.path.join(model_path, "checkpoint-10000")
model = BertForMaskedLM.from_pretrained(checkpoint_path)
tokenizer = BertTokenizerFast.from_pretrained(model_path)
fill_mask = pipeline("fill-mask", model=model, tokenizer=tokenizer)


examples = [
    "Today's most trending hashtages on [MASK] is Donald Trump",
    "The [MASK] was cloudy yesterday, but today it's sunny!"
]

# Print every candidate completion with its score, one block per example.
for example in examples:
    for prediction in fill_mask(example):
        print(f"{prediction['sequence']}, confidence: {prediction['score']:.3f}")
    print("=" * 50)