In [1]:
# NOTE(review): the later cells visible in this notebook pass literal values
# (e.g. mlm_probability=0.15, batch size 8, 300 epochs) instead of reading
# these constants -- confirm which values are the intended ones.

# --- Tokenizer training ---
# Batch size used when training the tokenizer.
TOKENIZER_BATCH_SIZE = 256
# Total number of unique subwords the tokenizer can have.
TOKENIZER_VOCABULARY = 25000

# --- Sample construction ---
# Maximum number of tokens in an input sample.
BLOCK_SIZE = 128
# Probability that the next sentence is the actual next sentence in NSP.
NSP_PROB = 0.50
# Probability of generating shorter sequences, to minimize the mismatch
# between pretraining and fine-tuning.
SHORT_SEQ_PROB = 0.1
# Maximum number of tokens in an input sample after padding.
MAX_LENGTH = 512

# --- Masked language modeling ---
# Probability with which tokens are masked in MLM.
MLM_PROB = 0.2

# --- Model pretraining ---
# Batch size for pretraining the model.
TRAIN_BATCH_SIZE = 2
# Maximum number of epochs to train the model for.
MAX_EPOCHS = 1
# Learning rate for training the model.
LEARNING_RATE = 1e-4

# Name of the pretrained model on the Hugging Face Model Hub.
MODEL_CHECKPOINT = "bert-base-cased"

In [2]:
from transformers import BertTokenizer
# Instantiate the BERT tokenizer from the vocabulary file stored next to the
# notebook (relative path, so the working directory matters).
tokenizer = BertTokenizer(
    vocab_file="./tokenizer_wikitxt/vocab.txt",
)

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
from transformers import (
    CONFIG_MAPPING,MODEL_FOR_MASKED_LM_MAPPING, AutoConfig,
    BertForMaskedLM,
    AutoTokenizer,DataCollatorForLanguageModeling,HfArgumentParser,Trainer,TrainingArguments,set_seed,
)
# Overrides applied on top of the values read from ./config.json.
# 25000 is the same value as TOKENIZER_VOCABULARY in the configuration cell.
config_kwargs = dict(vocab_size=25000)
config = AutoConfig.from_pretrained(
    "./config.json",
    **config_kwargs,
)

# The model is constructed directly from the config (there is no
# `from_pretrained` call for the model in this cell), so no checkpoint
# weights are loaded here.
model = BertForMaskedLM(config)

In [4]:
from datasets import load_from_disk
# Read the previously saved dataset back from the local directory.
dataset = load_from_disk(
    dataset_path="./raw-wikitxt/",
)
# Column names of the training split, captured before tokenization.
column_names = dataset["train"].column_names



In [5]:
def tokenize_function(examples):
    """Tokenize one batch of raw text lines.

    Lines that are empty or consist solely of whitespace are dropped first;
    the filtered list is written back to ``examples["text"]`` (the batch is
    modified in place). The remaining lines are passed to the module-level
    ``tokenizer``, padded and truncated to 128 tokens, with the special-tokens
    mask requested in the output.

    Args:
        examples: Mapping with a ``"text"`` entry holding a list of strings.

    Returns:
        Whatever ``tokenizer`` returns for the filtered lines.
    """
    kept_lines = []
    for line in examples["text"]:
        # An empty string is falsy; isspace() catches whitespace-only lines.
        if line and not line.isspace():
            kept_lines.append(line)
    examples["text"] = kept_lines

    encode_options = {
        "padding": "max_length",  # pad every sample up to max_length
        "truncation": True,  # cut samples longer than max_length
        "max_length": 128,
        "return_special_tokens_mask": True,
    }
    return tokenizer(examples["text"], **encode_options)
# Run tokenize_function over the dataset in batches. The raw "text" column is
# removed from the result, and previously cached results are reused when
# available.
tokenized_dataset = dataset.map(
    tokenize_function,
    remove_columns="text",
    batched=True,
    load_from_cache_file=True,
    num_proc=None,
)

Loading cached processed dataset at c:\Users\Kevin\Desktop\code\pretrain_BERT_huggingface\raw-wikitxt\test\cache-27a0e9cfcdbcfbd5.arrow
Loading cached processed dataset at c:\Users\Kevin\Desktop\code\pretrain_BERT_huggingface\raw-wikitxt\train\cache-f0950ed5ff804cfa.arrow
Loading cached processed dataset at c:\Users\Kevin\Desktop\code\pretrain_BERT_huggingface\raw-wikitxt\validation\cache-583d8d680ff98a41.arrow


In [6]:
# Pull the training and validation splits out of the tokenized dataset.
train_dataset, eval_dataset = (
    tokenized_dataset["train"],
    tokenized_dataset["validation"],
)

In [7]:
# Collator for masked language modeling (mlm=True) with a masking
# probability of 0.15.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=True,
    mlm_probability=0.15,
)

# Training hyperparameters
pretrain_batch_size = 8
num_train_epochs = 300
training_args = TrainingArguments(
    output_dir='./outputs/',
    # The keyword must be spelled `overwrite_output_dir`; the misspelled
    # `overwrite_output_sdir` is not a TrainingArguments field and makes the
    # constructor raise TypeError.
    overwrite_output_dir=True,
    num_train_epochs=num_train_epochs,
    learning_rate=1e-4,
    per_device_train_batch_size=pretrain_batch_size,
    save_total_limit=10,
)

# Train the model through the Trainer interface. Only the training split is
# passed here; no evaluation dataset is given to the Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
)

In [None]:
# Start training without resuming from a saved checkpoint. Left as a bare
# expression so the notebook displays the returned training result.
trainer.train(
    resume_from_checkpoint=False,
)

In [None]:
# Write the trained model to the output directory.
trainer.save_model(output_dir="./outputs/")