In [2]:
from transformers import AutoModelForMaskedLM, AutoTokenizer
from transformers import DataCollatorForLanguageModeling, DataCollatorForWholeWordMask
from transformers import Trainer, TrainingArguments
import pandas as pd
import sys
import gzip
from datetime import datetime

# Checkpoint identifier handed to from_pretrained() for both the model and the tokenizer
model_name = 'hfl/rbtl3'
# Path prefix of the passage collection; shard names ('part-00', ...) are appended to it when loading
train_file = './dureader/passage-collection/'
# Passed to TrainingArguments as per_device_train_batch_size
per_device_train_batch_size = 64

save_steps = 10000              #Save model every 10k steps (the same interval is reused for evaluation and logging)
num_train_epochs = 3            #Number of epochs
use_fp16 = False                #Set to True, if your GPU supports FP16 operations
max_length = 256                #Max length for a text input; used as the tokenizer truncation limit (max_length=...)
do_whole_word_mask = True       #If set to true, DataCollatorForWholeWordMask is used instead of DataCollatorForLanguageModeling
mlm_prob = 0.15                 #Masking probability handed to the data collator as mlm_probability

# Instantiate the pretrained masked-LM together with its matching tokenizer
model = AutoModelForMaskedLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Checkpoints go into a directory stamped with today's date (YYYY-MM-DD)
run_date = datetime.now().strftime("%Y-%m-%d")
output_dir = "output/finetune_hlf-rbtl3-{}".format(run_date)
print("Save checkpoints to:", output_dir)

Some weights of the model checkpoint at hfl/rbtl3 were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Save checkpoints to: output/finetune_hlf-rbtl3-2022-06-12


In [6]:
%%time
# Passages are cut to a fixed number of characters before tokenization.
# NOTE(review): the cap is tied to max_length (a token limit) on the assumption of roughly
# one token per character for this corpus - confirm. The tokenizer still truncates to
# max_length tokens afterwards, so this only bounds the size of the raw strings kept in memory.
max_chars = max_length
shard_names = ['part-00', 'part-01', 'part-02', 'part-03']

train_sentences = []
skipped_lines = 0
for part in shard_names:
    with open(train_file + part, 'r', encoding='utf-8') as f:
        # Iterate the file lazily; readlines() would hold every line of the shard in memory at once
        for line in f:
            data = line.rstrip().split('\t')
            # The text is read from the third tab-separated column. Rows with fewer columns
            # (including rows whose empty third column was removed by rstrip) are skipped
            # instead of aborting the whole load with an IndexError.
            if len(data) < 3:
                skipped_lines += 1
                continue
            train_sentences.append(data[2][:max_chars])

print('train_sentences: ', len(train_sentences))
if skipped_lines:
    # Only reported when something was actually dropped, so clean data prints the same as before
    print('skipped malformed lines: ', skipped_lines)

train_sentences:  8096668
CPU times: user 25.7 s, sys: 4.06 s, total: 29.7 s
Wall time: 29.7 s


In [7]:
from sklearn.model_selection import train_test_split

# Hold out 1% of the sentences for evaluation; the fixed random_state keeps the split reproducible.
# Note that this rebinds train_sentences, so re-running the cell splits the already reduced list again.
split_parts = train_test_split(train_sentences, test_size=0.01, random_state=13)
train_sentences, dev_sentences = split_parts

for label, subset in (("Train sentences:", train_sentences), ("Dev sentences:", dev_sentences)):
    print(label, len(subset))

Train sentences: 8015701
Dev sentences: 80967


In [8]:
#A dataset wrapper, that tokenizes our data on-the-fly
class TokenizedSentencesDataset:
    """Indexable dataset that tokenizes sentences lazily, on access.

    Args:
        sentences: Indexable collection of raw sentence strings. It is only
            read, never modified.
        tokenizer: Callable invoked as
            ``tokenizer(text, add_special_tokens=True, truncation=True,
            max_length=..., return_special_tokens_mask=True)``.
        max_length: Truncation limit forwarded to the tokenizer.
        cache_tokenization: If True, the encoding of each string entry is
            computed once and reused on later accesses of the same index.
    """

    def __init__(self, sentences, tokenizer, max_length, cache_tokenization=False):
        self.tokenizer = tokenizer
        self.sentences = sentences
        self.max_length = max_length
        self.cache_tokenization = cache_tokenization
        # Encodings are memoised here, keyed by index, instead of being written
        # back into `sentences`. This keeps the caller's list intact and also
        # allows immutable sequences (e.g. tuples) to be used with caching.
        self._cache = {}

    def _tokenize(self, text):
        """Run the tokenizer on one entry with the fixed set of options."""
        return self.tokenizer(text, add_special_tokens=True, truncation=True, max_length=self.max_length, return_special_tokens_mask=True)

    def __getitem__(self, item):
        """Return the tokenizer output for ``sentences[item]``.

        Without caching the entry is tokenized on every access. With caching,
        string entries are tokenized once; entries that are not strings are
        returned unchanged.
        """
        entry = self.sentences[item]
        if not self.cache_tokenization:
            return self._tokenize(entry)

        # Non-string entries are passed through untouched in caching mode
        if not isinstance(entry, str):
            return entry

        if item not in self._cache:
            self._cache[item] = self._tokenize(entry)
        return self._cache[item]

    def __len__(self):
        """Number of sentences in the dataset."""
        return len(self.sentences)

# Training sentences are tokenized anew on every access
train_dataset = TokenizedSentencesDataset(train_sentences, tokenizer, max_length)

# The dev set is built with cache_tokenization=True; with no dev sentences there is no dev dataset at all
if len(dev_sentences) > 0:
    dev_dataset = TokenizedSentencesDataset(dev_sentences, tokenizer, max_length, cache_tokenization=True)
else:
    dev_dataset = None

In [9]:
# Choose the collator class first, then instantiate it once with the shared arguments
collator_cls = DataCollatorForWholeWordMask if do_whole_word_mask else DataCollatorForLanguageModeling
data_collator = collator_cls(tokenizer=tokenizer, mlm=True, mlm_probability=mlm_prob)

# Evaluate on a step schedule only when a dev dataset exists
eval_mode = "no" if dev_dataset is None else "steps"

# Evaluation, checkpointing and logging all share the save_steps interval;
# save_total_limit=1 is passed so that older checkpoints are not retained.
training_args = TrainingArguments(
    output_dir=output_dir,
    overwrite_output_dir=True,
    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=per_device_train_batch_size,
    fp16=use_fp16,
    evaluation_strategy=eval_mode,
    eval_steps=save_steps,
    save_steps=save_steps,
    logging_steps=save_steps,
    save_total_limit=1,
    prediction_loss_only=True,
)

In [10]:
# Wire the model, arguments, collator and datasets into a Trainer
trainer = Trainer(model=model, args=training_args, data_collator=data_collator,
                  train_dataset=train_dataset, eval_dataset=dev_dataset)

# The tokenizer is written out before training starts, so it is on disk
# even if the run is interrupted
print("Save tokenizer to:", output_dir)
tokenizer.save_pretrained(output_dir)

trainer.train()

# Store the final model weights in the same directory as the tokenizer
print("Save model to:", output_dir)
model.save_pretrained(output_dir)

print("Training done")

tokenizer config file saved in output/finetune_hlf-rbtl3-2022-06-12/tokenizer_config.json
Special tokens file saved in output/finetune_hlf-rbtl3-2022-06-12/special_tokens_map.json
***** Running training *****
  Num examples = 8015701
  Num Epochs = 3
  Instantaneous batch size per device = 64
  Total train batch size (w. parallel, distributed & accumulation) = 64
  Gradient Accumulation steps = 1
  Total optimization steps = 375738


Save tokenizer to: output/finetune_hlf-rbtl3-2022-06-12


The following columns in the training set don't have a corresponding argument in `BertForMaskedLM.forward` and have been ignored: special_tokens_mask. If special_tokens_mask are not expected by `BertForMaskedLM.forward`,  you can safely ignore this message.


Step,Training Loss,Validation Loss


KeyboardInterrupt: 