In [30]:
from datasets import load_dataset

In [31]:
# Load a dataset from the HuggingFace Hub
dataset = load_dataset("wikitext", "wikitext-2-raw-v1")

In [32]:
import torch
from transformers import AutoTokenizer, AutoModelForMaskedLM

In [33]:
model_name = 'bert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForMaskedLM.from_pretrained(model_name)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [10]:
# Tokenize the dataset
def tokenize_function(examples):
    # Add padding and truncation as needed
    return tokenizer(examples['text'], padding='max_length', truncation=True, max_length=128)

tokenized_dataset = dataset.map(tokenize_function, batched=True)



Map:   0%|          | 0/4358 [00:00<?, ? examples/s]

In [15]:
train_test_split = tokenized_dataset['train'].train_test_split(test_size=0.1)
train_dataset = train_test_split['train']
eval_dataset = train_test_split['test']

In [16]:
from transformers import DataCollatorForLanguageModeling

# Data Collator with masking for MLM
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=True,
    mlm_probability=0.15 # 15% of Tokens will be masked
    )

In [17]:
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=8,
    save_steps=10_000,
    save_total_limit=2,)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset
    )




In [18]:
trainer.train()

Step,Training Loss
500,1.788
1000,1.84
1500,1.8638
2000,1.857
2500,1.8859
3000,1.8729
3500,1.8838
4000,1.8653
4500,1.8132
5000,1.7367


TrainOutput(global_step=12393, training_loss=1.7410674583418437, metrics={'train_runtime': 26285.4848, 'train_samples_per_second': 3.772, 'train_steps_per_second': 0.471, 'total_flos': 6523399682841600.0, 'train_loss': 1.7410674583418437, 'epoch': 3.0})

In [29]:
# Evaluate
eval_results = trainer.evaluate()

# Save the model
trainer.save_model('./bert-fill-mask')

# Also need to save the tokenizer
tokenizer.save_pretrained('./bert-fill-mask')


('./bert-fill-mask/tokenizer_config.json',
 './bert-fill-mask/special_tokens_map.json',
 './bert-fill-mask/vocab.txt',
 './bert-fill-mask/added_tokens.json',
 './bert-fill-mask/tokenizer.json')

In [28]:
from transformers import pipeline

fill_mask = pipeline('fill-mask', model='./bert-fill-mask')
result = fill_mask("The capital of France is [MASK].")
print(result)

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


[{'score': 0.25977739691734314, 'token': 3000, 'token_str': 'paris', 'sequence': 'the capital of france is paris.'}, {'score': 0.1738542914390564, 'token': 27160, 'token_str': 'toulon', 'sequence': 'the capital of france is toulon.'}, {'score': 0.11356217414140701, 'token': 22479, 'token_str': 'lille', 'sequence': 'the capital of france is lille.'}, {'score': 0.04045763984322548, 'token': 29025, 'token_str': 'pau', 'sequence': 'the capital of france is pau.'}, {'score': 0.03243820369243622, 'token': 16766, 'token_str': 'marseille', 'sequence': 'the capital of france is marseille.'}]
