In [1]:
import pandas as pd
import torch
from transformers import BertTokenizer, BertForMaskedLM, TextDataset, DataCollatorForLanguageModeling, Trainer, TrainingArguments

In [4]:
data = pd.read_csv('careers_masked.csv')
data.head()

Unnamed: 0,text
0,accounting technicians handle daytoday money a...
1,admin assistants give support to [MASK] by org...
2,arts [MASK] help organise exhibitions manage s...
3,assistant immigration officers check that peop...
4,internal and external auditors check organisat...


In [5]:
# Initialize the tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForMaskedLM.from_pretrained('bert-base-uncased')

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [6]:
# Tokenize the dataset and convert it to a PyTorch dataset
tokenized_dataset = TextDataset(tokenizer=tokenizer, file_path="careers_masked.csv", block_size=128)



In [7]:
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, 
    mlm=True, 
    mlm_probability=0.15
)

In [8]:
training_args = TrainingArguments(
    output_dir="./", 
    overwrite_output_dir=True, 
    num_train_epochs=3, 
    per_device_train_batch_size=16, 
    save_total_limit=1
)

In [9]:
trainer = Trainer(
    model=model, 
    args=training_args, 
    data_collator=data_collator, 
    train_dataset=tokenized_dataset
)

In [10]:
# Train the model from scratch
trainer.train()

***** Running training *****
  Num examples = 9237
  Num Epochs = 3
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 1734
  Number of trainable parameters = 109514298


  0%|          | 0/1734 [00:00<?, ?it/s]

Saving model checkpoint to ./checkpoint-500
Configuration saved in ./checkpoint-500/config.json
Configuration saved in ./checkpoint-500/generation_config.json


{'loss': 1.6899, 'learning_rate': 3.558246828143022e-05, 'epoch': 0.87}


Model weights saved in ./checkpoint-500/pytorch_model.bin
Deleting older checkpoint [checkpoint-1500] due to args.save_total_limit
Saving model checkpoint to ./checkpoint-1000
Configuration saved in ./checkpoint-1000/config.json
Configuration saved in ./checkpoint-1000/generation_config.json


{'loss': 1.2685, 'learning_rate': 2.116493656286044e-05, 'epoch': 1.73}


Model weights saved in ./checkpoint-1000/pytorch_model.bin
Deleting older checkpoint [checkpoint-500] due to args.save_total_limit
Saving model checkpoint to ./checkpoint-1500
Configuration saved in ./checkpoint-1500/config.json
Configuration saved in ./checkpoint-1500/generation_config.json


{'loss': 1.1422, 'learning_rate': 6.747404844290659e-06, 'epoch': 2.6}


Model weights saved in ./checkpoint-1500/pytorch_model.bin
Deleting older checkpoint [checkpoint-1000] due to args.save_total_limit


Training completed. Do not forget to share your model on huggingface.co/models =)




{'train_runtime': 8316.7222, 'train_samples_per_second': 3.332, 'train_steps_per_second': 0.208, 'train_loss': 1.3279785279446683, 'epoch': 3.0}


TrainOutput(global_step=1734, training_loss=1.3279785279446683, metrics={'train_runtime': 8316.7222, 'train_samples_per_second': 3.332, 'train_steps_per_second': 0.208, 'train_loss': 1.3279785279446683, 'epoch': 3.0})

In [11]:
# Save the trained model
tokenizer.save_pretrained("fine-tuned-bert")
model.save_pretrained("fine-tuned-bert")

tokenizer config file saved in fine-tuned-bert/tokenizer_config.json
Special tokens file saved in fine-tuned-bert/special_tokens_map.json
Configuration saved in fine-tuned-bert/config.json
Configuration saved in fine-tuned-bert/generation_config.json
Model weights saved in fine-tuned-bert/pytorch_model.bin


In [6]:
#Function to predict the masked token
def predict_token(prompt, model, tokenizer):
    # Tokenize prompt and get mask indices
    tokenized_prompt = tokenizer(prompt, return_tensors='pt')
    mask_indices = torch.where(tokenized_prompt['input_ids'] == tokenizer.mask_token_id)[1]
    # Generate text with masked words filled in
    with torch.no_grad():
        output = model(**tokenized_prompt)
        logits = output.logits
        mask_logits = logits[0, mask_indices, :]
        mask_probabilities = torch.softmax(mask_logits, dim=-1)
        predicted_tokens = torch.argmax(mask_probabilities, dim=-1)
        predicted_words = tokenizer.convert_ids_to_tokens(predicted_tokens)
        filled_prompt = prompt.replace("[MASK]", predicted_words[0])
        return filled_prompt

In [14]:
# Load pre-trained and fine-tuned model and tokenizer and predciting tokens
prompts = [
    "[MASK] hair looks great today.",
    "He never leaves [MASK] phone at home.",
    "[MASK] is very attractive.",
    "Paul is passionate about [MASK] work as a teacher.",
    "Paul decided to dye [MASK] hair a new color."
]
pt_model_name = 'bert-base-uncased'
pt_model = BertForMaskedLM.from_pretrained(pt_model_name)
pt_tokenizer = BertTokenizer.from_pretrained(pt_model_name)
ft_model_name = 'fine-tuned-bert'
ft_model = BertForMaskedLM.from_pretrained(ft_model_name)
ft_tokenizer = BertTokenizer.from_pretrained(ft_model_name)
for prompt in prompts:
    print("\nPrompt:", prompt)
    pt_output = predict_token(prompt, pt_model,pt_tokenizer)
    ft_output = predict_token(prompt, ft_model,ft_tokenizer)
    print("Pre-trained BERT:", pt_output)
    print("Fine-tuned BERT:", ft_output)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).



Prompt: [MASK] hair looks great today.
Pre-trained BERT: her hair looks great today.
Fine-tuned BERT: your hair looks great today.

Prompt: He never leaves [MASK] phone at home.
Pre-trained BERT: He never leaves his phone at home.
Fine-tuned BERT: He never leaves the phone at home.

Prompt: [MASK] is very attractive.
Pre-trained BERT: she is very attractive.
Fine-tuned BERT: it is very attractive.

Prompt: Paul is passionate about [MASK] work as a teacher.
Pre-trained BERT: Paul is passionate about his work as a teacher.
Fine-tuned BERT: Paul is passionate about finding work as a teacher.

Prompt: Paul decided to dye [MASK] hair a new color.
Pre-trained BERT: Paul decided to dye his hair a new color.
Fine-tuned BERT: Paul decided to dye her hair a new color.
