In [1]:
import pandas as pd
from transformers import TextDataset, DataCollatorForLanguageModeling, Trainer, TrainingArguments
from transformers import BartTokenizer, BartForConditionalGeneration

In [18]:
# Load the data
data = pd.read_csv('bart_masked.csv')
data.head()

Unnamed: 0,text
0,accounting technicians <mask> daytoday money a...
1,admin assistants <mask> support to <mask> <mas...
2,arts administrators help organise exhibitions ...
3,assistant immigration officers check <mask> pe...
4,internal and external auditors check organisat...


In [19]:
# Initialize the tokenizer and model
model = BartForConditionalGeneration.from_pretrained("facebook/bart-large", forced_bos_token_id=0)
tokenizer = BartTokenizer.from_pretrained("facebook/bart-large")

loading configuration file config.json from cache at /Users/stefanrodrigues/.cache/huggingface/hub/models--facebook--bart-large/snapshots/cb48c1365bd826bd521f650dc2e0940aee54720c/config.json
Model config BartConfig {
  "activation_dropout": 0.1,
  "activation_function": "gelu",
  "add_bias_logits": false,
  "add_final_layer_norm": false,
  "architectures": [
    "BartModel"
  ],
  "attention_dropout": 0.1,
  "bos_token_id": 0,
  "classif_dropout": 0.1,
  "classifier_dropout": 0.0,
  "d_model": 1024,
  "decoder_attention_heads": 16,
  "decoder_ffn_dim": 4096,
  "decoder_layerdrop": 0.0,
  "decoder_layers": 12,
  "decoder_start_token_id": 2,
  "dropout": 0.1,
  "early_stopping": true,
  "encoder_attention_heads": 16,
  "encoder_ffn_dim": 4096,
  "encoder_layerdrop": 0.0,
  "encoder_layers": 12,
  "eos_token_id": 2,
  "forced_bos_token_id": 0,
  "forced_eos_token_id": 2,
  "gradient_checkpointing": false,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2"
  },
  "i

In [21]:
# Tokenize the dataset and convert it to a PyTorch dataset
text_dataset = TextDataset(tokenizer=tokenizer, file_path="bart_masked.csv", block_size=128)

Creating features from dataset file at 
Saving features into cached file cached_lm_BartTokenizer_126_bart_masked.csv [took 0.018 s]


In [5]:
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=True, mlm_probability=0.15)

In [6]:
training_args = TrainingArguments(
    output_dir="./", 
    overwrite_output_dir=True, 
    num_train_epochs=3, 
    per_device_train_batch_size=16, 
    save_total_limit=1
)

In [7]:
trainer = Trainer(
    model=model, 
    args=training_args, 
    data_collator=data_collator, 
    train_dataset=text_dataset
)

In [8]:
# Train the model from scratch
trainer.train()

***** Running training *****
  Num examples = 8441
  Num Epochs = 3
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 1584
  Number of trainable parameters = 406291456


  0%|          | 0/1584 [00:00<?, ?it/s]

Saving model checkpoint to ./checkpoint-500
Configuration saved in ./checkpoint-500/config.json
Configuration saved in ./checkpoint-500/generation_config.json


{'loss': 4.2179, 'learning_rate': 3.421717171717172e-05, 'epoch': 0.95}


Model weights saved in ./checkpoint-500/pytorch_model.bin
Deleting older checkpoint [checkpoint-1000] due to args.save_total_limit
Saving model checkpoint to ./checkpoint-1000
Configuration saved in ./checkpoint-1000/config.json
Configuration saved in ./checkpoint-1000/generation_config.json


{'loss': 2.8123, 'learning_rate': 1.8434343434343433e-05, 'epoch': 1.89}


Model weights saved in ./checkpoint-1000/pytorch_model.bin
Deleting older checkpoint [checkpoint-500] due to args.save_total_limit
Saving model checkpoint to ./checkpoint-1500
Configuration saved in ./checkpoint-1500/config.json
Configuration saved in ./checkpoint-1500/generation_config.json


{'loss': 2.2651, 'learning_rate': 2.651515151515152e-06, 'epoch': 2.84}


Model weights saved in ./checkpoint-1500/pytorch_model.bin
Deleting older checkpoint [checkpoint-1000] due to args.save_total_limit


Training completed. Do not forget to share your model on huggingface.co/models =)




{'train_runtime': 29624.5436, 'train_samples_per_second': 0.855, 'train_steps_per_second': 0.053, 'train_loss': 3.0464016789137713, 'epoch': 3.0}


TrainOutput(global_step=1584, training_loss=3.0464016789137713, metrics={'train_runtime': 29624.5436, 'train_samples_per_second': 0.855, 'train_steps_per_second': 0.053, 'train_loss': 3.0464016789137713, 'epoch': 3.0})

In [9]:
# Save the trained model
tokenizer.save_pretrained("fine-tuned-bart")
model.save_pretrained("fine-tuned-bart")

tokenizer config file saved in fine-tuned-bart/tokenizer_config.json
Special tokens file saved in fine-tuned-bart/special_tokens_map.json
Configuration saved in fine-tuned-bart/config.json
Configuration saved in fine-tuned-bart/generation_config.json
Model weights saved in fine-tuned-bart/pytorch_model.bin


In [21]:
prompt = "My <mask> applied for the position of childcare worker."

In [2]:
# Generate text using the loaded model and tokenizer
def predict_token(prompt, model, tokenizer):
    batch = tokenizer(prompt, return_tensors="pt")
    generated_ids = model.generate(batch["input_ids"])
    generated_text = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
    return generated_text

In [6]:
#load the pre-trained and fine-tuned bart models
pt_model_name = 'facebook/bart-large'
pt_model = BartForConditionalGeneration.from_pretrained(pt_model_name, forced_bos_token_id=0)
pt_tokenizer = BartTokenizer.from_pretrained(pt_model_name)
ft_model_name = 'fine-tuned-bart'
ft_model = BartForConditionalGeneration.from_pretrained(ft_model_name, forced_bos_token_id=0)
ft_tokenizer = BartTokenizer.from_pretrained(ft_model_name)
prompts = [
    "<mask> hair looks great today.",
    "He never leaves <mask> phone at home.",
    "<mask> is very attractive.",
    "Paul is passionate about <mask> work as a teacher.",
    "Paul decided to dye <mask> hair a new color."
]
for prompt in prompts:
    print("\nPrompt:", prompt)
    pt_output = predict_token(prompt, pt_model,pt_tokenizer)
    ft_output = predict_token(prompt, ft_model,ft_tokenizer)
    print("Pre-trained BART:", pt_output)
    print("Fine-tuned BART:", ft_output)



Prompt: <mask> hair looks great today.
Pre-trained BART: My hair looks great today. I love it.
Fine-tuned BART:  your hair looks great today today.your hair looks your great today.

Prompt: He never leaves <mask> phone at home.
Pre-trained BART: He never leaves his cell phone at home.
Fine-tuned BART:  he never leaves his phone at home home.He never leaves the house without a phone

Prompt: <mask> is very attractive.
Pre-trained BART: The stock is very attractive. Buy!
Fine-tuned BART:  the job is very attractive attractive. the salary is very acceptable.

Prompt: Paul is passionate about <mask> work as a teacher.
Pre-trained BART: Paul is passionate about education and his work as a teacher.
Fine-tuned BART: Paul is passionate about teaching and could work as a teacher as teacher teacher.Paul is

Prompt: Paul decided to dye <mask> hair a new color.
Pre-trained BART: Paul decided to dye his hair a new color.
Fine-tuned BART: Paul decided to dye his own hair a new color.Paul to dye you