In [1]:
import pandas as pd
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments,TextDataset, DataCollatorForLanguageModeling
from torch.utils.data import random_split

In [2]:
# Load the CSV file into a Pandas dataframe
df = pd.read_csv('careers_single.csv')

In [3]:
# Initialize the T5 tokenizer and model
tokenizer = T5Tokenizer.from_pretrained("t5-base")
model = T5ForConditionalGeneration.from_pretrained("t5-base")

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [4]:
# Tokenize the dataset and convert it to a PyTorch dataset
text_dataset = TextDataset(tokenizer=tokenizer, file_path="careers_single.csv", block_size=128)



In [5]:
# Split the dataset into train and validation sets
train_size = int(0.9 * len(text_dataset))
val_size = len(text_dataset) - train_size
train_dataset, val_dataset = random_split(text_dataset, [train_size, val_size])

In [6]:
# Define the data collator
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, 
    mlm=False, 
    pad_to_multiple_of=8
)

In [7]:
# Define the training arguments
training_args = TrainingArguments(
    output_dir="./", 
    overwrite_output_dir=True, 
    num_train_epochs=3, 
    per_device_train_batch_size=16, 
    per_device_eval_batch_size=16,
    save_total_limit=1,
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    eval_steps=50
)

In [8]:
# Define the trainer
trainer = Trainer(
    model=model, 
    args=training_args, 
    data_collator=data_collator, 
    train_dataset=train_dataset, 
    eval_dataset=val_dataset
)

In [9]:
# Train the model
trainer.train()

***** Running training *****
  Num examples = 8048
  Num Epochs = 3
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 1509
  Number of trainable parameters = 222903552


  0%|          | 0/1509 [00:00<?, ?it/s]

{'loss': 2.9148, 'learning_rate': 1.9867461895294898e-05, 'epoch': 0.02}
{'loss': 1.9312, 'learning_rate': 1.9734923790589798e-05, 'epoch': 0.04}
{'loss': 0.753, 'learning_rate': 1.960238568588469e-05, 'epoch': 0.06}
{'loss': 0.247, 'learning_rate': 1.946984758117959e-05, 'epoch': 0.08}
{'loss': 0.1668, 'learning_rate': 1.9337309476474488e-05, 'epoch': 0.1}
{'loss': 0.1206, 'learning_rate': 1.9204771371769388e-05, 'epoch': 0.12}
{'loss': 0.0917, 'learning_rate': 1.907223326706428e-05, 'epoch': 0.14}
{'loss': 0.0746, 'learning_rate': 1.893969516235918e-05, 'epoch': 0.16}
{'loss': 0.0536, 'learning_rate': 1.8807157057654078e-05, 'epoch': 0.18}
{'loss': 0.0419, 'learning_rate': 1.8674618952948974e-05, 'epoch': 0.2}
{'loss': 0.0399, 'learning_rate': 1.854208084824387e-05, 'epoch': 0.22}
{'loss': 0.0412, 'learning_rate': 1.8409542743538767e-05, 'epoch': 0.24}
{'loss': 0.0352, 'learning_rate': 1.8277004638833667e-05, 'epoch': 0.26}
{'loss': 0.0311, 'learning_rate': 1.8144466534128564e-05, 'e

Saving model checkpoint to ./checkpoint-500
Configuration saved in ./checkpoint-500/config.json
Configuration saved in ./checkpoint-500/generation_config.json


{'loss': 0.0077, 'learning_rate': 1.3373094764744865e-05, 'epoch': 0.99}


Model weights saved in ./checkpoint-500/pytorch_model.bin
Deleting older checkpoint [checkpoint-1000] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 895
  Batch size = 16


  0%|          | 0/56 [00:00<?, ?it/s]

{'eval_loss': 0.0021900897845625877, 'eval_runtime': 98.3567, 'eval_samples_per_second': 9.1, 'eval_steps_per_second': 0.569, 'epoch': 1.0}
{'loss': 0.0123, 'learning_rate': 1.3240556660039763e-05, 'epoch': 1.01}
{'loss': 0.0065, 'learning_rate': 1.3108018555334661e-05, 'epoch': 1.03}
{'loss': 0.0058, 'learning_rate': 1.2975480450629556e-05, 'epoch': 1.05}
{'loss': 0.0125, 'learning_rate': 1.2842942345924454e-05, 'epoch': 1.07}
{'loss': 0.0086, 'learning_rate': 1.2710404241219351e-05, 'epoch': 1.09}
{'loss': 0.0073, 'learning_rate': 1.257786613651425e-05, 'epoch': 1.11}
{'loss': 0.009, 'learning_rate': 1.2445328031809146e-05, 'epoch': 1.13}
{'loss': 0.0112, 'learning_rate': 1.2312789927104042e-05, 'epoch': 1.15}
{'loss': 0.0066, 'learning_rate': 1.218025182239894e-05, 'epoch': 1.17}
{'loss': 0.0095, 'learning_rate': 1.2047713717693839e-05, 'epoch': 1.19}
{'loss': 0.0068, 'learning_rate': 1.1915175612988734e-05, 'epoch': 1.21}
{'loss': 0.0083, 'learning_rate': 1.1782637508283632e-05, 'e

Saving model checkpoint to ./checkpoint-1000
Configuration saved in ./checkpoint-1000/config.json
Configuration saved in ./checkpoint-1000/generation_config.json


{'loss': 0.0056, 'learning_rate': 6.7461895294897285e-06, 'epoch': 1.99}


Model weights saved in ./checkpoint-1000/pytorch_model.bin
Deleting older checkpoint [checkpoint-500] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 895
  Batch size = 16


  0%|          | 0/56 [00:00<?, ?it/s]

{'eval_loss': 0.001569723361171782, 'eval_runtime': 97.2229, 'eval_samples_per_second': 9.206, 'eval_steps_per_second': 0.576, 'epoch': 2.0}
{'loss': 0.0045, 'learning_rate': 6.613651424784626e-06, 'epoch': 2.01}
{'loss': 0.0049, 'learning_rate': 6.4811133200795225e-06, 'epoch': 2.03}
{'loss': 0.0041, 'learning_rate': 6.348575215374421e-06, 'epoch': 2.05}
{'loss': 0.0038, 'learning_rate': 6.216037110669318e-06, 'epoch': 2.07}
{'loss': 0.0045, 'learning_rate': 6.083499005964215e-06, 'epoch': 2.09}
{'loss': 0.0059, 'learning_rate': 5.950960901259113e-06, 'epoch': 2.11}
{'loss': 0.0057, 'learning_rate': 5.81842279655401e-06, 'epoch': 2.13}
{'loss': 0.007, 'learning_rate': 5.685884691848907e-06, 'epoch': 2.15}
{'loss': 0.007, 'learning_rate': 5.5533465871438045e-06, 'epoch': 2.17}
{'loss': 0.0061, 'learning_rate': 5.420808482438702e-06, 'epoch': 2.19}
{'loss': 0.0049, 'learning_rate': 5.2882703777335986e-06, 'epoch': 2.21}
{'loss': 0.005, 'learning_rate': 5.155732273028497e-06, 'epoch': 2.

Saving model checkpoint to ./checkpoint-1500
Configuration saved in ./checkpoint-1500/config.json
Configuration saved in ./checkpoint-1500/generation_config.json


{'loss': 0.0063, 'learning_rate': 1.1928429423459245e-07, 'epoch': 2.98}


Model weights saved in ./checkpoint-1500/pytorch_model.bin
Deleting older checkpoint [checkpoint-1000] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 895
  Batch size = 16


  0%|          | 0/56 [00:00<?, ?it/s]



Training completed. Do not forget to share your model on huggingface.co/models =)




{'eval_loss': 0.0013568727299571037, 'eval_runtime': 91.8732, 'eval_samples_per_second': 9.742, 'eval_steps_per_second': 0.61, 'epoch': 3.0}
{'train_runtime': 14997.2764, 'train_samples_per_second': 1.61, 'train_steps_per_second': 0.101, 'train_loss': 0.051731910956486556, 'epoch': 3.0}


TrainOutput(global_step=1509, training_loss=0.051731910956486556, metrics={'train_runtime': 14997.2764, 'train_samples_per_second': 1.61, 'train_steps_per_second': 0.101, 'train_loss': 0.051731910956486556, 'epoch': 3.0})

In [10]:
# Save the fine-tuned model
model.save_pretrained('fine-tuned-t5-base')
tokenizer.save_pretrained('fine-tuned-t5-base')

Configuration saved in fine-tuned-t5-base/config.json
Configuration saved in fine-tuned-t5-base/generation_config.json
Model weights saved in fine-tuned-t5-base/pytorch_model.bin
tokenizer config file saved in fine-tuned-t5-base/tokenizer_config.json
Special tokens file saved in fine-tuned-t5-base/special_tokens_map.json


('fine-tuned-t5-base/tokenizer_config.json',
 'fine-tuned-t5-base/special_tokens_map.json',
 'fine-tuned-t5-base/spiece.model',
 'fine-tuned-t5-base/added_tokens.json')

In [2]:
# Load the pre-trained and fine-tuned T5 models
pt_model_name = "t5-base"
pt_tokenizer = T5Tokenizer.from_pretrained(pt_model_name)
pt_model = T5ForConditionalGeneration.from_pretrained(pt_model_name)
ft_model_name = "fine-tuned-t5-base"
ft_tokenizer = T5Tokenizer.from_pretrained(ft_model_name)
ft_model = T5ForConditionalGeneration.from_pretrained(ft_model_name)

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [62]:
# Define the input prompt
prompt = "My dad applied for the position of kindergarten teacher."

In [3]:
#Generate Output
def generate_output(prompt, model, tokenizer):
    input_ids = tokenizer.encode(prompt, return_tensors="pt")
    # Generate text completion
    output_ids = model.generate(input_ids, max_length=100, do_sample=True, temperature=1.5, top_k=50,repetition_penalty=2.0, top_p=0.95)
    # Decode the generated tokens back into text
    output_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    return output_text

In [7]:
# prompt = "My dad applied for the position of kindergarten teacher."
prompts = [
    "My dad applied for the position of kindergarten teacher."
]
for prompt in prompts:
    print("Prompt:", prompt)
    pt_output = generate_output(prompt, pt_model, pt_tokenizer)
    print("Pre-trained T5 output:", pt_output)
    ft_output = generate_output(prompt, ft_model, ft_tokenizer)
    print("Fine-tuned T5 output:", ft_output)

Prompt: My dad applied for the position of kindergarten teacher.
Pre-trained T5 output: On December 25, I turned 14. Many thanks for your post.
Fine-tuned T5 output: (My dad applied for the position of kindergarten teacher.) My dad applied from an educational background in England during his first school year to become a teacher.
