In [1]:
import pandas as pd
from transformers import GPT2Tokenizer, GPT2LMHeadModel, TextDataset, DataCollatorForLanguageModeling, Trainer, TrainingArguments
from torch.utils.data import random_split
from transformers import AutoModelForCausalLM, AutoTokenizer

In [20]:
# Load the data
data = pd.read_csv('careers_single.csv')
data.head()

Unnamed: 0,text
0,accounting technicians handle daytoday money a...
1,admin assistants give support to businesses by...
2,arts administrators help organise exhibitions ...
3,assistant immigration officers check that peop...
4,internal and external auditors check organisat...


In [21]:
# Initialize the tokenizer with special tokens
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')

In [22]:
# Tokenize the dataset and convert it to a PyTorch dataset
text_dataset = TextDataset(tokenizer=tokenizer, file_path="careers_single.csv", block_size=128)



In [23]:
# Split the dataset into train and validation sets
train_size = int(0.9 * len(text_dataset))
val_size = len(text_dataset) - train_size
train_dataset, val_dataset = random_split(text_dataset, [train_size, val_size])

In [24]:
# Define the data collator
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, 
    mlm=False, 
    pad_to_multiple_of=8
)

In [25]:
# Define the training arguments
training_args = TrainingArguments(
    output_dir="./", 
    overwrite_output_dir=True, 
    num_train_epochs=3, 
    per_device_train_batch_size=16, 
    per_device_eval_batch_size=16,
    save_total_limit=1,
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    eval_steps=50
)

In [26]:
# Define the trainer
trainer = Trainer(
    model=model, 
    args=training_args, 
    data_collator=data_collator, 
    train_dataset=train_dataset, 
    eval_dataset=val_dataset
)

In [27]:
# Train the model
trainer.train()

***** Running training *****
  Num examples = 7578
  Num Epochs = 3
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 1422
  Number of trainable parameters = 124439808


  0%|          | 0/1422 [00:00<?, ?it/s]

{'loss': 4.3655, 'learning_rate': 1.9859353023909988e-05, 'epoch': 0.02}
{'loss': 3.9457, 'learning_rate': 1.9718706047819975e-05, 'epoch': 0.04}
{'loss': 3.6474, 'learning_rate': 1.957805907172996e-05, 'epoch': 0.06}
{'loss': 3.5178, 'learning_rate': 1.9437412095639944e-05, 'epoch': 0.08}
{'loss': 3.399, 'learning_rate': 1.929676511954993e-05, 'epoch': 0.11}
{'loss': 3.0913, 'learning_rate': 1.9156118143459917e-05, 'epoch': 0.13}
{'loss': 3.0143, 'learning_rate': 1.9015471167369904e-05, 'epoch': 0.15}
{'loss': 3.0163, 'learning_rate': 1.8874824191279887e-05, 'epoch': 0.17}
{'loss': 2.9691, 'learning_rate': 1.8734177215189874e-05, 'epoch': 0.19}
{'loss': 2.739, 'learning_rate': 1.859353023909986e-05, 'epoch': 0.21}
{'loss': 2.7906, 'learning_rate': 1.8452883263009847e-05, 'epoch': 0.23}
{'loss': 2.5987, 'learning_rate': 1.8312236286919833e-05, 'epoch': 0.25}
{'loss': 2.5854, 'learning_rate': 1.8171589310829816e-05, 'epoch': 0.27}
{'loss': 2.5603, 'learning_rate': 1.8030942334739806e-05

***** Running Evaluation *****
  Num examples = 842
  Batch size = 16


  0%|          | 0/53 [00:00<?, ?it/s]

{'eval_loss': 1.9909794330596924, 'eval_runtime': 102.5856, 'eval_samples_per_second': 8.208, 'eval_steps_per_second': 0.517, 'epoch': 1.0}
{'loss': 2.1445, 'learning_rate': 1.3248945147679326e-05, 'epoch': 1.01}
{'loss': 2.0676, 'learning_rate': 1.3108298171589311e-05, 'epoch': 1.03}


Saving model checkpoint to ./checkpoint-500
Configuration saved in ./checkpoint-500/config.json
Configuration saved in ./checkpoint-500/generation_config.json


{'loss': 2.0182, 'learning_rate': 1.2967651195499298e-05, 'epoch': 1.05}


Model weights saved in ./checkpoint-500/pytorch_model.bin
Deleting older checkpoint [checkpoint-1500] due to args.save_total_limit


{'loss': 2.147, 'learning_rate': 1.2827004219409284e-05, 'epoch': 1.08}
{'loss': 2.1259, 'learning_rate': 1.2686357243319269e-05, 'epoch': 1.1}
{'loss': 2.1352, 'learning_rate': 1.2545710267229257e-05, 'epoch': 1.12}
{'loss': 2.1779, 'learning_rate': 1.240506329113924e-05, 'epoch': 1.14}
{'loss': 2.1596, 'learning_rate': 1.2264416315049229e-05, 'epoch': 1.16}
{'loss': 2.0971, 'learning_rate': 1.2123769338959213e-05, 'epoch': 1.18}
{'loss': 2.0266, 'learning_rate': 1.19831223628692e-05, 'epoch': 1.2}
{'loss': 2.0923, 'learning_rate': 1.1842475386779185e-05, 'epoch': 1.22}
{'loss': 2.0299, 'learning_rate': 1.1701828410689171e-05, 'epoch': 1.24}
{'loss': 2.1248, 'learning_rate': 1.1561181434599158e-05, 'epoch': 1.27}
{'loss': 2.0295, 'learning_rate': 1.1420534458509143e-05, 'epoch': 1.29}
{'loss': 1.9701, 'learning_rate': 1.127988748241913e-05, 'epoch': 1.31}
{'loss': 1.9859, 'learning_rate': 1.1139240506329114e-05, 'epoch': 1.33}
{'loss': 2.1802, 'learning_rate': 1.09985935302391e-05, 'e

***** Running Evaluation *****
  Num examples = 842
  Batch size = 16


  0%|          | 0/53 [00:00<?, ?it/s]

{'eval_loss': 1.8485240936279297, 'eval_runtime': 69.3184, 'eval_samples_per_second': 12.147, 'eval_steps_per_second': 0.765, 'epoch': 2.0}
{'loss': 1.8902, 'learning_rate': 6.638537271448664e-06, 'epoch': 2.0}
{'loss': 2.039, 'learning_rate': 6.49789029535865e-06, 'epoch': 2.03}
{'loss': 1.8949, 'learning_rate': 6.3572433192686365e-06, 'epoch': 2.05}
{'loss': 1.839, 'learning_rate': 6.216596343178622e-06, 'epoch': 2.07}
{'loss': 1.9508, 'learning_rate': 6.075949367088608e-06, 'epoch': 2.09}


Saving model checkpoint to ./checkpoint-1000
Configuration saved in ./checkpoint-1000/config.json
Configuration saved in ./checkpoint-1000/generation_config.json


{'loss': 1.8642, 'learning_rate': 5.935302390998594e-06, 'epoch': 2.11}


Model weights saved in ./checkpoint-1000/pytorch_model.bin
Deleting older checkpoint [checkpoint-500] due to args.save_total_limit


{'loss': 1.9947, 'learning_rate': 5.79465541490858e-06, 'epoch': 2.13}
{'loss': 1.8101, 'learning_rate': 5.654008438818566e-06, 'epoch': 2.15}
{'loss': 1.9423, 'learning_rate': 5.5133614627285514e-06, 'epoch': 2.17}
{'loss': 2.0231, 'learning_rate': 5.372714486638537e-06, 'epoch': 2.19}
{'loss': 1.9954, 'learning_rate': 5.2320675105485245e-06, 'epoch': 2.22}
{'loss': 1.831, 'learning_rate': 5.09142053445851e-06, 'epoch': 2.24}
{'loss': 1.8362, 'learning_rate': 4.950773558368496e-06, 'epoch': 2.26}
{'loss': 2.0137, 'learning_rate': 4.8101265822784815e-06, 'epoch': 2.28}
{'loss': 1.8717, 'learning_rate': 4.669479606188467e-06, 'epoch': 2.3}
{'loss': 2.04, 'learning_rate': 4.528832630098453e-06, 'epoch': 2.32}
{'loss': 1.9845, 'learning_rate': 4.3881856540084394e-06, 'epoch': 2.34}
{'loss': 1.974, 'learning_rate': 4.247538677918425e-06, 'epoch': 2.36}
{'loss': 1.945, 'learning_rate': 4.106891701828411e-06, 'epoch': 2.38}
{'loss': 1.8957, 'learning_rate': 3.9662447257383965e-06, 'epoch': 2

***** Running Evaluation *****
  Num examples = 842
  Batch size = 16


  0%|          | 0/53 [00:00<?, ?it/s]



Training completed. Do not forget to share your model on huggingface.co/models =)




{'eval_loss': 1.8132768869400024, 'eval_runtime': 69.5561, 'eval_samples_per_second': 12.105, 'eval_steps_per_second': 0.762, 'epoch': 3.0}
{'train_runtime': 7967.0373, 'train_samples_per_second': 2.854, 'train_steps_per_second': 0.178, 'train_loss': 2.168512526443739, 'epoch': 3.0}


TrainOutput(global_step=1422, training_loss=2.168512526443739, metrics={'train_runtime': 7967.0373, 'train_samples_per_second': 2.854, 'train_steps_per_second': 0.178, 'train_loss': 2.168512526443739, 'epoch': 3.0})

In [28]:
# Save the trained model
tokenizer.save_pretrained("fine-tuned-gpt2")
model.save_pretrained("fine-tuned-gpt2")

tokenizer config file saved in fine-tuned-gpt2/tokenizer_config.json
Special tokens file saved in fine-tuned-gpt2/special_tokens_map.json
Configuration saved in fine-tuned-gpt2/config.json
Configuration saved in fine-tuned-gpt2/generation_config.json
Model weights saved in fine-tuned-gpt2/pytorch_model.bin


In [7]:
# Load pre-trained and fine-tuned gpt2 models
pt_model_name = 'gpt2'
pt_model = AutoModelForCausalLM.from_pretrained(pt_model_name)
pt_tokenizer = AutoTokenizer.from_pretrained(pt_model_name)
ft_model_name = 'fine-tuned-gpt2'
ft_model = AutoModelForCausalLM.from_pretrained(ft_model_name)
ft_tokenizer = AutoTokenizer.from_pretrained(ft_model_name)

In [8]:
prompt = "My dad applied for the position of kindergarten teacher."

In [9]:
def generate_output(prompt, tokenizer, model):
    #Tokenize the prompt
    inputs = tokenizer(prompt, return_tensors="pt")
    input_ids = inputs["input_ids"]
    attention_mask = inputs["attention_mask"]
    #Generate text
    generated_text = model.generate(
        input_ids=input_ids,
        attention_mask=attention_mask,
        max_length=50,
        do_sample=True,
        top_p=0.95,
        top_k=50
    )
    output_text = tokenizer.decode(generated_text[0], skip_special_tokens=True)
    output = output_text.replace(prompt,'')
    return output

In [12]:
# prompt = "My dad applied for the position of kindergarten teacher."
prompts = [
    "My dad applied for the position of kindergarten teacher."
]
for prompt in prompts:
    print("Prompt:", prompt)
    pt_output = generate_output(prompt, pt_tokenizer, pt_model)
    print("Pre-trained GPT2 output:", pt_output)
    ft_output = generate_output(prompt, ft_tokenizer, ft_model)
    print("Fine-tuned GPT2 output:", ft_output)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Prompt: My dad applied for the position of kindergarten teacher.


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Pre-trained GPT2 output:  I loved my job, and my mom loved her. She was happy and content," she said.

"My dad didn't see anything wrong with me. He loved me. He just didn
Fine-tuned GPT2 output: 
youll need to register with the chartered institution of childcare psychologists
employers may ask you to go to a nursery and give a preneural induction if you show good verbal language skills

