In [13]:
#!pip install transformers

# Finetuning GPT on Simpsons Episodes

This is an exercise in using Huggingface to finetune GPT.

In [2]:
from transformers import GPT2Tokenizer, TextDataset, DataCollatorForLanguageModeling, GPT2LMHeadModel, pipeline, \
                         Trainer, TrainingArguments

In [3]:
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

Downloading:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/665 [00:00<?, ?B/s]

In [4]:
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})

Using pad_token, but it is not set yet.


In [5]:
simpsons_data = TextDataset(
    tokenizer=tokenizer,
    file_path='simpsons.txt',  # Simpson episode summaries
    block_size=32  # length of each chunk of text to use as a datapoint
)



In [6]:
simpsons_data[0], simpsons_data[0].shape

(tensor([ 8905,  1797, 16820, 37977,  2538,    25, 34376,  5564,  9222,   319,
           281,  4946,  3764,   628,   198,  8905,  1797, 16820, 35683,    44,
         13153,    25,   220,   628,   628, 28440,   290,   337,  1376,  3708,
           284, 27874]), torch.Size([32]))

In [7]:
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=False,  # MLM is Masked Language Modelling
)

In [8]:
collator_example = data_collator([tokenizer('I am an input'), tokenizer('So am I')])

collator_example

{'input_ids': tensor([[   40,   716,   281,  5128],
        [ 2396,   716,   314, 50257]]), 'attention_mask': tensor([[1, 1, 1, 1],
        [1, 1, 1, 0]]), 'labels': tensor([[  40,  716,  281, 5128],
        [2396,  716,  314, -100]])}

In [9]:
model_checkpoint = 'gpt2'

model = GPT2LMHeadModel.from_pretrained(model_checkpoint)  # load up a GPT2 model

pretrained_generator = pipeline(
    'text-generation', model=model, tokenizer='gpt2',
    config={'max_length': 200, 'do_sample': True, 'top_p': 0.9, 'temperature': 0.7, 'top_k': 10}
)

Downloading:   0%|          | 0.00/548M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [10]:
training_args = TrainingArguments(
    output_dir="caffsean/gpt2-simpsons", #The output directory
    overwrite_output_dir=True, #overwrite the content of the output directory
    num_train_epochs=3, # number of training epochs
    per_device_train_batch_size=32, # batch size for training
    per_device_eval_batch_size=32,  # batch size for evaluation
    warmup_steps=len(simpsons_data.examples) // 5, # number of warmup steps for learning rate scheduler,
    logging_steps=50,
    load_best_model_at_end=True,
    evaluation_strategy='epoch',
    save_strategy='epoch',
    hub_token='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',
    push_to_hub=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=simpsons_data.examples[:int(len(simpsons_data.examples)*.8)],
    eval_dataset=simpsons_data.examples[int(len(simpsons_data.examples)*.8):]
)

trainer.evaluate()

Cloning https://huggingface.co/caffsean/gpt2-simpsons into local empty directory.


Download file pytorch_model.bin:   0%|          | 1.43k/487M [00:00<?, ?B/s]

Download file training_args.bin: 100%|##########| 3.37k/3.37k [00:00<?, ?B/s]

Clean file training_args.bin:  30%|##9       | 1.00k/3.37k [00:00<?, ?B/s]

Download file runs/Jan16_23-55-04_0e4e18baf59b/events.out.tfevents.1673913342.0e4e18baf59b.448.0:  25%|##4    …

Clean file runs/Jan16_23-55-04_0e4e18baf59b/events.out.tfevents.1673913342.0e4e18baf59b.448.0:   7%|7         …

Download file runs/Jan16_23-55-04_0e4e18baf59b/1673913357.1253624/events.out.tfevents.1673913357.0e4e18baf59b.…

Clean file runs/Jan16_23-55-04_0e4e18baf59b/1673913357.1253624/events.out.tfevents.1673913357.0e4e18baf59b.448…

Clean file pytorch_model.bin:   0%|          | 1.00k/487M [00:00<?, ?B/s]

***** Running Evaluation *****
  Num examples = 7449
  Batch size = 32


{'eval_loss': 4.73050594329834,
 'eval_runtime': 21.7074,
 'eval_samples_per_second': 343.155,
 'eval_steps_per_second': 10.734}

In [11]:
trainer.train()

***** Running training *****
  Num examples = 29793
  Num Epochs = 3
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 2796
  Number of trainable parameters = 124439808


Epoch,Training Loss,Validation Loss
1,3.3683,3.433849
2,3.14,3.308241
3,3.0399,3.255533


***** Running Evaluation *****
  Num examples = 7449
  Batch size = 32


Saving model checkpoint to caffsean/gpt2-simpsons/checkpoint-932
Configuration saved in caffsean/gpt2-simpsons/checkpoint-932/config.json
Model weights saved in caffsean/gpt2-simpsons/checkpoint-932/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 7449
  Batch size = 32
Saving model checkpoint to caffsean/gpt2-simpsons/checkpoint-1864
Configuration saved in caffsean/gpt2-simpsons/checkpoint-1864/config.json
Model weights saved in caffsean/gpt2-simpsons/checkpoint-1864/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 7449
  Batch size = 32
Saving model checkpoint to caffsean/gpt2-simpsons/checkpoint-2796
Configuration saved in caffsean/gpt2-simpsons/checkpoint-2796/config.json
Model weights saved in caffsean/gpt2-simpsons/checkpoint-2796/pytorch_model.bin


Training completed. Do not forget to share your model on huggingface.co/models =)


Loading best model from caffsean/gpt2-simpsons/checkpoint-2796 (score: 3.25553297996521).


TrainOutput(global_step=2796, training_loss=3.3700372644077894, metrics={'train_runtime': 939.8141, 'train_samples_per_second': 95.103, 'train_steps_per_second': 2.975, 'total_flos': 1459626283008000.0, 'train_loss': 3.3700372644077894, 'epoch': 3.0})

In [12]:
trainer.push_to_hub()

Saving model checkpoint to caffsean/gpt2-simpsons
Configuration saved in caffsean/gpt2-simpsons/config.json
Model weights saved in caffsean/gpt2-simpsons/pytorch_model.bin
Several commits (2) will be pushed upstream.
The progress bars may be unreliable.


Upload file pytorch_model.bin:   0%|          | 3.30k/487M [00:00<?, ?B/s]

Upload file runs/Jan17_16-28-13_69cd243a711a/events.out.tfevents.1673973137.69cd243a711a.340.0:  24%|##4      …

remote: Scanning LFS files for validity, may be slow...        
remote: LFS file scan complete.        
To https://huggingface.co/caffsean/gpt2-simpsons
   3166e76..3ca6644  main -> main

remote: LFS file scan complete.        
To https://huggingface.co/caffsean/gpt2-simpsons
   3166e76..3ca6644  main -> main

Dropping the following result as it does not have all the necessary fields:
{'task': {'name': 'Causal Language Modeling', 'type': 'text-generation'}}
To https://huggingface.co/caffsean/gpt2-simpsons
   3ca6644..980cc31  main -> main

   3ca6644..980cc31  main -> main



'https://huggingface.co/caffsean/gpt2-simpsons/commit/3ca6644dde4a568f1d95d6e3bed2be6cb13d7024'