In [None]:
!pip install transformers

# Finetuning GPT on Economist Articles

This is an exercise in using Huggingface to finetune GPT.

In [2]:
from transformers import GPT2Tokenizer, TextDataset, DataCollatorForLanguageModeling, GPT2LMHeadModel, pipeline, \
                         Trainer, TrainingArguments

In [3]:
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

Downloading:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/665 [00:00<?, ?B/s]

In [4]:
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})

Using pad_token, but it is not set yet.


In [5]:
economist_data = TextDataset(
    tokenizer=tokenizer,
    file_path='economist_training_data.txt',  # Simpson episode summaries
    block_size=32  # length of each chunk of text to use as a datapoint
)



In [6]:
economist_data[0], economist_data[0].shape

(tensor([ 2943,  1340,  2662,  8808, 48301, 28662,  2149,    25, 49463,   628,
           198, 32541, 16289, 30076,    25, 24903,   281,   435,    76, 14400,
          3034,  6380,    13,   554,  3269, 10681,   373,   379,   220,   198,
            64,  1936]), torch.Size([32]))

In [7]:
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=False,  # MLM is Masked Language Modelling
)

In [8]:
collator_example = data_collator([tokenizer('I am an input'), tokenizer('So am I')])

collator_example

{'input_ids': tensor([[   40,   716,   281,  5128],
        [ 2396,   716,   314, 50257]]), 'attention_mask': tensor([[1, 1, 1, 1],
        [1, 1, 1, 0]]), 'labels': tensor([[  40,  716,  281, 5128],
        [2396,  716,  314, -100]])}

In [9]:
model_checkpoint = 'gpt2'

model = GPT2LMHeadModel.from_pretrained(model_checkpoint)  # load up a GPT2 model

pretrained_generator = pipeline(
    'text-generation', model=model, tokenizer='gpt2',
    config={'max_length': 200, 'do_sample': True, 'top_p': 0.9, 'temperature': 0.7, 'top_k': 10}
)

Downloading:   0%|          | 0.00/548M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [10]:
training_args = TrainingArguments(
    output_dir="caffsean/gpt2-the-economist", #The output directory
    overwrite_output_dir=True, #overwrite the content of the output directory
    num_train_epochs=5, # number of training epochs
    per_device_train_batch_size=32, # batch size for training
    per_device_eval_batch_size=32,  # batch size for evaluation
    warmup_steps=len(economist_data.examples) // 5, # number of warmup steps for learning rate scheduler,
    logging_steps=50,
    load_best_model_at_end=True,
    evaluation_strategy='epoch',
    save_strategy='epoch',
    hub_token='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',
    push_to_hub=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=economist_data.examples[:int(len(economist_data.examples)*.8)],
    eval_dataset=economist_data.examples[int(len(economist_data.examples)*.8):]
)

trainer.evaluate()

Cloning https://huggingface.co/caffsean/gpt2-the-economist into local empty directory.
***** Running Evaluation *****
  Num examples = 9821
  Batch size = 32


{'eval_loss': 5.7519211769104,
 'eval_runtime': 27.7917,
 'eval_samples_per_second': 353.379,
 'eval_steps_per_second': 11.046}

In [11]:
trainer.train()

***** Running training *****
  Num examples = 39280
  Num Epochs = 5
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 6140
  Number of trainable parameters = 124439808


Epoch,Training Loss,Validation Loss
1,3.8737,3.796029
2,3.6767,3.654353
3,3.5561,3.594771
4,3.431,3.549481
5,3.3127,3.528477


***** Running Evaluation *****
  Num examples = 9821
  Batch size = 32


Saving model checkpoint to caffsean/gpt2-the-economist/checkpoint-1228
Configuration saved in caffsean/gpt2-the-economist/checkpoint-1228/config.json
Model weights saved in caffsean/gpt2-the-economist/checkpoint-1228/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 9821
  Batch size = 32
Saving model checkpoint to caffsean/gpt2-the-economist/checkpoint-2456
Configuration saved in caffsean/gpt2-the-economist/checkpoint-2456/config.json
Model weights saved in caffsean/gpt2-the-economist/checkpoint-2456/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 9821
  Batch size = 32
Saving model checkpoint to caffsean/gpt2-the-economist/checkpoint-3684
Configuration saved in caffsean/gpt2-the-economist/checkpoint-3684/config.json
Model weights saved in caffsean/gpt2-the-economist/checkpoint-3684/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 9821
  Batch size = 32
Saving model checkpoint to caffsean/gpt2-the-economist/checkpoint-4912
Configurati

TrainOutput(global_step=6140, training_loss=3.69785606309723, metrics={'train_runtime': 2156.3223, 'train_samples_per_second': 91.081, 'train_steps_per_second': 2.847, 'total_flos': 3207359692800000.0, 'train_loss': 3.69785606309723, 'epoch': 5.0})

In [12]:
trainer.push_to_hub()

Saving model checkpoint to caffsean/gpt2-the-economist
Configuration saved in caffsean/gpt2-the-economist/config.json
Model weights saved in caffsean/gpt2-the-economist/pytorch_model.bin
Several commits (2) will be pushed upstream.
The progress bars may be unreliable.


Upload file pytorch_model.bin:   0%|          | 32.0k/487M [00:00<?, ?B/s]

Upload file runs/Jan18_07-35-38_f023be7e1adc/events.out.tfevents.1674027380.f023be7e1adc.132.0: 100%|#########…

remote: Scanning LFS files for validity, may be slow...        
remote: LFS file scan complete.        
To https://huggingface.co/caffsean/gpt2-the-economist
   0c7bc63..605e92c  main -> main

remote: LFS file scan complete.        
To https://huggingface.co/caffsean/gpt2-the-economist
   0c7bc63..605e92c  main -> main

Dropping the following result as it does not have all the necessary fields:
{'task': {'name': 'Causal Language Modeling', 'type': 'text-generation'}}
To https://huggingface.co/caffsean/gpt2-the-economist
   605e92c..14011cd  main -> main

   605e92c..14011cd  main -> main



'https://huggingface.co/caffsean/gpt2-the-economist/commit/605e92c62cd6ef6497c19e449f0314c77add55d3'