# Finetune a DistilGPT2 model on the squad Dataset

Task Description: Causal Language Modelling (CLM) is text generation. Given a prompt/source sequence, CLM will generate words to continue the source sequence.

Original Tutorial: https://huggingface.co/docs/transformers/tasks/language_modeling

In [1]:
!pip install -q transformers datasets evaluate accelerate

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/536.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m532.5/536.6 kB[0m [31m16.2 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m536.6/536.6 kB[0m [31m12.0 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/84.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m12.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m279.7/279.7 kB[0m [31m32.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m38.3/38.3 MB[0m [31m38.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m14.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━

# Load ELI5 dataset

In [4]:
from datasets import load_dataset

squad = load_dataset("squad", split = "train[:5000]")

In [5]:
# Split the dataset into a train and test set
squad = squad.train_test_split(test_size=0.2)

In [6]:
# Look at the data
import pprint
pprint.pprint(squad['train'][0])

# The text column is our model input


{'answers': {'answer_start': [147], 'text': ['IGN and Nintendo Power']},
 'context': 'Twilight Princess received the awards for Best Artistic Design, '
            'Best Original Score, and Best Use of Sound from IGN for its '
            'GameCube version. Both IGN and Nintendo Power gave Twilight '
            'Princess the awards for Best Graphics and Best Story. Twilight '
            'Princess received Game of the Year awards from GameTrailers, '
            '1UP.com, Electronic Gaming Monthly, Game Informer, Games Radar, '
            'GameSpy, Spacey Awards, X-Play and Nintendo Power. It was also '
            'given awards for Best Adventure Game from the Game Critics '
            'Awards, X-Play, IGN, GameTrailers, 1UP.com, and Nintendo Power. '
            'The game was considered the Best Console Game by the Game Critics '
            'Awards and GameSpy. The game placed 16th in Official Nintendo '
            "Magazine's list of the 100 Greatest Nintendo Games of All Time.

In [7]:
# Preprocessing
## Load Model
from transformers import AutoTokenizer

checkpoint = "distilgpt2"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

config.json:   0%|          | 0.00/762 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [55]:
# We can use Pytorch to check how the model expects input features
from transformers import AutoTokenizer, AutoModelForCausalLM

checkpoint = "distilgpt2"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForCausalLM.from_pretrained(checkpoint)
text = "Replace me by any text you'd like."

print("raw_text:\n", text)
encoded_input = tokenizer(text, return_tensors='pt')
print("encoded_input:\n",encoded_input)
# Our input has the structure
'''
{"input_ids": tensor([[]]), 'attention_mask': tensor([[]])}
'''

output = model(**encoded_input)
print("encoded_output info:\n", output[0].size())
print("encoded_output cross_attentions:\n", output[1][5][0].size(), "\n", output[1][5][0][0][0][0])

raw_text:
 Replace me by any text you'd like.
encoded_input:
 {'input_ids': tensor([[3041, 5372,  502,  416,  597, 2420,  345, 1549,  588,   13]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}
encoded_output info:
 torch.Size([1, 10, 50257])
encoded_output cross_attentions:
 torch.Size([1, 12, 10, 64]) 
 tensor([-1.6596, -0.4525, -0.4043,  0.4165, -0.3927,  0.3022, -1.3839,  0.1537,
        -0.3212,  0.4050,  0.0679,  1.5069, -0.7856,  0.0427, -0.0568, -0.0537,
        -0.1971,  0.6658,  0.3180,  0.0612, -0.0650,  0.3988, -0.1632, -0.4194,
        -0.0431,  0.4369, -0.1753,  0.2339,  0.3642, -0.0088, -0.6940, -0.1146,
        -0.5413,  0.3660,  1.9697, -0.1645, -0.0774,  0.0869,  0.1880,  0.7955,
        -0.0170,  0.3241, -1.1941,  0.2124,  0.0390,  0.5911,  0.4587, -0.1098,
         0.4401,  2.8913, -0.4153, -0.3047,  0.2053,  0.0062, -0.0647,  0.5646,
        -0.1924,  0.2178,  0.4539, -0.0444, -0.3561,  0.3013,  0.3733, -0.4454],
       grad_fn=<SelectBackward0>)


# Preprocessing
We need to create a preprocess function that we will apply to every instance in the dataset. The preprocess function needs to:

1. Flatten the instance so that the text column is easily accessible
2. Join any list of strings
3. Tokenize result

Some token sequences will be **longer** than the maximum input length for the model. Hence we use a second preprocessing function to:

1. concatenate all token sequences
2. Split the concatenated sequences into shorter chunks defined by a `block_size` parameter.

In [9]:
# The text field is nested so we need to flatten each instance
squad = squad.flatten()
pprint.pprint(squad['train'][0])

{'answers.answer_start': [147],
 'answers.text': ['IGN and Nintendo Power'],
 'context': 'Twilight Princess received the awards for Best Artistic Design, '
            'Best Original Score, and Best Use of Sound from IGN for its '
            'GameCube version. Both IGN and Nintendo Power gave Twilight '
            'Princess the awards for Best Graphics and Best Story. Twilight '
            'Princess received Game of the Year awards from GameTrailers, '
            '1UP.com, Electronic Gaming Monthly, Game Informer, Games Radar, '
            'GameSpy, Spacey Awards, X-Play and Nintendo Power. It was also '
            'given awards for Best Adventure Game from the Game Critics '
            'Awards, X-Play, IGN, GameTrailers, 1UP.com, and Nintendo Power. '
            'The game was considered the Best Console Game by the Game Critics '
            'Awards and GameSpy. The game placed 16th in Official Nintendo '
            "Magazine's list of the 100 Greatest Nintendo Games of All T

In [10]:
# Operation to apply to every instance
print(" ".join(squad['train']['answers.text'][0]), "\n")
print(tokenizer(" ".join(squad['train']['answers.text'][0])) )

IGN and Nintendo Power 

{'input_ids': [16284, 290, 9714, 4333], 'attention_mask': [1, 1, 1, 1]}


In [12]:
# Wrap in a preprocess function
def preprocess_function(examples):
  return tokenizer([" ".join(x) for x in examples["answers.text"]])

In [13]:
# Apply preprocessing over entire dataset - batched = True process multiple elements of the datasets
tokenized_squad = squad.map(preprocess_function, batched = True, num_proc=4, remove_columns=squad['train'].column_names)

Map (num_proc=4):   0%|          | 0/4000 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/1000 [00:00<?, ? examples/s]

In [14]:
tokenized_squad['train'][0]

{'input_ids': [16284, 290, 9714, 4333], 'attention_mask': [1, 1, 1, 1]}

In [15]:
def group_texts(examples, block_size: int = 128):
  # This function is to cut the length of the text examples

  # Concatencate all texts
  concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
  total_length = len(concatenated_examples[list(examples.keys())[0]])

  if total_length >= block_size:
    total_length = (total_length // block_size) * block_size
  # Split by chunks of block size
  result = {
      k: [t[i: i + block_size] for i in range(0, total_length, block_size)]
      for k, t in concatenated_examples.items()
  }
  result["labels"] = result["input_ids"].copy()
  return result


In [16]:
# Apply second preprocessing over entire dataset
lm_dataset = tokenized_squad.map(group_texts, batched=True, num_proc=4)

Map (num_proc=4):   0%|          | 0/4000 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/1000 [00:00<?, ? examples/s]

In [None]:
pprint.pprint(lm_dataset['train'][0])

In [17]:
# Create a batch of examples, with dynamic padding. Use the appropriate collator function
from transformers import DataCollatorForLanguageModeling

tokenizer.pad_token = tokenizer.eos_token
data_collator = DataCollatorForLanguageModeling(tokenizer = tokenizer, mlm = False)

# Train using the Trainer API
The main training steps are:

1. Define training hyperparameters using a model specific TrainingArguments function. At the end of each epoch, the Trainer will evaluate the defined loss metric and save the training checkpoint.

2. Pass the training arguments to a Trainer function alongside the model, dataset, tokenizer, data collator.

3. Call train() to finetune the model

In [18]:
from transformers import AutoModelForCausalLM, TrainingArguments, Trainer

model = AutoModelForCausalLM.from_pretrained(checkpoint)

In [19]:
training_args = TrainingArguments(
    output_dir = "squad_clm",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=2,
    fp16=True,
    push_to_hub=False,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=lm_dataset["train"],
    eval_dataset=lm_dataset["test"],
    data_collator=data_collator,
)

trainer.train()

You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss
1,No log,6.333879
2,No log,6.247723


TrainOutput(global_step=16, training_loss=6.6985015869140625, metrics={'train_runtime': 4.0655, 'train_samples_per_second': 56.574, 'train_steps_per_second': 3.936, 'total_flos': 7512281579520.0, 'train_loss': 6.6985015869140625, 'epoch': 2.0})

In [20]:
# Evaluate the fine tuned model and obtain the perplexity score
import math

eval_results = trainer.evaluate()
print(f"Perplexity: {math.exp(eval_results['eval_loss']):.2f}")

Perplexity: 516.83


In [21]:
trainer.save_model("squad_causal_model")

In [22]:
# In this case, the tokenizer was not saved automatically, save it manually in the model folder for inference
tokenizer.save_pretrained("squad_causal_model", legacy_format=False)

('squad_causal_modell/tokenizer_config.json',
 'squad_causal_modell/special_tokens_map.json',
 'squad_causal_modell/tokenizer.json')

# Inference

Use model for inference using a pipeline wrapper

In [23]:
prompt = "Sherlock Holmes burst into the apartment, out of breath and harried, he looked around anxiously and found"

In [24]:
from transformers import pipeline

generator = pipeline("text-generation", model = "squad_causal_model")
generator(prompt)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': 'Sherlock Holmes burst into the apartment, out of breath and harried, he looked around anxiously and found a few rooms in the upstairs apartment.\n\n\n\n"It\'s over now. It\'s not up to my little boy to fix'}]

In [27]:
# Inference Pipeline using Pytorch
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("squad_causal_model")
inputs = tokenizer(prompt, return_tensors="pt").input_ids

model = AutoModelForCausalLM.from_pretrained("squad_causal_model")

# Generate method is used to generate text
outputs = model.generate(inputs, max_new_tokens=100, do_sample=True, top_k=50, top_p=0.95)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


In [28]:
# Decode the generated token ids back into text
tokenizer.batch_decode(outputs, skip_special_tokens=True)

['Sherlock Holmes burst into the apartment, out of breath and harried, he looked around anxiously and found a pair of men in his room; the two men were asleep and their faces were in the same room. It was not a coincidence; he was an avid gambler who had won $1 million (in all) for an exotic casino company to be the first American to enter the U.S. after the Vietnam War, to win an Indian casino and win a national lottery. The only person that could make it through the system would have been the owner of an American lottery. But Holmes was']