# Custom Collators For Supervised Finetuning/Instruction Tuning LLMs

In [1]:
!pip install -q trl

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m226.7/226.7 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m309.4/309.4 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m547.8/547.8 kB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m102.4/102.4 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m21.3/21.3 MB[0m [31m25.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.8/40.8 MB[0m [31m10.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m10.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m64.9/64.9 kB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━

In [60]:
from transformers import AutoTokenizer, AutoModelForCausalLM
from trl import DataCollatorForCompletionOnlyLM
from typing import List

import torch

In [37]:
class CustomDataCollatorForCompletionOnlyLM(DataCollatorForCompletionOnlyLM):
    def __init__(self, response_template: str, tokenizer: AutoTokenizer, ignore_token_ids: List[int], ignore_tokens_mask_prob: float = 0.8, **kwargs):
        """
        A custom data collator that masks tokens before the response template and
        the ignore token ids after the response template. This is useful for classification tasks or tasks
        where the LM predicts a fixed/small number of tokens after the response template.

        Args:
            response_template (str): A string that indicates the start of an AI generated response.
            tokenizer (AutoTokenizer): The tokenizer used to tokenize the input text.
            ignore_token_ids (List[int]): A list of token ids that can be ignored by the model while computing the loss.
            ignore_tokens_mask_prob (float, optional): The probability with which an ignore token will be masked (i.e. loss is ignored).
                Must lie in [0, 1]. Defaults to 0.8.
            **kwargs: Extra keyword arguments forwarded unchanged to `DataCollatorForCompletionOnlyLM`
                (for example `instruction_template`, `mlm` or `ignore_index`).

        Raises:
            ValueError: If `ignore_tokens_mask_prob` is not a probability in [0, 1].
        """
        # Fail at construction time instead of deep inside torch.bernoulli during collation.
        if not 0.0 <= ignore_tokens_mask_prob <= 1.0:
            raise ValueError(
                f"ignore_tokens_mask_prob must be in [0, 1], got {ignore_tokens_mask_prob!r}"
            )
        super().__init__(tokenizer=tokenizer, response_template=response_template, **kwargs)
        # Force an integer dtype: torch.tensor([]) would otherwise infer float for an empty list.
        self.ignore_token_ids = torch.tensor(list(ignore_token_ids), dtype=torch.long)
        self.ignore_tokens_mask_prob = ignore_tokens_mask_prob

    def torch_call(self, examples: List[List[int]]):
        """
        Collate `examples` with the parent collator, then additionally mask ignore tokens in the labels.

        Every label whose id is in `ignore_token_ids` is replaced, independently with probability
        `ignore_tokens_mask_prob`, by the collator's ignore index so it does not contribute to the loss.

        Args:
            examples (List[List[int]]): The tokenized examples to collate.

        Returns:
            The batch produced by the parent collator, with `batch['labels']` modified in place.
        """
        batch = super().torch_call(examples)
        labels = batch['labels']
        # Use the ignore index configured on the parent collator when available; fall back to -100.
        ignore_index = getattr(self, "ignore_index", -100)
        # Draw one Bernoulli sample per label position with probability ignore_tokens_mask_prob.
        mask = torch.bernoulli(
            torch.full(labels.shape, self.ignore_tokens_mask_prob, dtype=torch.float, device=labels.device)
        ).bool()
        # Find the positions of the ignore tokens in the labels tensor.
        ignore_token_positions = torch.isin(labels, self.ignore_token_ids.to(labels.device))
        # Drop the selected ignore tokens from the loss computation.
        labels[mask & ignore_token_positions] = ignore_index
        return batch


In [6]:
# Log in to the Hugging Face Hub; the CLI prompts for the access token interactively,
# so the token is never written into the notebook itself.
!huggingface-cli login


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    To login, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Enter your token (input will not be visible): 
Add token as git credential? (Y/n) n
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


Let's load our tokenizer and create a sample input.

In [70]:
# Load the tokenizer of the TinyLlama chat checkpoint; its chat template is used below
# to format the sample conversation.
tokenizer = AutoTokenizer.from_pretrained("TinyLlama/TinyLlama-1.1B-Chat-v1.0")

tokenizer_config.json:   0%|          | 0.00/1.29k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/551 [00:00<?, ?B/s]

In [71]:
# A minimal three-turn conversation: the system prompt restricts the answer to YES/NO,
# the user asks a question and the assistant replies with a single word.
sample_chat = [
    dict(role="system", content="Respond with YES/NO only."),
    dict(role="user", content="Is London the capital of England?"),
    dict(role="assistant", content="YES"),
]

In [87]:
# Tokenize the chat with the tokenizer's chat template, then decode the ids back to text
# to inspect the exact formatted string.
tokenizer.decode(tokenizer.apply_chat_template(sample_chat))

'<|system|>\nRespond with YES/NO only.</s> \n<|user|>\nIs London the capital of England?</s> \n<|assistant|>\nYES</s> \n'

Notice all the tokens after the response "YES". These tokens are independent of our input documents and can therefore be learned easily
by our LLM. The ease with which they can be predicted has an adverse effect on the loss for the "YES/NO" tokens that the model needs to learn
to predict, because the trainer averages the loss over all tokens in the sequence.

In [89]:
# Let's ignore the tokens after "YES/NO" above.
# Tokenizing the trailing text gives the ids we want to mask. The tokenizer also prepends
# the <s> special token here, so the first id in the result is not one of the trailing tokens.
ignore_tokens = tokenizer("</s>\n")["input_ids"]

In [90]:
ignore_tokens

[1, 2, 29871, 13]

In [7]:
# Notice that the <s> token is a special token added to the start of any input, so we don't need to consider this in our ignore tokens.
# NOTE(review): the saved output of this cell is a stale NameError from an out-of-order run;
# re-run it after the tokenizer has been loaded to see the token strings.
tokenizer.convert_ids_to_tokens(ignore_tokens)

NameError: name 'tokenizer' is not defined

In [92]:
# Look up the token string for id 29871, the third id in ignore_tokens.
tokenizer.convert_ids_to_tokens([29871])

['▁']

In [108]:
# The loss on all tokens before the assistant tag should be ignored.
response_template = "\n<|assistant|>"

# Custom collator: ignore_tokens[1:] drops the leading <s> id, and a mask probability
# of 1.0 means every remaining ignore token is always excluded from the loss.
custom_collator = CustomDataCollatorForCompletionOnlyLM(
    tokenizer=tokenizer,
    response_template=response_template,
    ignore_token_ids=ignore_tokens[1:],
    ignore_tokens_mask_prob=1.0,
)

# Baseline collator from trl, used for comparison.
collator = DataCollatorForCompletionOnlyLM(
    tokenizer=tokenizer,
    response_template=response_template,
)

In [109]:
# Tokenize the sample chat once and run the same single-example batch through both collators
# so their labels can be compared side by side.
sample_input = tokenizer.apply_chat_template(sample_chat)
collated_input = collator([sample_input])
custom_collated_input = custom_collator([sample_input])

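Notice how the standard `DataCollatorForCompletionOnlyLM` does not mask out the loss on the tokens at the end. Token 21143 corresponds to "YES"; however, the labels below show three other tokens on which the loss is also computed: the newline (13) that precedes "YES", and the `▁` (29871) and newline (13) that follow `</s>`. The `</s>` token (2) itself is already set to -100 in this output.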

In [110]:
collated_input

{'input_ids': tensor([[  529, 29989,  5205, 29989, 29958,    13,  1666,  2818,   411, 22483,
          29914,  6632,   871, 29889,     2, 29871,    13, 29966, 29989,  1792,
          29989, 29958,    13,  3624,  4517,   278,  7483,   310,  5408, 29973,
              2, 29871,    13, 29966, 29989,   465, 22137, 29989, 29958,    13,
          21143,     2, 29871,    13]]),
 'labels': tensor([[ -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
           -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
           -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
           -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,    13,
          21143,  -100, 29871,    13]])}

Our custom collator has the desired behavior and computes the loss only on the "YES" token.

In [111]:
custom_collated_input

{'input_ids': tensor([[  529, 29989,  5205, 29989, 29958,    13,  1666,  2818,   411, 22483,
          29914,  6632,   871, 29889,     2, 29871,    13, 29966, 29989,  1792,
          29989, 29958,    13,  3624,  4517,   278,  7483,   310,  5408, 29973,
              2, 29871,    13, 29966, 29989,   465, 22137, 29989, 29958,    13,
          21143,     2, 29871,    13]]),
 'labels': tensor([[ -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
           -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
           -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
           -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
          21143,  -100,  -100,  -100]])}

In [95]:
# Load the TinyLlama chat model so we can compare the loss obtained with each collator's labels.
model = AutoModelForCausalLM.from_pretrained("TinyLlama/TinyLlama-1.1B-Chat-v1.0")

config.json:   0%|          | 0.00/608 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.20G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

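The loss from our custom collator is much higher than that of the default `DataCollatorForCompletionOnlyLM`. This is because the default collator averages the loss over the extra tokens around the response (such as the trailing `▁` and newline), which the model can predict easily, whereas the custom collator computes the loss on the "YES" token alone.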

In [115]:
# Loss computed with the labels produced by the default collator.
model(**collated_input).loss

tensor(1.6897, grad_fn=<NllLossBackward0>)

In [116]:
# Loss computed with the labels produced by the custom collator.
model(**custom_collated_input).loss

tensor(6.7585, grad_fn=<NllLossBackward0>)