# Supervised Fine-Tuning (SFT) of DistilGPT-2 to Make a Physics Haiku Bot

Requires:
- train_haikus.jsonl
- eval_haikus.jsonl

In [1]:
# install for Colab use
!pip -q install transformers datasets accelerate evaluate pronouncing syllables trl

  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m518.9/518.9 kB[0m [31m17.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m939.7/939.7 kB[0m [31m20.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for pronouncing (setup.py) ... [?25l[?25hdone


First, we create the Datasets for training and evaluation from the pre-formatted, good haiku data.

In [2]:
import json
from datasets import Dataset, DatasetDict

def jsonl_to_list(filename):
    """Parse a JSON Lines file into a list holding one decoded object per non-blank line.

    Args:
        filename: path of the UTF-8 encoded JSONL file to read.

    Returns:
        A list of the decoded JSON values, in file order. Lines that are empty or
        contain only whitespace are skipped.
    """
    samples = []
    with open(filename, "r", encoding="utf-8") as handle:
        for raw_line in handle:
            if not raw_line.strip():
                continue  # blank / whitespace-only line: nothing to decode
            samples.append(json.loads(raw_line))
    return samples

# Wrap each JSONL split in a Dataset and bundle both under named splits.
ds = DatasetDict(
    {
        "train": Dataset.from_list(jsonl_to_list("train_haikus.jsonl")),  # 1504 good haikus
        "eval": Dataset.from_list(jsonl_to_list("eval_haikus.jsonl")),  # 144 good haikus
    }
)

Next, we tokenize the datasets.

In [None]:
import torch
from transformers import AutoTokenizer, DataCollatorWithPadding

base_name = "distilgpt2"
tokenizer = AutoTokenizer.from_pretrained(base_name)

# Trainer needs a pad token for batching; fall back to EOS when the tokenizer defines none.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# NOTE: no data collator is created here. The `collator` actually passed to the Trainer
# is built further down from the custom collator class; a DataCollatorWithPadding
# instance assigned at this point was always overwritten before use.


def tokenize_and_mask(batch):
    """Tokenize prompt/response pairs and build labels that score only the response.

    Written for `Dataset.map(..., batched=True)`, so `batch` is a dict of lists with
    parallel "prompt" and "response" string columns.

    For every pair, the model input is `prompt + "\\n" + response + EOS` (both parts
    stripped first), truncated to 256 tokens. The labels are a copy of the input ids in
    which the leading positions covered by the tokenized `prompt + "\\n"` prefix are set
    to -100, the default `ignore_index` of CrossEntropyLoss, so only response tokens
    contribute to the loss.

    Returns:
        A dict with "input_ids", "attention_mask" and "labels", each a list holding one
        (unpadded) list per example.
    """
    encoded = {"input_ids": [], "attention_mask": [], "labels": []}

    for raw_prompt, raw_response in zip(batch["prompt"], batch["response"]):
        # The prefix keeps its trailing newline so the newline is masked along with
        # the prompt and the first response token stays unmasked.
        prompt_text = raw_prompt.strip() + "\n"
        full_text = prompt_text + raw_response.strip() + tokenizer.eos_token

        full_enc = tokenizer(full_text, truncation=True, max_length=256)
        prompt_enc = tokenizer(prompt_text, truncation=True, max_length=256)

        token_ids = full_enc["input_ids"]
        # Never mask past the end of the (possibly truncated) full sequence.
        n_masked = min(len(prompt_enc["input_ids"]), len(token_ids))
        masked_labels = [-100] * n_masked + token_ids[n_masked:]

        encoded["input_ids"].append(token_ids)
        encoded["attention_mask"].append(full_enc["attention_mask"])
        encoded["labels"].append(masked_labels)

    return encoded

# Tokenize every split in batches; the original text columns are dropped so that only
# the fields returned by `tokenize_and_mask` remain.
tok = ds.map(
    tokenize_and_mask,
    batched=True,
    remove_columns=ds["train"].column_names,
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/762 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Map:   0%|          | 0/1504 [00:00<?, ? examples/s]

Map:   0%|          | 0/144 [00:00<?, ? examples/s]

Neither `DataCollatorForLanguageModeling` nor `DataCollatorWithPadding` fits here, because we compute the loss only on the response part of each haiku. The former would overwrite the prompt positions we already marked as ignored in `labels`, while the latter throws errors on our pre-built `labels` column. A small custom collator is simpler.

In [None]:
class CollatorPadLabelsToIgnoreIndex:
    """
    Data collator that right-pads variable-length examples to the longest one in the batch.

    Needed because the labels are constructed manually (instead of by
    DataCollatorForLanguageModeling), so padding has to extend `labels` with the
    ignore value rather than with real token ids.

    Args:
        tokenizer: object exposing `pad_token_id`, the id used to pad `input_ids`.
        label_pad_token_id: value used to pad `labels`. Defaults to -100, the default
            `ignore_index` of CrossEntropyLoss, so padded positions add no loss.
    """
    def __init__(self, tokenizer, label_pad_token_id=-100):
        self.tokenizer = tokenizer
        self.label_pad_token_id = label_pad_token_id

    def __call__(self, batch):
        """Collate a list of examples into padded `torch.long` tensors.

        Args:
            batch: non-empty list of dicts, each with list-valued "input_ids",
                "attention_mask" and "labels" entries.

        Returns:
            Dict with "input_ids", "attention_mask" and "labels" tensors whose second
            dimension is the length of the longest example in the batch.

        Raises:
            ValueError: if `batch` is empty, or if padding is required but the
                tokenizer has no `pad_token_id`.
        """
        if not batch:
            raise ValueError("Cannot collate an empty batch.")

        max_len = max(len(ex["input_ids"]) for ex in batch)  # length of longest sample in batch
        pad_id = self.tokenizer.pad_token_id

        # Fail with a readable message instead of an opaque tensor-construction error.
        if pad_id is None and any(len(ex["input_ids"]) < max_len for ex in batch):
            raise ValueError(
                "tokenizer.pad_token_id is None; set a pad token before collating "
                "examples of different lengths."
            )

        input_ids = []
        attention_mask = []
        labels = []

        for ex in batch:
            pad_len = max_len - len(ex["input_ids"])
            input_ids.append(ex["input_ids"] + [pad_id] * pad_len)
            attention_mask.append(ex["attention_mask"] + [0] * pad_len)  # 0 = padding position
            labels.append(ex["labels"] + [self.label_pad_token_id] * pad_len)

        return {
            "input_ids": torch.tensor(input_ids, dtype=torch.long),
            "attention_mask": torch.tensor(attention_mask, dtype=torch.long),
            "labels": torch.tensor(labels, dtype=torch.long),
        }

# Instantiate the label-aware collator; the Trainer below receives it as `data_collator`.
collator = CollatorPadLabelsToIgnoreIndex(tokenizer=tokenizer)

Finally, we load the base model, configure the training arguments, and start training!

In [10]:
from transformers import AutoModelForCausalLM, DataCollatorForLanguageModeling, Trainer, TrainingArguments, EarlyStoppingCallback

# Use the GPU when one is visible to PyTorch; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Start from DistilGPT-2 with a causal LM head.
sft_model = AutoModelForCausalLM.from_pretrained(base_name)
# Record the EOS id as the padding id on the model config.
sft_model.config.pad_token_id = tokenizer.eos_token_id
sft_model.to(device)

sft_args = TrainingArguments(
    # --- bookkeeping ---
    output_dir="haiku_bot",
    report_to="none",  # don't send logs anywhere
    logging_steps=25,
    # --- optimisation ---
    num_train_epochs=10,
    learning_rate=2e-5,  # 5e-5 overfit early
    per_device_train_batch_size=8,
    gradient_accumulation_steps=2,  # 8 x 2 = 16 samples per optimizer step per device
    fp16=torch.cuda.is_available(),  # mixed precision only when CUDA is available
    # --- evaluation and checkpointing (same 25-step cadence for both) ---
    eval_strategy="steps",
    eval_steps=25,
    per_device_eval_batch_size=8,
    save_steps=25,
    save_total_limit=2,
    # --- model selection ---
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
)

# Wire the model, data, collator and early stopping into a Trainer.
sft_trainer = Trainer(
    model=sft_model,
    args=sft_args,
    train_dataset=tok["train"],
    eval_dataset=tok["eval"],
    processing_class=tokenizer,  # replaces the deprecated `tokenizer=` argument
    data_collator=collator,
    # Stop after 2 evaluations without improvement -- don't waste free GPU time!
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)],
)

# Let's do some training!
sft_trainer.train()

  sft_trainer=Trainer(
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'pad_token_id': 50256}.


Step,Training Loss,Validation Loss
25,3.9352,3.132473
50,3.1078,2.938739
75,2.9395,2.845661
100,2.8555,2.806916
125,2.705,2.780625
150,2.6935,2.768697
175,2.6377,2.75761
200,2.5911,2.747485
225,2.5155,2.753023
250,2.5039,2.765464


There were missing keys in the checkpoint model loaded: ['lm_head.weight'].


TrainOutput(global_step=250, training_loss=2.848478225708008, metrics={'train_runtime': 154.7198, 'train_samples_per_second': 97.208, 'train_steps_per_second': 6.075, 'total_flos': 50283293442048.0, 'train_loss': 2.848478225708008, 'epoch': 2.6595744680851063})

In [11]:
# Write the trainer's current model to a directory separate from the checkpoint folder.
save_dir = "./haiku_bot_final"
sft_trainer.save_model(output_dir=save_dir)