# Supervised Fine-Tuning (SFT) of DistilGPT-2 to Make a Physics Haiku Bot

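Requires (the code below opens both files by bare filename, so copy or upload them into the notebook's working directory first):
- train_data.jsonl from data/train/
- eval_data.jsonl from data/eval/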

In [1]:
# install for Colab use
!pip -q install transformers datasets accelerate evaluate trl

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/84.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/518.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m518.9/518.9 kB[0m [31m23.4 MB/s[0m eta [36m0:00:00[0m
[?25h

First, we create the Datasets for training and evaluation from the pre-formatted, good haiku data.

In [2]:
import json
from datasets import Dataset, DatasetDict, disable_progress_bars

def jsonl_to_list(filename):
    """Read a UTF-8 JSONL file and return its parsed records in file order.

    Each non-blank line is decoded with ``json.loads``; lines that contain only
    whitespace are skipped. In this notebook every record is one haiku sample.
    """
    records = []
    with open(filename, "r", encoding="utf-8") as handle:
        for raw_line in handle:
            if not raw_line.strip():
                continue  # blank / whitespace-only line, nothing to parse
            records.append(json.loads(raw_line))
    return records

# Bundle both splits in one DatasetDict so later steps can process them together.
ds = DatasetDict({
    "train": Dataset.from_list(jsonl_to_list("train_data.jsonl")),  # 1526 good haikus for training
    "eval": Dataset.from_list(jsonl_to_list("eval_data.jsonl")),  # 157 haikus for evaluation
})

Next, we tokenize the datasets.

In [None]:
import torch
from transformers import AutoTokenizer, DataCollatorWithPadding
from transformers.utils import logging
logging.disable_progress_bar() # progress bars don't render well in IDE / GitHub

# Name of the base checkpoint; the tokenizer here and the model weights later are both loaded from it.
base_name = "distilgpt2"
tokenizer = AutoTokenizer.from_pretrained(base_name)
if tokenizer.pad_token is None: # batching needs a pad token; reuse EOS when the tokenizer defines none
    tokenizer.pad_token = tokenizer.eos_token

# NOTE: no collator is created here on purpose. DataCollatorWithPadding does not work with
# the hand-built `labels` produced below, so the collator actually used for training is the
# custom CollatorPadLabelsToIgnoreIndex defined in a later cell.


def tokenize_and_mask(batch):
    """Tokenize prompt/response pairs and mask the prompt out of the loss.

    Called by ``map`` with ``batched=True``, so ``batch`` is a dict of lists with
    (at least) the keys ``"prompt"`` and ``"response"``. Uses the module-level
    ``tokenizer``.

    For every pair the text ``<prompt>\\n<response><eos>`` is tokenized (truncated
    to 256 tokens). ``labels`` is a copy of ``input_ids`` in which the leading
    tokens belonging to ``<prompt>\\n`` are replaced by -100, so only the response
    (and EOS) tokens contribute to the loss.

    Returns a dict of lists with keys ``"input_ids"``, ``"attention_mask"`` and
    ``"labels"`` (unpadded, one entry per example).
    """
    encoded = {
        "input_ids": [],
        "attention_mask": [],
        "labels": [],
    }

    for raw_prompt, raw_response in zip(batch["prompt"], batch["response"]):
        # The prefix is the prompt plus the separating newline; its token count
        # tells us how many leading positions of the full sequence to mask.
        prefix_text = raw_prompt.strip() + "\n"
        sequence_text = prefix_text + raw_response.strip() + tokenizer.eos_token

        sequence_enc = tokenizer(sequence_text, truncation=True, max_length=256)
        prefix_enc = tokenizer(prefix_text, truncation=True, max_length=256)

        token_ids = sequence_enc["input_ids"]
        # Never mask more positions than the (possibly truncated) sequence has.
        n_masked = min(len(prefix_enc["input_ids"]), len(token_ids))

        # -100 is CrossEntropyLoss's default ignore_index, so masked positions add no loss.
        masked_labels = [-100] * n_masked + token_ids[n_masked:]

        encoded["input_ids"].append(token_ids)
        encoded["attention_mask"].append(sequence_enc["attention_mask"])
        encoded["labels"].append(masked_labels)

    return encoded

# `datasets` progress bars don't render well in IDE / GitHub, so switch them off before mapping.
disable_progress_bars()

# Tokenize every split in batches and drop the original columns of the train split,
# leaving only what tokenize_and_mask returns.
tok = ds.map(
    function=tokenize_and_mask,
    remove_columns=ds["train"].column_names,
    batched=True,
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


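`DataCollatorForLanguageModeling` and `DataCollatorWithPadding` don't work out of the box here, because we compute the loss only on the response portion of each example and therefore build the `labels` ourselves. The former would overwrite our previously set prompt ignore labels (the `-100` values), while the latter throws errors because it pads the model inputs but not our variable-length `labels`. A custom collator is simpler.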

In [4]:
class CollatorPadLabelsToIgnoreIndex:
    """
    Data collator for examples that already carry hand-built ``labels``.

    Right-pads every example in a batch to the length of the longest ``input_ids``
    in that batch and returns ``torch.long`` tensors:

    - ``input_ids`` are padded with the tokenizer's ``pad_token_id``
    - ``attention_mask`` is padded with 0
    - ``labels`` are padded with -100 (CrossEntropyLoss's default ignore_index)
    """
    def __init__(self, tokenizer):
        # Only `pad_token_id` is read from the tokenizer (see __call__).
        self.tokenizer = tokenizer

    def __call__(self, batch):
        # Target length = longest sample in this batch.
        target_len = max(len(sample["input_ids"]) for sample in batch)

        # (feature name, value used to fill the padded positions)
        fill_values = (
            ("input_ids", self.tokenizer.pad_token_id),
            ("attention_mask", 0),
            ("labels", -100),
        )

        # The amount of padding is always derived from the sample's `input_ids` length.
        return {
            key: torch.tensor(
                [
                    sample[key] + [fill] * (target_len - len(sample["input_ids"]))
                    for sample in batch
                ],
                dtype=torch.long,
            )
            for key, fill in fill_values
        }

# Instantiate the custom collator with our tokenizer; this is the object passed to the Trainer as `data_collator`.
collator = CollatorPadLabelsToIgnoreIndex(tokenizer)

Finally, we load in the base model, set the training settings, and get to training!

In [None]:
from transformers import AutoModelForCausalLM, Trainer, TrainingArguments, EarlyStoppingCallback

# Use the GPU when torch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Starting point: DistilGPT-2 loaded with a causal LM head.
sft_model = AutoModelForCausalLM.from_pretrained(base_name)

# Align the special-token ids of both the model config and the generation config with the tokenizer.
sft_model.config.pad_token_id = sft_model.generation_config.pad_token_id = tokenizer.pad_token_id
sft_model.config.eos_token_id = sft_model.generation_config.eos_token_id = tokenizer.eos_token_id

# GPT-2/DistilGPT-2 often uses EOS as BOS; copy the BOS id over only when the tokenizer defines one.
if tokenizer.bos_token_id is not None:
    sft_model.config.bos_token_id = sft_model.generation_config.bos_token_id = tokenizer.bos_token_id

sft_model.to(device)

sft_args = TrainingArguments(
    # --- where checkpoints go / where logs are reported ---
    output_dir="haiku_bot",
    report_to="none",  # don't send logs anywhere

    # --- optimisation ---
    num_train_epochs=10,
    learning_rate=2e-5,  # early overfitting for 5e-5
    per_device_train_batch_size=8,
    gradient_accumulation_steps=2,  # 8 x 2 = 16 samples per optimizer step on each device
    fp16=torch.cuda.is_available(),  # mixed precision only when a GPU is present

    # --- evaluation, logging and checkpointing all share the same 25-step cadence ---
    eval_strategy="steps",
    eval_steps=25,
    per_device_eval_batch_size=8,
    logging_steps=25,
    save_steps=25,
    save_total_limit=2,

    # --- model selection: reload the checkpoint with the best eval loss when training ends ---
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
)

sft_trainer = Trainer(
    model=sft_model,
    args=sft_args,
    data_collator=collator,  # custom collator that pads labels with -100
    processing_class=tokenizer,
    train_dataset=tok["train"],
    eval_dataset=tok["eval"],
    # Early stopping with a patience of 2 - don't waste free GPU time!
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)],
)

# Kick off fine-tuning; as the last expression of the cell, the returned TrainOutput is displayed.
sft_trainer.train()

`loss_type=None` was set in the config but it is unrecognized. Using the default loss: `ForCausalLMLoss`.


Step,Training Loss,Validation Loss
25,3.9362,3.05667
50,3.0929,2.901514
75,2.9101,2.842354
100,2.8508,2.775371
125,2.6957,2.743392
150,2.6851,2.747133
175,2.6574,2.720445
200,2.575,2.724077
225,2.517,2.722208


There were missing keys in the checkpoint model loaded: ['lm_head.weight'].


TrainOutput(global_step=225, training_loss=2.8800079684787327, metrics={'train_runtime': 103.4656, 'train_samples_per_second': 147.489, 'train_steps_per_second': 9.278, 'total_flos': 46009662603264.0, 'train_loss': 2.8800079684787327, 'epoch': 2.345549738219895})

In [6]:
# Save the trainer's model to its own directory, separate from the "haiku_bot"
# checkpoint directory used during training.
# NOTE(review): since load_best_model_at_end=True, this is presumably the best-eval-loss
# checkpoint rather than the last step; confirm whether the tokenizer is written too.
save_dir = "./haiku_bot_final"
sft_trainer.save_model(save_dir)