# Training GPT-Wee


Imports:

In [1]:
from tokenizers import (
    decoders,
    models,
    normalizers,
    pre_tokenizers,
    processors,
    trainers,
    Tokenizer)
from tqdm import tqdm 
from tokenizers.normalizers import Lowercase, Strip, StripAccents, NFD
from datasets import load_dataset
import torch

### Tokenizer

In [83]:
# Start from an untrained byte-pair-encoding (BPE) model; its vocabulary is
# learned by the tokenizer.train(...) call further down.
tokenizer = Tokenizer(models.BPE())

In [84]:
# Text clean-up steps, applied in the order listed.
normalizer = normalizers.Sequence(
    [
        NFD(),           # Unicode NFD decomposition
        Lowercase(),     # lower-case everything
        Strip(),         # strip surrounding whitespace
        StripAccents(),  # remove accent marks
    ]
)

In [85]:
# Attach the normalizer sequence defined above to the tokenizer.
tokenizer.normalizer = normalizer

In [86]:
# Byte-level pre-tokenization; add_prefix_space=False means no space is
# prepended to the start of the input.
tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=False)

In [88]:
# Corpus files the BPE tokenizer is trained on. The AO-CHILDES variant sits
# directly under babylm_data/; the remaining corpora live in babylm_100M/.
_tokenizer_corpora = (
    "bnc_spoken",
    "cbt",
    "children_stories",
    "gutenberg",
    "open_subtitles",
    "qed",
    "simple_wikipedia",
    "switchboard",
    "wikipedia",
)
textfiles = ["babylm_data/aochildes_with_few_interruptions.train"] + [
    f"babylm_data/babylm_100M/{corpus}.train" for corpus in _tokenizer_corpora
]

In [89]:
# BPE trainer targeting a vocabulary of 8000 entries, with "<|endoftext|>"
# registered as the only special token.
# NOTE(review): no initial_alphabet is passed, so byte symbols that never occur
# in the training files may be missing from the vocabulary - confirm whether
# pre_tokenizers.ByteLevel.alphabet() should be supplied.
trainer = trainers.BpeTrainer(vocab_size=8000, special_tokens=["<|endoftext|>"])

In [90]:
# Learn the BPE vocabulary from the corpus files listed in `textfiles`.
tokenizer.train(files = textfiles, trainer=trainer)






In [92]:
# Byte-level post-processor; trim_offsets=True asks it to trim whitespace from
# the reported token offsets.
tokenizer.post_processor = processors.ByteLevel(trim_offsets=True)

In [95]:
# Byte-level decoder, the counterpart of the byte-level pre-tokenizer set above.
tokenizer.decoder = decoders.ByteLevel()

In [97]:
from transformers import PreTrainedTokenizerFast

# Wrap the trained tokenizer for use with transformers. The single special
# token serves as both the beginning- and the end-of-sequence marker.
_boundary_token = "<|endoftext|>"
wrapped_tokenizer = PreTrainedTokenizerFast(
    tokenizer_object=tokenizer,
    bos_token=_boundary_token,
    eos_token=_boundary_token,
)

In [98]:
# Write the wrapped tokenizer to ./gpt-wee-tokenizer; the training section
# reloads it from this same directory name.
# NOTE(review): the recorded output of this cell lists
# gpt-wee-tokenizer-few-interruptions-v2/, so it appears to stem from a run
# with a different directory name - confirm which directory holds the
# intended tokenizer.
wrapped_tokenizer.save_pretrained("gpt-wee-tokenizer")

tokenizer config file saved in gpt-wee-tokenizer-few-interruptions-v2/tokenizer_config.json
Special tokens file saved in gpt-wee-tokenizer-few-interruptions-v2/special_tokens_map.json


('gpt-wee-tokenizer-few-interruptions-v2/tokenizer_config.json',
 'gpt-wee-tokenizer-few-interruptions-v2/special_tokens_map.json',
 'gpt-wee-tokenizer-few-interruptions-v2/tokenizer.json')

### Training 

Load tokenizer:

In [2]:
from transformers import AutoTokenizer
# Reload the tokenizer from the directory written in the tokenizer section.
# This rebinds `tokenizer`: from here on it is the transformers wrapper, not
# the raw `tokenizers.Tokenizer` built earlier.
tokenizer = AutoTokenizer.from_pretrained("gpt-wee-tokenizer")

2025-04-30 15:35:36.260685: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2025-04-30 15:35:36.866006: I tensorflow/core/util/port.cc:104] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-04-30 15:35:45.865892: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2025-04-30 15:35:45.866178: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] 

#### For regular learning:

Load files:

In [3]:
# Corpora present in both the training and the dev split, in loading order.
_shared_corpora = [
    "cbt",
    "children_stories",
    "gutenberg",
    "bnc_spoken",
    "open_subtitles",
    "qed",
    "simple_wikipedia",
    "switchboard",
    "wikipedia",
]

# Training data: the AO-CHILDES variant with few interruptions, followed by
# the babylm_100M corpora.
training_files = ["babylm_data/aochildes_with_few_interruptions.train"]
training_files += [
    f"babylm_data/babylm_100M/{corpus}.train" for corpus in _shared_corpora
]

# Validation data: the matching dev files (with the regular AO-CHILDES dev set).
eval_files = [
    f"babylm_data/babylm_dev/{corpus}.dev"
    for corpus in ["aochildes"] + _shared_corpora
]

In [4]:
# Load the plain-text corpora with the generic "text" loader, producing a
# "train" and a "validation" split.
raw_datasets = load_dataset(
    "text",
    data_files={"train": training_files, "validation": eval_files},
)

Using custom data configuration default-130374ec83266a28
Found cached dataset text (/home/sammartj/.cache/huggingface/datasets/text/default-130374ec83266a28/0.0.0/99cc88223027054f94ce0c7fd69d10eb172910fa0615671283a3c8e5e7af2f9c)


  0%|          | 0/2 [00:00<?, ?it/s]

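For curriculum learning (instead of the regular loading above): load the training data in `streaming` mode so that it is read progressively, in file order — a quick-and-dirty implementation of curriculum ordering. Note that running the cells below overwrites `training_files` and `raw_datasets` from the regular path.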

In [None]:
import os

# Curriculum-ordered training text. The location can be overridden with the
# GPT_WEE_ORDERED_TEXT environment variable; the default keeps the original
# absolute path, which only exists on the machine the notebook was written on.
# Rebinding `training_files` here replaces the regular file list defined
# earlier, so run this cell only for curriculum training.
training_files = [
    os.environ.get("GPT_WEE_ORDERED_TEXT", "/home/hamelbur/ordered_text.txt")
]

The ordered text file is produced by `sentence_scoring.ipynb`.

In [None]:
# Same loader as the regular path, but with streaming enabled so the files are
# read progressively rather than loaded up front.
raw_datasets = load_dataset(
    "text",
    data_files=dict(train=training_files, validation=eval_files),
    streaming=True,
)

In [None]:
# Display the loaded splits for a quick sanity check.
raw_datasets

### Create batches

In [5]:
# Maximum number of tokens per example: used as the truncation length in
# tokenize() and passed to the model configuration as n_ctx.
context_length = 128

In [6]:
from transformers import AutoTokenizer, GPT2LMHeadModel, AutoConfig

# Start from the stock GPT-2 configuration and shrink it to a tiny model:
# 2 layers, 2 attention heads, 128-dimensional embeddings, and a vocabulary
# sized to the custom tokenizer.
# NOTE(review): the trainable-parameter count in the recorded training log
# (1,551,872) appears consistent with GPT-2's default 1024 positions rather
# than 128, so `n_ctx` may not size the position embeddings in this
# transformers version and `n_positions` may be the intended argument -
# confirm before changing, since it alters the model shape.
config = AutoConfig.from_pretrained(
    "gpt2",
    vocab_size=len(tokenizer),
    n_ctx=context_length,
    bos_token_id=tokenizer.bos_token_id,
    eos_token_id=tokenizer.eos_token_id,
    n_embd = 128,
    n_layer = 2,
    n_head = 2,
)

# Build the model from the configuration (fresh weights, nothing pretrained is
# loaded). The Trainer cell further down passes `model`, which was otherwise
# never defined, so the notebook failed with a NameError on a fresh kernel.
model = GPT2LMHeadModel(config)


In [7]:
def tokenize(element):
    """Tokenize a batch of raw text lines for causal language-model training.

    Every entry of ``element["text"]`` becomes one example, truncated to at
    most ``context_length`` tokens. Shorter sequences are kept unchanged; no
    examples are dropped and overflowing tokens are discarded.

    Args:
        element: Batch handed over by ``map(..., batched=True)``; only its
            ``"text"`` field is read.

    Returns:
        A dict with the single key ``"input_ids"`` holding one list of token
        ids per input text.
    """
    outputs = tokenizer(
        element["text"],
        truncation=True,
        max_length=context_length,
        return_overflowing_tokens=False,
    )
    # The former per-sequence loop copied every entry unconditionally (its
    # length filter was commented out), so the ids are returned directly.
    return {"input_ids": list(outputs["input_ids"])}


# Tokenize every split in batches, dropping the original columns so that only
# "input_ids" remains.
_raw_columns = raw_datasets["train"].column_names
tokenized_datasets = raw_datasets.map(
    tokenize,
    batched=True,
    remove_columns=_raw_columns,
)
tokenized_datasets

Loading cached processed dataset at /home/sammartj/.cache/huggingface/datasets/text/default-130374ec83266a28/0.0.0/99cc88223027054f94ce0c7fd69d10eb172910fa0615671283a3c8e5e7af2f9c/cache-94755f435224b316.arrow


  0%|          | 0/179 [00:00<?, ?ba/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids'],
        num_rows: 2107266
    })
    validation: Dataset({
        features: ['input_ids'],
        num_rows: 178747
    })
})

In [9]:
from transformers import DataCollatorForLanguageModeling

# The tokenizer has no dedicated padding token, so the end-of-text token is
# reused for padding.
tokenizer.pad_token = tokenizer.eos_token

# mlm=False selects causal (next-token) language modelling rather than masked
# language modelling.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

In [13]:
from transformers import Trainer, TrainingArguments
from datasets import Dataset

# Hyper-parameters for the regular (non-curriculum) run.
args = TrainingArguments(
    output_dir="toy_model_outputs",
    # Batching: 32 sequences per device, gradients accumulated over 8 steps.
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    gradient_accumulation_steps=8,
    # Optimisation schedule.
    num_train_epochs=10,
    learning_rate=5e-4,
    lr_scheduler_type="cosine",
    warmup_steps=1_000,
    weight_decay=0.1,
    fp16=True,
    # Evaluate, log and checkpoint on the same 5,000-step cadence.
    evaluation_strategy="steps",
    eval_steps=5_000,
    logging_steps=5_000,
    save_steps=5_000,
    # Optional switches, left off for the regular run:
    #   use_mps_device=True  -> training on a Mac with Apple Silicon
    #   max_steps=2000       -> curriculum learning (or a quick test run)
)

trainer = Trainer(
    args=args,
    model=model,
    tokenizer=tokenizer,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
Using cuda_amp half precision backend


In [None]:
# Run training with the arguments configured above (10 epochs; evaluation,
# logging and checkpointing every 5,000 steps).
trainer.train()

***** Running training *****
  Num examples = 2107266
  Num Epochs = 10
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 1024
  Gradient Accumulation steps = 8
  Total optimization steps = 20580
  Number of trainable parameters = 1551872


Step,Training Loss,Validation Loss


In [None]:
# Save the trained model to the ./gpt-wee-model directory.
trainer.save_model('gpt-wee-model')