In [1]:
from bs4 import BeautifulSoup
import requests
import os

url = "https://www.gutenberg.org/cache/epub/19994/pg19994.txt"
response = requests.get(url)
text = response.text

os.makedirs("data", exist_ok=True)

with open("data/train.txt", "w", encoding="utf-8") as f:
    f.write(text)

print("Downloaded small text file successfully!")


Downloaded small text file successfully!


In [2]:
! pip install transformers datasets accelerate sentencepiece evaluate


Collecting evaluate
  Downloading evaluate-0.4.6-py3-none-any.whl.metadata (9.5 kB)
Downloading evaluate-0.4.6-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.6


In [3]:
import os
from dataclasses import dataclass
from typing import Optional
import torch
from datasets import load_dataset
from transformers import (
    GPT2TokenizerFast,
    GPT2LMHeadModel,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
)

In [4]:
# 1. Load tokenizer and model
# -----------------------------
tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token  # GPT-2 has no padding token

model = GPT2LMHeadModel.from_pretrained("gpt2")
model.config.pad_token_id = tokenizer.eos_token_id


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [5]:
# 2. Load text dataset
# -----------------------------
dataset = load_dataset("text", data_files={"train": "data/train.txt"})["train"]


Generating train split: 0 examples [00:00, ? examples/s]

In [6]:
# 3. Tokenize dataset
# -----------------------------
def tokenize(batch):
    return tokenizer(batch["text"], truncation=True, padding="max_length", max_length=16)

tokenized_ds = dataset.map(tokenize, batched=True, remove_columns=["text"])
tokenized_ds.set_format(type="torch")

Map:   0%|          | 0/4959 [00:00<?, ? examples/s]

In [7]:
# 4. Data collator (for next-token prediction)
# -----------------------------
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,   # because GPT-2 is NOT a masked LM
)

In [8]:
# 5. Training settings
# -----------------------------
training_args = TrainingArguments(
    output_dir="outputs/simple-gpt2",
    overwrite_output_dir=True,
    num_train_epochs=2,
    per_device_train_batch_size=1,   # CPU-friendly
    logging_steps=20,
    save_steps=200,
    fp16=False,
)


In [9]:
# 6. Trainer
# -----------------------------
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_ds,
    tokenizer=tokenizer,
    data_collator=data_collator,
)


  trainer = Trainer(


In [10]:
# 7. Train
# -----------------------------
trainer.train()

The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'pad_token_id': 50256}.
  | |_| | '_ \/ _` / _` |  _/ -_)
[34m[1mwandb[0m: (1) Create a W&B account
[34m[1mwandb[0m: (2) Use an existing W&B account
[34m[1mwandb[0m: (3) Don't visualize my results
[34m[1mwandb[0m: Enter your choice:

 3


[34m[1mwandb[0m: You chose "Don't visualize my results"


`loss_type=None` was set in the config but it is unrecognized. Using the default loss: `ForCausalLMLoss`.


Step,Training Loss
20,3.1316
40,3.9395
60,4.1041
80,4.2095
100,3.5521
120,3.4169
140,3.507
160,3.3026
180,3.312
200,3.4284


TrainOutput(global_step=9918, training_loss=2.5892705782740126, metrics={'train_runtime': 4546.8987, 'train_samples_per_second': 2.181, 'train_steps_per_second': 2.181, 'total_flos': 80984199168000.0, 'train_loss': 2.5892705782740126, 'epoch': 2.0})

In [18]:

# 8. Save model
# -----------------------------
trainer.save_model("model/my_model")             # model files saved here
tokenizer.save_pretrained("model/my_tokenizer")  # tokenizer files saved here

print("\n🎉 Fine-tuning complete! Model saved in outputs/simple-gpt2")



🎉 Fine-tuning complete! Model saved in outputs/simple-gpt2


In [23]:
! zip -r my_finetuned_model.zip model/my_model model/my_tokenizer


  adding: model/my_model/ (stored 0%)
  adding: model/my_model/merges.txt (deflated 53%)
  adding: model/my_model/special_tokens_map.json (deflated 60%)
  adding: model/my_model/vocab.json (deflated 59%)
  adding: model/my_model/config.json (deflated 52%)
  adding: model/my_model/tokenizer.json (deflated 82%)
  adding: model/my_model/training_args.bin (deflated 53%)
  adding: model/my_model/model.safetensors (deflated 7%)
  adding: model/my_model/tokenizer_config.json (deflated 54%)
  adding: model/my_model/generation_config.json (deflated 31%)
  adding: model/my_tokenizer/ (stored 0%)
  adding: model/my_tokenizer/merges.txt (deflated 53%)
  adding: model/my_tokenizer/special_tokens_map.json (deflated 60%)
  adding: model/my_tokenizer/vocab.json (deflated 59%)
  adding: model/my_tokenizer/tokenizer.json (deflated 82%)
  adding: model/my_tokenizer/tokenizer_config.json (deflated 54%)


In [24]:
from google.colab import files
files.download("my_finetuned_model.zip")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>