In [38]:
# 1) Upgrade libs
!pip install --upgrade transformers datasets accelerate -q

# 2) Check version (optional)
import transformers
print("transformers version:", transformers.__version__)


transformers version: 4.56.1


In [39]:
import os
from datasets import load_dataset
from transformers import (
    AutoTokenizer, GPT2Config, GPT2LMHeadModel,
    DataCollatorForLanguageModeling, Trainer, TrainingArguments
)

# Disable wandb (to avoid asking for API key)
os.environ["WANDB_DISABLED"] = "true"


In [40]:
# Load local big.txt file (must be in the Colab directory)
dataset_all = load_dataset("text", data_files={"data": "./big.txt"})

# Split into 80% train and 20% validation
split = dataset_all["data"].train_test_split(test_size=0.2, seed=42)

# Repack into dict
dataset = {
    "train": split["train"],
    "validation": split["test"]
}

# ✅ Limit dataset size for faster training
dataset["train"] = dataset["train"].select(range(5000))   # first 2000 samples
dataset["validation"] = dataset["validation"].select(range(100))  # first 500 samples

print("Train size:", len(dataset["train"]))
print("Validation size:", len(dataset["validation"]))
print("Train sample:", dataset["train"][0])
print("Validation sample:", dataset["validation"][0])


Train size: 5000
Validation size: 100
Train sample: {'text': ''}
Validation sample: {'text': 'having weakened in Moscow, and do so only because the results did'}


In [41]:
tokenizer = AutoTokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token

def tokenize_function(examples):
    return tokenizer(
        examples["text"],
        truncation=True,
        max_length=64,
        padding="max_length"
    )

tokenized_train = dataset["train"].map(tokenize_function, batched=True, remove_columns=["text"])
tokenized_val   = dataset["validation"].map(tokenize_function, batched=True, remove_columns=["text"])

# Filter empty sequences
tokenized_train = tokenized_train.filter(lambda x: len(x["input_ids"]) > 0)
tokenized_val   = tokenized_val.filter(lambda x: len(x["input_ids"]) > 0)

print("Tokenized sample:", tokenized_train[0])


Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Filter:   0%|          | 0/100 [00:00<?, ? examples/s]

Tokenized sample: {'input_ids': [50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256], 'attention_mask': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]}


In [42]:
config = GPT2Config(
    vocab_size=tokenizer.vocab_size,
    n_positions=64,
    n_ctx=64,
    n_embd=132,   # small embedding size
    n_layer=6,    # fewer layers for faster training
    n_head=6      # must divide n_embd evenly
)

model = GPT2LMHeadModel(config)


In [43]:
from transformers import DataCollatorForLanguageModeling

data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)


In [44]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir="./tiny-gpt-from-big",
    overwrite_output_dir=True,
    num_train_epochs=1,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    logging_steps=20,
    # use these legacy args instead of evaluation_strategy/save_strategy
    eval_steps=10**9,    # effectively disables periodic evaluation
    save_steps=10**9,    # effectively disables periodic checkpointing
    learning_rate=5e-4,
    weight_decay=0.01,
    fp16=True
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,   # OK to pass, trainer won't run eval frequently
    data_collator=data_collator
)


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [45]:
trainer.train()


Step,Training Loss
20,9.5771
40,8.4903
60,7.5692
80,7.1625
100,7.2658
120,7.0205
140,7.1512
160,6.9473
180,6.8631
200,6.82


TrainOutput(global_step=625, training_loss=6.917976373291015, metrics={'train_runtime': 488.9188, 'train_samples_per_second': 10.227, 'train_steps_per_second': 1.278, 'total_flos': 2428968960000.0, 'train_loss': 6.917976373291015, 'epoch': 1.0})

In [46]:
trainer.save_model("./tiny-gpt-from-big-final")
tokenizer.save_pretrained("./tiny-gpt-from-big-final")


('./tiny-gpt-from-big-final/tokenizer_config.json',
 './tiny-gpt-from-big-final/special_tokens_map.json',
 './tiny-gpt-from-big-final/vocab.json',
 './tiny-gpt-from-big-final/merges.txt',
 './tiny-gpt-from-big-final/added_tokens.json',
 './tiny-gpt-from-big-final/tokenizer.json')

In [47]:
from transformers import GPT2LMHeadModel, AutoTokenizer

model = GPT2LMHeadModel.from_pretrained("./tiny-gpt-from-big-final")
tokenizer = AutoTokenizer.from_pretrained("./tiny-gpt-from-big-final")

prompt = "Once upon a time"
inputs = tokenizer(prompt, return_tensors="pt").input_ids

# Greedy generation
out1 = model.generate(inputs, max_length=50, do_sample=False)
print("Greedy:", tokenizer.decode(out1[0], skip_special_tokens=True))

# Sampling generation
out2 = model.generate(
    inputs, max_length=50, do_sample=True,
    top_k=50, top_p=0.95, temperature=0.9
)
print("Sampled:", tokenizer.decode(out2[0], skip_special_tokens=True))


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Greedy: Once upon a time, and the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the
Sampled: Once upon a time at the He with " was The Emperor." a I and the other you," a He or is the the this been which be I and the count; to had she to had had He of the They. the that was had


In [48]:
from google.colab import drive
drive.mount('/content/drive')

# Copy folder into your Google Drive
!cp -r tiny-gpt-from-big "/content/drive/MyDrive/"


Mounted at /content/drive


In [49]:
!cp -r tiny-gpt-from-big-final "/content/drive/MyDrive/"