<a href="https://colab.research.google.com/github/rajaranjith/HCL-GenAI-Training/blob/main/Assignment_3_Gold_Badge.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
#Cell 1 — Install dependencies
!pip -q install -U transformers datasets tokenizers accelerate evaluate

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/512.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━[0m [32m471.0/512.3 kB[0m [31m13.9 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m512.3/512.3 kB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m47.7/47.7 MB[0m [31m17.3 MB/s[0m eta [36m0:00:00[0m
[?25h

In [3]:
#Cell 2 — Imports + basic setup
import os
import math
import random
from dataclasses import dataclass
from itertools import chain

import torch
from datasets import load_dataset

from tokenizers import ByteLevelBPETokenizer
from transformers import (
    GPT2Config,
    GPT2LMHeadModel,
    GPT2TokenizerFast,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
    set_seed,
)

# Set TOKENIZERS_PARALLELISM before any tokenizer work happens.
# NOTE(review): presumably this silences the tokenizers fork/parallelism
# warning when DataLoader workers are used — confirm.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Reproducibility: seed via transformers' helper and Python's `random`.
SEED = 42
set_seed(SEED)
random.seed(SEED)

# Pick the compute device once; later cells reuse `device`.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

print("Torch:", torch.__version__)
print("Device:", device)
if device == "cuda":
    print("GPU:", torch.cuda.get_device_name(0))



Torch: 2.9.0+cpu
Device: cpu


In [4]:
# Cell 3 — Load dataset (WikiText-103 by default)
# WikiText is derived from "good"/featured Wikipedia articles and is a common,
# lightweight stand-in for Wikipedia text.
# Set DATASET_CONFIG to "wikitext-2-raw-v1" for a much smaller/faster run.
DATASET_NAME = "wikitext"
DATASET_CONFIG = "wikitext-103-raw-v1"   # or "wikitext-2-raw-v1"

# Load every split of the chosen configuration and show the split sizes.
raw_datasets = load_dataset(path=DATASET_NAME, name=DATASET_CONFIG)
print(raw_datasets)

# Optional: uncomment to shrink each split for quick experiments.
# raw_datasets["train"] = raw_datasets["train"].select(range(20000))
# raw_datasets["validation"] = raw_datasets["validation"].select(range(2000))
# raw_datasets["test"] = raw_datasets["test"].select(range(2000))

def is_nonempty(example):
    """Return True when the example's "text" field has non-whitespace content.

    Args:
        example: Mapping with a "text" entry that is a string or None.

    Returns:
        bool: False for None, empty, or whitespace-only text; True otherwise.
    """
    text = example["text"]
    if text is None:
        return False
    return len(text.strip()) > 0

# Drop rows whose text is missing or whitespace-only, in every split, then
# show the resulting split sizes.
raw_datasets = raw_datasets.filter(function=is_nonempty)
print("After filtering empty lines:", raw_datasets, sep="\n")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

wikitext-103-raw-v1/test-00000-of-00001.(…):   0%|          | 0.00/733k [00:00<?, ?B/s]

wikitext-103-raw-v1/train-00000-of-00002(…):   0%|          | 0.00/157M [00:00<?, ?B/s]

wikitext-103-raw-v1/train-00001-of-00002(…):   0%|          | 0.00/157M [00:00<?, ?B/s]

wikitext-103-raw-v1/validation-00000-of-(…):   0%|          | 0.00/657k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/4358 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/1801350 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3760 [00:00<?, ? examples/s]

DatasetDict({
    test: Dataset({
        features: ['text'],
        num_rows: 4358
    })
    train: Dataset({
        features: ['text'],
        num_rows: 1801350
    })
    validation: Dataset({
        features: ['text'],
        num_rows: 3760
    })
})


Filter:   0%|          | 0/4358 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1801350 [00:00<?, ? examples/s]

Filter:   0%|          | 0/3760 [00:00<?, ? examples/s]

After filtering empty lines:
DatasetDict({
    test: Dataset({
        features: ['text'],
        num_rows: 2891
    })
    train: Dataset({
        features: ['text'],
        num_rows: 1165029
    })
    validation: Dataset({
        features: ['text'],
        num_rows: 2461
    })
})


In [5]:
# Cell 4 — Train a tokenizer FROM SCRATCH (byte-level BPE)
# Settings used by the tokenizer training below.
TOKENIZER_DIR = "tokenizer_bpe_wikitext"
vocab_size = 32_000
special_tokens = ["<s>", "<pad>", "</s>", "<unk>"]

# Output directory for the tokenizer files; safe to re-run.
os.makedirs(TOKENIZER_DIR, exist_ok=True)

# Untrained byte-level BPE tokenizer; it is fitted on the training text below.
bpe_tokenizer = ByteLevelBPETokenizer()

# Stream the corpus in row slices so the whole "text" column is never pulled
# into a single Python list (which is what reading `dataset_split["text"]`
# up front would do).
def batch_iterator(dataset_split, batch_size=1000):
    """Yield successive batches of raw text for tokenizer training.

    Args:
        dataset_split: A dataset split that supports ``len()`` and slice
            indexing, where a slice returns a mapping with a ``"text"`` entry
            (as a Hugging Face ``datasets.Dataset`` does).
        batch_size: Maximum number of rows per yielded batch.

    Yields:
        The ``"text"`` values of rows ``[start, start + batch_size)``; the
        final batch may be shorter. Nothing is yielded for an empty split.
    """
    for start in range(0, len(dataset_split), batch_size):
        # Only this slice of rows is read on each step.
        yield dataset_split[start : start + batch_size]["text"]

# Fit the BPE tokenizer on the training split only, fed in batches of 1000.
bpe_tokenizer.train_from_iterator(
    batch_iterator(raw_datasets["train"], batch_size=1000),
    vocab_size=vocab_size,
    min_frequency=2,
    special_tokens=special_tokens,
)

# Save merges/vocab into TOKENIZER_DIR; the two files are read back just below.
bpe_tokenizer.save_model(TOKENIZER_DIR)
vocab_path = os.path.join(TOKENIZER_DIR, "vocab.json")
merges_path = os.path.join(TOKENIZER_DIR, "merges.txt")

# Wrap the trained vocabulary in a Transformers "Fast" tokenizer and declare
# which of the trained special tokens play the bos/eos/unk/pad roles.
tokenizer = GPT2TokenizerFast(
    vocab_file=vocab_path,
    merges_file=merges_path,
    bos_token="<s>",
    eos_token="</s>",
    unk_token="<unk>",
    pad_token="<pad>",
)

# Save the Transformers-format tokenizer next to the raw vocab/merges files.
tokenizer.save_pretrained(TOKENIZER_DIR)

print("Tokenizer vocab size (len(tokenizer)):", len(tokenizer))
print("Special tokens:", tokenizer.special_tokens_map)

Tokenizer vocab size (len(tokenizer)): 32000
Special tokens: {'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'pad_token': '<pad>'}


In [6]:
# Cell 5 — Tokenize + chunk into fixed blocks for causal LM
# Number of tokens per training example (kept moderate for Colab).
block_size = 256

def tokenize_function(examples):
    """Tokenize the "text" column of a batch with the notebook's tokenizer.

    Args:
        examples: Batch mapping that contains a "text" entry.

    Returns:
        Whatever ``tokenizer(...)`` returns for those texts.
    """
    texts = examples["text"]
    return tokenizer(texts)

# Tokenize every split in batches; the raw "text" column is dropped so only
# the tokenizer's output columns remain.
tokenized = raw_datasets.map(
    function=tokenize_function,
    batched=True,
    remove_columns=["text"],
    desc="Tokenizing",
)

def group_texts(examples):
    """Concatenate a batch of tokenized rows and cut it into fixed-size blocks.

    Every column in ``examples`` is flattened into one long list and split
    into consecutive chunks of ``block_size`` items. Trailing items that do
    not fill a whole block are dropped. A "labels" column is added as a copy
    of the chunked "input_ids" (labels = inputs for causal LM).

    Args:
        examples: Batch mapping of column name to a list of lists; must
            contain "input_ids".

    Returns:
        dict: column name to list of ``block_size``-long chunks, plus "labels".
    """
    # Flatten each column across all rows of the batch.
    flattened = {
        column: list(chain.from_iterable(examples[column]))
        for column in examples.keys()
    }

    # Largest multiple of block_size that fits; the remainder is discarded.
    usable_length = (len(flattened["input_ids"]) // block_size) * block_size
    block_starts = range(0, usable_length, block_size)

    result = {}
    for column, values in flattened.items():
        result[column] = [
            values[start : start + block_size] for start in block_starts
        ]

    result["labels"] = list(result["input_ids"])
    return result

# Re-pack the tokenized rows of every split into block_size-long examples,
# processing 1000 rows per call to group_texts.
lm_datasets = tokenized.map(
    function=group_texts,
    batched=True,
    batch_size=1000,
    desc=f"Grouping into blocks of {block_size}",
)

print(lm_datasets)
first_train_example = lm_datasets["train"][0]
print("Example item keys:", first_train_example.keys())

Tokenizing:   0%|          | 0/2891 [00:00<?, ? examples/s]

Tokenizing:   0%|          | 0/1165029 [00:00<?, ? examples/s]

Tokenizing:   0%|          | 0/2461 [00:00<?, ? examples/s]

Grouping into blocks of 256:   0%|          | 0/2891 [00:00<?, ? examples/s]

Grouping into blocks of 256:   0%|          | 0/1165029 [00:00<?, ? examples/s]

Grouping into blocks of 256:   0%|          | 0/2461 [00:00<?, ? examples/s]

DatasetDict({
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 1062
    })
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 445095
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 933
    })
})
Example item keys: dict_keys(['input_ids', 'attention_mask', 'labels'])


In [7]:
# Cell 6 — Build a GPT-style Transformer model FROM SCRATCH
# Two preset sizes are included. Start with "tiny" if you want it to run very
# comfortably on Colab.
MODEL_SIZE = "tiny"   # "tiny" or "mini"

# Preset name -> (n_layer, n_head, n_embd).
size_presets = {
    "tiny": (6, 6, 384),   # ~15–25M-ish params (roughly, depends on vocab)
    "mini": (8, 8, 512),   # ~40–70M-ish params (roughly)
}

if MODEL_SIZE not in size_presets:
    raise ValueError("MODEL_SIZE must be 'tiny' or 'mini'")

n_layer, n_head, n_embd = size_presets[MODEL_SIZE]

# Architecture: vocabulary from the trained tokenizer, context window equal to
# the training block size, width/depth from the preset chosen above.
architecture_kwargs = dict(
    vocab_size=len(tokenizer),
    n_positions=block_size,
    n_ctx=block_size,
    n_embd=n_embd,
    n_layer=n_layer,
    n_head=n_head,
)
# Special-token ids are taken from the tokenizer so model and tokenizer agree.
special_token_kwargs = dict(
    bos_token_id=tokenizer.bos_token_id,
    eos_token_id=tokenizer.eos_token_id,
    pad_token_id=tokenizer.pad_token_id,
)
config = GPT2Config(**architecture_kwargs, **special_token_kwargs)

# Built from the config only, so the weights are freshly initialised rather
# than loaded from a pretrained checkpoint.
model = GPT2LMHeadModel(config)

# Helpful for memory on Colab:
model.gradient_checkpointing_enable()

model.to(device)

# Count every parameter element in the model.
n_params = 0
for parameter in model.parameters():
    n_params += parameter.numel()

print(f"Model size: {MODEL_SIZE}")
print(f"Parameters: {n_params/1e6:.2f}M")

Model size: tiny
Parameters: 23.03M


In [2]:
# Cell 7 — Train (Trainer)
OUTPUT_DIR = "llm_from_scratch_wikitext"

# mlm=False selects causal (next-token) language modelling rather than masked LM.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Transformers has had some argument-name shifts across versions.
# These are the arguments shared by both TrainingArguments attempts below;
# the try/except there keeps it compatible with most installs.
train_args_common = dict(
    output_dir=OUTPUT_DIR,
    # Pass the notebook-wide seed explicitly. Without this, TrainingArguments
    # uses its own default seed, so editing SEED would silently have no effect
    # on training.
    seed=SEED,
    logging_steps=50,
    save_steps=500,
    save_total_limit=2,                 # keep only the 2 most recent checkpoints
    per_device_train_batch_size=8 if device == "cuda" else 2,
    per_device_eval_batch_size=8 if device == "cuda" else 2,
    gradient_accumulation_steps=4,      # effective batch = 4 x per-device batch
    num_train_epochs=1,                 # increase for better results
    learning_rate=5e-4,
    warmup_steps=200,
    weight_decay=0.1,
    lr_scheduler_type="cosine",
    fp16=(device == "cuda"),            # works well on T4
    report_to="none",
    dataloader_num_workers=2,
)

# Build the TrainingArguments with evaluation scheduled every 500 steps.
# The keyword that selects the evaluation schedule is attempted under two
# spellings: `eval_strategy` first, then `evaluation_strategy` if the first
# constructor call raises TypeError.
try:
    training_args = TrainingArguments(
        **train_args_common,
        eval_strategy="steps",
        eval_steps=500,
    )
except TypeError:
    # The first call was rejected (presumably because this install does not
    # accept `eval_strategy`); retry with the alternative keyword name. A
    # TypeError raised here propagates to the caller.
    training_args = TrainingArguments(
        **train_args_common,
        evaluation_strategy="steps",
        eval_steps=500,
    )

# Train on the "train" split; the "validation" split is used for the periodic
# evaluations configured in training_args.
train_split = lm_datasets["train"]
validation_split = lm_datasets["validation"]

trainer = Trainer(
    args=training_args,
    model=model,
    data_collator=data_collator,
    train_dataset=train_split,
    eval_dataset=validation_split,
)

trainer.train()

# Store the trained model and the tokenizer in the same directory.
trainer.save_model(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)

print("Saved to:", OUTPUT_DIR)

NameError: name 'DataCollatorForLanguageModeling' is not defined

In [None]:
# Cell 8 — Evaluate perplexity on validation + test
def safe_exp(x):
    """Return ``math.exp(x)``, or infinity when the result overflows a float.

    Args:
        x: Exponent (here, a mean loss value).

    Returns:
        float: ``e ** x``, or ``float("inf")`` if ``math.exp`` raises
        OverflowError.
    """
    try:
        value = math.exp(x)
    except OverflowError:
        value = float("inf")
    return value

# Validation split: perplexity is exp(loss); left as None when the metrics
# contain no "eval_loss".
val_metrics = trainer.evaluate(lm_datasets["validation"])
val_loss = val_metrics.get("eval_loss")
if val_loss is None:
    val_ppl = None
else:
    val_ppl = safe_exp(val_loss)
print("Validation loss:", val_loss)
print("Validation perplexity:", val_ppl)

# Test split: same computation, read from the same "eval_loss" key.
test_metrics = trainer.evaluate(lm_datasets["test"])
test_loss = test_metrics.get("eval_loss")
if test_loss is None:
    test_ppl = None
else:
    test_ppl = safe_exp(test_loss)
print("Test loss:", test_loss)
print("Test perplexity:", test_ppl)

In [None]:
# Cell 9 — "Test" the LLM by generating text
model.eval()

def generate(prompt, max_new_tokens=80, temperature=0.9, top_p=0.95):
    """Sample a continuation of ``prompt`` from the notebook's model.

    Uses the module-level ``model`` and ``tokenizer``. The prompt plus the
    requested continuation must fit in the model's context window
    (``model.config.n_positions``). If the prompt is too long, only its most
    recent tokens are kept, so the returned text then starts with that
    truncated prompt.

    Args:
        prompt: Text to condition on.
        max_new_tokens: Maximum number of tokens to generate.
        temperature: Sampling temperature passed to ``model.generate``.
        top_p: Nucleus-sampling threshold passed to ``model.generate``.

    Returns:
        str: Decoded prompt plus continuation, with special tokens removed.

    Raises:
        ValueError: If ``max_new_tokens`` leaves no room for even one prompt
            token inside the context window.
    """
    context_window = model.config.n_positions
    prompt_budget = context_window - max_new_tokens
    if prompt_budget < 1:
        raise ValueError(
            f"max_new_tokens={max_new_tokens} leaves no room for a prompt in a "
            f"context window of {context_window} positions"
        )

    encoded = tokenizer(prompt, return_tensors="pt")
    # Keep at most `prompt_budget` tokens, taken from the END of the prompt so
    # the text nearest the continuation survives. Without this, generation
    # could run past the model's configured number of positions.
    inputs = {
        name: tensor[:, -prompt_budget:].to(model.device)
        for name, tensor in encoded.items()
    }

    with torch.no_grad():
        output_ids = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=True,
            temperature=temperature,
            top_p=top_p,
            pad_token_id=tokenizer.pad_token_id,
            eos_token_id=tokenizer.eos_token_id,
        )
    return tokenizer.decode(output_ids[0], skip_special_tokens=True)

# Prompts used to sample from the trained model.
prompts = [
    "In the future, artificial intelligence",
    "The history of mathematics begins with",
    "A recipe for making bread requires",
    "Wikipedia is an online encyclopedia that",
]

# 80-character rules that frame each prompt and its generated text.
heavy_rule = "=" * 80
light_rule = "-" * 80

for prompt_text in prompts:
    print(heavy_rule)
    print("PROMPT:", prompt_text)
    print(light_rule)
    print(generate(prompt_text))
