<a href="https://colab.research.google.com/github/rajaranjith/HCL-GenAI-Training/blob/main/Assignment_3_Gold_Badge_Fast.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
#Cell 1 — Install
!pip -q install -U transformers datasets tokenizers accelerate

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m512.3/512.3 kB[0m [31m21.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m47.7/47.7 MB[0m [31m19.2 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
#Cell 2 — Imports + setup
import os
import math
import torch
from itertools import chain
from datasets import load_dataset

from tokenizers import ByteLevelBPETokenizer
from transformers import (
    GPT2Config,
    GPT2LMHeadModel,
    GPT2TokenizerFast,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
    set_seed,
)

# Set before any tokenizer is created (presumably to avoid the tokenizers
# parallelism/fork warning — confirm if this matters for your runtime).
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Fixed seed so repeated runs start from the same random state.
set_seed(42)

# Pick the GPU when one is visible to PyTorch, otherwise fall back to CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
print("Device:", device)
if device == "cuda":
    print("GPU:", torch.cuda.get_device_name(0))

# Small speed tweak on newer PyTorch. Guard on the attribute instead of
# swallowing every exception, so a genuine failure is not silently hidden.
if hasattr(torch, "set_float32_matmul_precision"):
    torch.set_float32_matmul_precision("high")

Device: cuda
GPU: Tesla T4


  _C._set_float32_matmul_precision(precision)


In [3]:
#Cell 3 — Load a small Wikipedia-derived dataset subset (WikiText-2)
# Loads the "wikitext-2-raw-v1" configuration of the "wikitext" dataset. The
# result is indexed by split name ("train", "validation", "test") later in
# this cell.
raw = load_dataset("wikitext", "wikitext-2-raw-v1")

def nonempty(x):
    """Return True when the example's "text" field has non-whitespace content.

    Args:
        x: A mapping with a "text" entry (a string, possibly empty or None).

    Returns:
        bool: False for None, the empty string, or whitespace-only text;
        True otherwise. A real boolean is returned (not the stripped string)
        so the function can be used directly as a filter predicate.
    """
    text = x["text"]
    return bool(text and text.strip())

# Drop rows whose text is empty or whitespace-only.
raw = raw.filter(nonempty)

# HARD CAP the dataset for a smoke test:
TRAIN_LINES = 4000
VAL_LINES   = 500
TEST_LINES  = 500

# Keep at most the configured number of leading rows from each split.
for split_name, line_cap in (
    ("train", TRAIN_LINES),
    ("validation", VAL_LINES),
    ("test", TEST_LINES),
):
    split = raw[split_name]
    raw[split_name] = split.select(range(min(line_cap, len(split))))

print(raw)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

wikitext-2-raw-v1/test-00000-of-00001.pa(…):   0%|          | 0.00/733k [00:00<?, ?B/s]

wikitext-2-raw-v1/train-00000-of-00001.p(…):   0%|          | 0.00/6.36M [00:00<?, ?B/s]

wikitext-2-raw-v1/validation-00000-of-00(…):   0%|          | 0.00/657k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/4358 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/36718 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3760 [00:00<?, ? examples/s]

Filter:   0%|          | 0/4358 [00:00<?, ? examples/s]

Filter:   0%|          | 0/36718 [00:00<?, ? examples/s]

Filter:   0%|          | 0/3760 [00:00<?, ? examples/s]

DatasetDict({
    test: Dataset({
        features: ['text'],
        num_rows: 500
    })
    train: Dataset({
        features: ['text'],
        num_rows: 4000
    })
    validation: Dataset({
        features: ['text'],
        num_rows: 500
    })
})


In [4]:
#Cell 4 — Train a tiny tokenizer FROM SCRATCH (byte-level BPE)
# Directory the trained tokenizer files are written to and read back from.
TOKENIZER_DIR = "smoke_tokenizer"
os.makedirs(TOKENIZER_DIR, exist_ok=True)

# Much smaller vocab for speed
VOCAB_SIZE = 8000
# Special tokens handed to the BPE trainer; the same strings are reused as the
# bos/pad/eos/unk tokens when the tokenizer is wrapped later in this cell.
special_tokens = ["<s>", "<pad>", "</s>", "<unk>"]

# Untrained byte-level BPE tokenizer; it is trained from the dataset text below.
bpe = ByteLevelBPETokenizer()

def batch_iterator(ds, batch_size=500):
    """Yield consecutive slices of ``ds["text"]`` with at most ``batch_size`` items.

    Args:
        ds: Any object whose ``"text"`` entry supports ``len`` and slicing.
        batch_size: Maximum number of items per yielded slice.

    Yields:
        Slices of the text column, in order; the last one may be shorter.
    """
    column = ds["text"]
    yield from (
        column[start:start + batch_size]
        for start in range(0, len(column), batch_size)
    )

# Train the BPE tokenizer on the training split, fed in batches of 500 lines.
training_batches = batch_iterator(raw["train"], batch_size=500)
bpe.train_from_iterator(
    training_batches,
    vocab_size=VOCAB_SIZE,
    min_frequency=2,
    special_tokens=special_tokens,
)

# Write the trained model to disk, then reload the vocab/merges files through
# the fast GPT-2 tokenizer class with the special tokens attached.
bpe.save_model(TOKENIZER_DIR)

vocab_path = os.path.join(TOKENIZER_DIR, "vocab.json")
merges_path = os.path.join(TOKENIZER_DIR, "merges.txt")
tokenizer = GPT2TokenizerFast(
    vocab_file=vocab_path,
    merges_file=merges_path,
    bos_token="<s>",
    eos_token="</s>",
    unk_token="<unk>",
    pad_token="<pad>",
)

print("Tokenizer vocab size:", len(tokenizer))
print("Special tokens:", tokenizer.special_tokens_map)

Tokenizer vocab size: 8000
Special tokens: {'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'pad_token': '<pad>'}


In [5]:
#Cell 5 — Tokenize + group into small blocks
# Short context for speed
block_size = 128  # number of tokens per training block


def tokenize_fn(examples):
    """Run the notebook-level tokenizer over the "text" column of a batch."""
    batch_texts = examples["text"]
    return tokenizer(batch_texts)


# Replace the raw "text" column with the tokenizer's output columns.
tok = raw.map(tokenize_fn, batched=True, remove_columns=["text"], desc="Tokenizing")

def group_texts(examples, block_len=None):
    """Concatenate a batch of tokenized examples and cut it into fixed-size blocks.

    Args:
        examples: Mapping from column name to a list of per-example token
            lists. Must contain an "input_ids" column; every column is
            concatenated and split the same way.
        block_len: Number of tokens per output block. When omitted, the
            notebook-level ``block_size`` is used, so existing single-argument
            callers behave exactly as before.

    Returns:
        dict: The same columns as ``examples`` plus "labels" (a copy of the
        "input_ids" blocks). Every block has exactly ``block_len`` items;
        trailing tokens that do not fill a whole block are dropped.

    Raises:
        ValueError: If the effective block length is smaller than 1.
    """
    if block_len is None:
        block_len = block_size
    if block_len < 1:
        raise ValueError(f"block_len must be a positive integer, got {block_len!r}")

    # Flatten each column into one long token stream.
    concat = {k: list(chain.from_iterable(examples[k])) for k in examples.keys()}

    # Round down to a whole number of blocks; the remainder is discarded.
    usable_len = (len(concat["input_ids"]) // block_len) * block_len

    result = {
        k: [tokens[i:i + block_len] for i in range(0, usable_len, block_len)]
        for k, tokens in concat.items()
    }
    result["labels"] = result["input_ids"].copy()
    return result

# Regroup the tokenized rows into fixed-length blocks, 1000 rows per map batch.
grouping_desc = f"Grouping into blocks of {block_size}"
lm = tok.map(group_texts, batched=True, batch_size=1000, desc=grouping_desc)

print(lm)
train_block_count = len(lm["train"])
print("Train blocks:", train_block_count)

Tokenizing:   0%|          | 0/500 [00:00<?, ? examples/s]

Tokenizing:   0%|          | 0/4000 [00:00<?, ? examples/s]

Tokenizing:   0%|          | 0/500 [00:00<?, ? examples/s]

Grouping into blocks of 128:   0%|          | 0/500 [00:00<?, ? examples/s]

Grouping into blocks of 128:   0%|          | 0/4000 [00:00<?, ? examples/s]

Grouping into blocks of 128:   0%|          | 0/500 [00:00<?, ? examples/s]

DatasetDict({
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 469
    })
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 3281
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 393
    })
})
Train blocks: 3281


In [6]:
#Cell 6 — Build a very small GPT model FROM SCRATCH
# Extremely small model for smoke tests
# Model hyperparameters: the vocabulary and special-token ids come from the
# tokenizer, and the context length matches the training block size.
# NOTE(review): `n_ctx` may be ignored by recent GPT2Config versions — confirm.
config = GPT2Config(
    vocab_size=len(tokenizer),
    n_positions=block_size,
    n_ctx=block_size,
    n_embd=128,  # embedding width
    n_layer=2,   # number of transformer blocks
    n_head=4,    # attention heads per block
    bos_token_id=tokenizer.bos_token_id,
    eos_token_id=tokenizer.eos_token_id,
    pad_token_id=tokenizer.pad_token_id,
)

# Build the model from the config alone, then move it to the chosen device.
model = GPT2LMHeadModel(config)
model = model.to(device)

n_params = sum(param.numel() for param in model.parameters())
print(f"Params: {n_params/1e6:.2f}M")


Params: 1.44M


In [7]:
#Cell 7 — Train quickly (fixed number of steps)
# Speed-oriented choices in this cell:
#   * max_steps is small
#   * no eval during training
#   * no checkpoint saving during training
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

OUTPUT_DIR = "smoke_llm"

# Keep this small for a smoke test
MAX_STEPS = 200

on_gpu = device == "cuda"
train_args_common = {
    "output_dir": OUTPUT_DIR,
    "per_device_train_batch_size": 16 if on_gpu else 4,
    "gradient_accumulation_steps": 1,
    "learning_rate": 5e-4,
    "warmup_steps": 20,
    "weight_decay": 0.01,
    "lr_scheduler_type": "cosine",
    "max_steps": MAX_STEPS,   # fixed step budget keeps the run short
    "logging_steps": 20,
    "report_to": "none",
    "fp16": on_gpu,
    "save_strategy": "no",    # no intermediate checkpoints
}

# The evaluation-strategy keyword is spelled differently across transformers
# versions: try `eval_strategy` first and fall back to `evaluation_strategy`
# if the first spelling raises TypeError.
try:
    training_args = TrainingArguments(eval_strategy="no", **train_args_common)
except TypeError:
    training_args = TrainingArguments(evaluation_strategy="no", **train_args_common)

# Only a training set is attached here; no evaluation dataset is passed.
trainer_kwargs = dict(
    model=model,
    args=training_args,
    train_dataset=lm["train"],
    data_collator=data_collator,
)
trainer = Trainer(**trainer_kwargs)

trainer.train()

# Single save at the end: the model and the tokenizer go to the same directory.
trainer.save_model(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)

print("Saved to:", OUTPUT_DIR)

`loss_type=None` was set in the config but it is unrecognized. Using the default loss: `ForCausalLMLoss`.


Step,Training Loss
20,8.7435
40,8.0202
60,7.4118
80,7.1296
100,7.0283
120,6.9967
140,6.9577
160,6.8968
180,6.8942
200,6.8702


Saved to: smoke_llm


In [8]:
#Cell 8 — Quick evaluation (optional)
def safe_exp(x):
    """Return ``math.exp(x)``, or positive infinity if that overflows.

    Args:
        x: Exponent passed to ``math.exp``.

    Returns:
        float: ``e ** x``, or ``float("inf")`` when ``math.exp`` raises
        ``OverflowError``.
    """
    try:
        value = math.exp(x)
    except OverflowError:
        value = float("inf")
    return value

# quick eval on validation + test (small subsets)
def _evaluate_and_report(label, split):
    """Evaluate ``split`` with the trainer and print its loss and perplexity.

    Returns the metrics dict and the "eval_loss" value (None if absent).
    """
    metrics = trainer.evaluate(split)
    loss = metrics.get("eval_loss")
    perplexity = None if loss is None else safe_exp(loss)
    print(f"{label} loss:", loss, f" {label} ppl:", perplexity)
    return metrics, loss


val_metrics, val_loss = _evaluate_and_report("Val", lm["validation"])

test_metrics, test_loss = _evaluate_and_report("Test", lm["test"])

Val loss: 6.86379337310791  Val ppl: 956.9904124918183
Test loss: 6.892701625823975  Test ppl: 985.0590866542359


In [9]:
#Cell 9 — “Test” by generating text
# Switch the model to evaluation mode before sampling from it.
model.eval()

def generate(prompt, max_new_tokens=60, temperature=0.9, top_p=0.95):
    """Sample a continuation of ``prompt`` from the trained model.

    Args:
        prompt: Text to condition on.
        max_new_tokens: Upper bound on the number of generated tokens.
        temperature: Sampling temperature forwarded to ``model.generate``.
        top_p: Nucleus-sampling threshold forwarded to ``model.generate``.

    Returns:
        str: The first returned sequence decoded with special tokens skipped.
    """
    encoded = tokenizer(prompt, return_tensors="pt").to(model.device)
    sampling_kwargs = dict(
        max_new_tokens=max_new_tokens,
        do_sample=True,
        temperature=temperature,
        top_p=top_p,
        pad_token_id=tokenizer.pad_token_id,
        eos_token_id=tokenizer.eos_token_id,
    )
    with torch.no_grad():
        sequences = model.generate(**encoded, **sampling_kwargs)
    first_sequence = sequences[0]
    return tokenizer.decode(first_sequence, skip_special_tokens=True)

prompts = [
    "Wikipedia is",
    "In science, the idea of",
    "The meaning of life is",
]

# Print each prompt between two rules, followed by one sampled continuation.
heavy_rule = "=" * 80
light_rule = "-" * 80
for prompt_text in prompts:
    print(heavy_rule)
    print("PROMPT:", prompt_text)
    print(light_rule)
    print(generate(prompt_text))

PROMPT: Wikipedia is
--------------------------------------------------------------------------------
Wikipedia is of the the the the the which of the " the Bes 
 = = by this. A the " his this @.@ to is of is the the The the The the the the second the the the which a which the In the British and the but @-@ a the her @-@ the other was In
PROMPT: In science, the idea of
--------------------------------------------------------------------------------
In science, the idea of the the a a the the the the his a the the " " on = = = = = = . The A to the the The a it has be in and the the of with the the the the the the the a that at the it ) . 
 = =s , on hes
PROMPT: The meaning of life is
--------------------------------------------------------------------------------
The meaning of life is of the one of the @-@ was and The the A of the a other had the B . It , a the the the D , in the the an the S the the the the the the the the the the a the the the not a other had the the first were of th