<a href="https://colab.research.google.com/github/santthosh/MiniGPT/blob/main/miniGPT_limited.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# =========================
#   0. SETUP & OVERVIEW
# =========================

"""
In this notebook, we’ll build a miniGPT model that:
- Loads WikiText-2 from Hugging Face
- Limits the dataset to 5000 lines for demonstration
- Performs character-level tokenization
- Trains a small GPT-like Transformer
- Generates text
NOTE: Remember to use T4 CUDA (Compute Unified Device Architecture) - Allows GPUs to be used for general purpose computing tasks in parallel
"""

!pip install datasets --quiet  # if not already installed
import torch
import torch.nn as nn
import torch.nn.functional as F
import math
import time
import random
import numpy as np

from datasets import load_dataset

# Check device
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f"Using device: {device}")

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m485.4/485.4 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.5/143.5 kB[0m [31m6.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.8/194.8 kB[0m [31m10.2 MB/s[0m eta [36m0:00:00[0m
[?25hUsing device: cpu


In [2]:
# =========================
#   1. DATA PREPARATION
# =========================

# 1.1 Load the WikiText-2 dataset from Hugging Face
dataset = load_dataset("wikitext", "wikitext-2-raw-v1")

# We'll focus on the "train" split for a quick example
# and limit it to 5000 lines
all_train_lines = dataset['train']['text'][:5000]

# Join these lines into a single string
text = "\n".join(all_train_lines)

print("Number of lines used:", len(all_train_lines))
print("Length of combined text:", len(text))

# 1.2 Character-Level Tokenization
chars = sorted(list(set(text)))
vocab_size = len(chars)
print("Unique characters in dataset:", chars)
print("Vocab size:", vocab_size)

char_to_idx = { ch: i for i, ch in enumerate(chars) }
idx_to_char = { i: ch for i, ch in enumerate(chars) }

def encode(s):
    return [char_to_idx[c] for c in s]

def decode(arr):
    return ''.join(idx_to_char[i] for i in arr)

data = torch.tensor(encode(text), dtype=torch.long)
print("Total tokens (characters):", data.size(0))

# 1.3 Train/Validation Split
n = int(0.9 * len(data))  # 90% train, 10% val
train_data = data[:n]
val_data = data[n:]
print("Training data size:", len(train_data))
print("Validation data size:", len(val_data))

Access to the secret `HF_TOKEN` has not been granted on this notebook.
You will not be requested again.
Please restart the session if you want to be prompted again.


README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/733k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/6.36M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/657k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/4358 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/36718 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3760 [00:00<?, ? examples/s]

Number of lines used: 5000
Length of combined text: 1443537
Unique characters in dataset: ['\n', ' ', '!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '<', '=', '>', '?', '@', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '[', ']', '^', '`', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '~', '£', '°', '²', '´', 'µ', 'º', 'Ä', 'Å', 'Æ', 'É', 'Ñ', 'Ö', '×', 'Ü', 'ß', 'á', 'ã', 'ä', 'å', 'æ', 'ç', 'è', 'é', 'í', 'ñ', 'ó', 'ô', 'ö', 'ú', 'ü', 'ā', 'ă', 'ą', 'ć', 'č', 'đ', 'ĕ', 'ę', 'ğ', 'Ł', 'ł', 'ń', 'Ō', 'ō', 'ś', 'Š', 'š', 'ū', 'ź', 'Ż', 'ż', 'ə', 'ɡ', 'ɪ', 'ˈ', 'έ', 'ή', 'ί', 'α', 'β', 'γ', 'δ', 'ε', 'θ', 'ι', 'κ', 'λ', 'μ', 'ν', 'ξ', 'ο', 'ρ', 'ς', 'σ', 'τ', 'υ', 'χ', 'ό', 'К', 'а', 'в', 'з', 'й', 'к', 'н', 'р', 'с', 'ы', 'є', 'ה', 'מ'

| **Hyperparameter**     | **Explanation** |
|------------------------|-----------------|
| `batch_size = 16`      | Train **16 sequences at once** (like baking 16 cookies 🍪 together). |
| `block_size = 64`      | Model remembers **128 words at a time** (like how much text you can recall). |
| `embedding_dim = 128`  | Words are stored in a **256-dimensional space** (higher = better meaning). |
| `n_heads = 4`          | Model looks at **8 different perspectives** (like 8 detectives 🕵️‍♂️ analyzing a crime scene). |
| `n_layers = 4`         | Model has **8 layers** (like a **7-layer cake** 🍰 adding more complexity). |
| `dropout_rate = 0.1`   | **10% of neurons are randomly turned off** to **prevent overfitting**. |
| `learning_rate = 3e-4` | **How fast the model learns** (too fast = crash, too slow = takes forever 🚗). |
| `max_iters = 2000`     | Model will **train for 10,000 steps** (like practicing basketball 🏀). |
| `eval_interval = 200`  | **Model takes a test every 200 steps** to check learning progress. |
| `eval_iters = 50`      | Each test uses **50 different samples** for accuracy. |

In [3]:
# =========================
#   2. HYPERPARAMETERS
# =========================

batch_size = 16       # how many independent sequences will we process in parallel?
block_size = 64       # context length (we can use a larger context than tiny Shakespeare)
embedding_dim = 128   # dimensionality of the character embeddings
n_heads = 4           # number of attention heads
n_layers = 4          # number of transformer blocks
dropout_rate = 0.1
learning_rate = 3e-4
max_iters = 2000      # adjust as needed for demonstration
eval_interval = 200
eval_iters = 50

In [4]:
# =========================
#   3. DATA LOADER
# =========================

def get_batch(split):
    """
    Returns a batch of inputs (x) and targets (y) of shape:
    [batch_size, block_size].

    x, y are shifted by one position:
    x[:,0] -> y[:,1]
    ...
    """
    data_split = train_data if split == 'train' else val_data
    # Random starting indices
    ix = torch.randint(len(data_split) - block_size, (batch_size,))
    # Gather block_size tokens for x, and the subsequent tokens for y
    x = torch.stack([data_split[i : i+block_size] for i in ix])
    y = torch.stack([data_split[i+1 : i+block_size+1] for i in ix])
    return x.to(device), y.to(device)

# Quick test
xb, yb = get_batch('train')
print("Input batch shape:", xb.shape)
print("Target batch shape:", yb.shape)
print("Decoded sample input:", decode(xb[0].tolist()))
print("Decoded sample target:", decode(yb[0].tolist()))

Input batch shape: torch.Size([16, 64])
Target batch shape: torch.Size([16, 64])
Decoded sample input: oorkeeper with two attendant demons . At the north end is a stan
Decoded sample target: orkeeper with two attendant demons . At the north end is a stand


In [5]:
# =========================
#   4. GPT MODEL COMPONENTS
# =========================

# What’s Happening?
# 	1.	You give it some words (as numbers, of course)
# 	    Input shape is (B, T, C) →
# 	        B = batch size (how many sentences at once)
# 	        T = how many words in each sentence
# 	        C = size of each word’s representation (embedding size)
# 	2.	It turns those words into 3 special versions:
# 	        Key (k) → What the word means
# 	        Query (q) → What the word is looking for
# 	        Value (v) → What info the word carries
# 	3.	It calculates “how much should each word pay attention to every other word?”
# 	        This is done with q @ k.T → a big table of attention scores.
# 	4.	It blocks words from seeing the future (important for making predictions one word at a time)
# 	        Uses a triangular mask so a word can’t peek ahead.
# 	5.	It turns the scores into probabilities (softmax)
# 	        This makes sure all attention weights add up to 1.
# 	6.	It uses these scores to mix words together
# 	        Each word gets a new value by combining all the words it “pays attention to.”
class Head(nn.Module):
    """One head of self-attention."""
    def __init__(self, head_size):
        super().__init__()
        self.key = nn.Linear(embedding_dim, head_size, bias=False)
        self.query = nn.Linear(embedding_dim, head_size, bias=False)
        self.value = nn.Linear(embedding_dim, head_size, bias=False)
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))
        self.dropout = nn.Dropout(dropout_rate)

    def forward(self, x):
        B, T, C = x.shape
        k = self.key(x)     # (B, T, head_size)
        q = self.query(x)   # (B, T, head_size)

        # Compute attention weights
        weights = q @ k.transpose(-2, -1) * (1.0 / math.sqrt(k.shape[-1]))  # (B, T, T)
        # Causal mask
        weights = weights.masked_fill(self.tril[:T, :T] == 0, float('-inf'))
        weights = F.softmax(weights, dim=-1)
        weights = self.dropout(weights)

        # Weighted sum of values
        v = self.value(x)   # (B, T, head_size)
        out = weights @ v    # (B, T, head_size)
        return out

# What is happening?
# 	•	Runs multiple attention heads in parallel → each looks at the input differently.
# 	•	Stacks outputs from all heads into one big tensor.
# 	•	Uses a linear layer to combine and resize back to the original embedding size.
# 	•	Applies dropout to prevent overfitting.
# 	•	Why? Multiple heads let the model focus on different aspects of the input at the same time, making it smarter!
class MultiHeadAttention(nn.Module):
    """Multiple heads of self-attention in parallel."""
    def __init__(self, num_heads, head_size):
        super().__init__()
        self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
        self.proj = nn.Linear(num_heads * head_size, embedding_dim)
        self.dropout = nn.Dropout(dropout_rate)

    def forward(self, x):
        out = torch.cat([h(x) for h in self.heads], dim=-1)
        out = self.proj(out)
        out = self.dropout(out)
        return out

# Simple feed forward Neural Network 
class FeedFoward(nn.Module):
    """A simple position-wise feed-forward network."""
    def __init__(self, embedding_dim):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(embedding_dim, 4 * embedding_dim),
            nn.ReLU(),
            nn.Linear(4 * embedding_dim, embedding_dim),
            nn.Dropout(dropout_rate)
        )

    def forward(self, x):
        return self.net(x)

# What is happening here?
# 	Step 1: Look around and decide which words matter.
# 	Step 2: Think deeper about each word.
# 	Step 3: Make sure everything is balanced.
class Block(nn.Module):
    """Transformer block: Multi-Head Attention + FeedForward + LayerNorm."""
    def __init__(self, embedding_dim, n_heads):
        super().__init__()
        head_size = embedding_dim // n_heads
        self.sa = MultiHeadAttention(n_heads, head_size)
        self.ffwd = FeedFoward(embedding_dim)
        self.ln1 = nn.LayerNorm(embedding_dim)
        self.ln2 = nn.LayerNorm(embedding_dim)

    def forward(self, x):
        x = x + self.sa(self.ln1(x))
        x = x + self.ffwd(self.ln2(x))
        return x

class MiniGPT(nn.Module):
    def __init__(self):
        super().__init__()
        self.token_emb = nn.Embedding(vocab_size, embedding_dim)        # Turn tokens/characters into numbers
        self.pos_emb = nn.Embedding(block_size, embedding_dim)          # Add position info (Where is the token/character/word)
        self.blocks = nn.Sequential(                                    # Stack multiple transformer blocks - More blocks == smarter the model
            *[Block(embedding_dim, n_heads) for _ in range(n_layers)]
        )
        self.ln_f = nn.LayerNorm(embedding_dim)                         # Keeps values stable
        self.head = nn.Linear(embedding_dim, vocab_size)                # Final layer for converingt 

    def forward(self, idx, targets=None):
        B, T = idx.shape
        # token + positional embeddings
        tok_emb = self.token_emb(idx)  # (B, T, embedding_dim)
        pos = torch.arange(T, device=device).unsqueeze(0)  # shape (1, T)
        pos_emb = self.pos_emb(pos)    # (1, T, embedding_dim)
        x = tok_emb + pos_emb         # (B, T, embedding_dim)

        x = self.blocks(x)
        x = self.ln_f(x)

        logits = self.head(x)  # (B, T, vocab_size)                     # logits the prediction for next word,

        loss = None
        if targets is not None:
            # Flatten for cross-entropy
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)                     # how wrong the model is 

        return logits, loss          

    def generate(self, idx, max_new_tokens):
        """
        Autoregressive text generation given a starting token.
        """
        for _ in range(max_new_tokens):
            idx_cond = idx[:, -block_size:]  # crop context
            logits, _ = self(idx_cond)
            logits = logits[:, -1, :]  # take last time step
            probs = F.softmax(logits, dim=-1)
            idx_next = torch.multinomial(probs, num_samples=1)
            idx = torch.cat((idx, idx_next), dim=1)
        return idx

In [6]:
# =========================
#   5. TRAINING LOOP
# =========================

model = MiniGPT().to(device)
print("Model Parameter Count:", sum(p.numel() for p in model.parameters())/1e6, "M parameters")

optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

@torch.no_grad()
def estimate_loss():
    out = {}
    model.eval()
    for split in ['train', 'val']:
        losses = []
        for _ in range(eval_iters):
            X, Y = get_batch(split)
            _, loss = model(X, Y)
            losses.append(loss.item())
        out[split] = np.mean(losses)
    model.train()
    return out

for iter in range(max_iters):

    # Evaluate periodically
    if iter % eval_interval == 0:
        losses = estimate_loss()
        print(f"step {iter}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

    # Sample batch
    xb, yb = get_batch('train')

    # Forward & Backprop
    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

print("Training complete!")

Model Parameter Count: 0.877357 M parameters
step 0: train loss 5.9110, val loss 5.9135
step 200: train loss 2.6043, val loss 2.6094
step 400: train loss 2.4784, val loss 2.4854
step 600: train loss 2.3979, val loss 2.4111
step 800: train loss 2.3401, val loss 2.3457
step 1000: train loss 2.2561, val loss 2.2723
step 1200: train loss 2.2064, val loss 2.2166
step 1400: train loss 2.1504, val loss 2.1812
step 1600: train loss 2.0928, val loss 2.1306
step 1800: train loss 2.0543, val loss 2.0994
Training complete!


In [10]:
# =========================
#   6. TEXT GENERATION
# =========================

# Generate text from the trained model
start_context = torch.zeros((1, 1), dtype=torch.long, device=device)  # start token
generated = model.generate(start_context, max_new_tokens=200)[0].tolist()
print("Generated text:")
print(decode(generated))

Generated text:



R Karda Cordion catuon shames to shoblehe stoStrals warl the uncont to makic midedeiro anlitios Awas uncery of Prumed Pemonideds bul to jeaver ans thee stighided an lockmeagut dectand . 

 L L anyma
