<a href="https://colab.research.google.com/github/santthosh/MiniGPT/blob/main/miniGPT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# =========================
#   0. SETUP & OVERVIEW
# =========================

"""
In this notebook, we’ll build a miniGPT model that:
- Loads WikiText-2 from Hugging Face
- Performs character-level tokenization
- Trains a small GPT-like Transformer
- Generates text
"""

!pip install datasets --quiet  # if not already installed
import torch
import torch.nn as nn
import torch.nn.functional as F
import math
import time
import random
import numpy as np

from datasets import load_dataset

# Check device
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f"Using device: {device}")

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/485.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m485.4/485.4 kB[0m [31m15.5 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/116.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m11.5 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/143.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.5/143.5 kB[0m [31m15.5 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/194.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.8/194.8 kB[0m [31m16.6 MB/s[0m eta [36m0:00:00[0m
[?25hUsing device: cuda


In [3]:
# =========================
#   1. DATA PREPARATION
# =========================

# 1.1 Load the WikiText-2 dataset from Hugging Face
dataset = load_dataset("wikitext", "wikitext-2-raw-v1")

# We'll focus on the "train" split for a quick example
all_train_lines = dataset['train']['text']

# Join these lines into a single string
text = "\n".join(all_train_lines)

print("Number of lines used:", len(all_train_lines))
print("Length of combined text:", len(text))

# 1.2 Character-Level Tokenization
chars = sorted(list(set(text)))
vocab_size = len(chars)
print("Unique characters in dataset:", chars)
print("Vocab size:", vocab_size)

char_to_idx = { ch: i for i, ch in enumerate(chars) }
idx_to_char = { i: ch for i, ch in enumerate(chars) }

def encode(s):
    return [char_to_idx[c] for c in s]

def decode(arr):
    return ''.join(idx_to_char[i] for i in arr)

data = torch.tensor(encode(text), dtype=torch.long)
print("Total tokens (characters):", data.size(0))

# 1.3 Train/Validation Split
n = int(0.9 * len(data))  # 90% train, 10% val
train_data = data[:n]
val_data = data[n:]
print("Training data size:", len(train_data))
print("Validation data size:", len(val_data))

Number of lines used: 36718
Length of combined text: 10929707
Unique characters in dataset: ['\n', ' ', '!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '<', '=', '>', '?', '@', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '[', '\\', ']', '^', '_', '`', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '|', '}', '~', '\x93', '\x94', '¡', '¢', '£', '¥', '§', '°', '±', '²', '³', '´', 'µ', '·', 'º', '½', '¿', 'Á', 'Ä', 'Å', 'Æ', 'Ç', 'É', 'Í', 'Î', 'Ñ', 'Ó', 'Ö', '×', 'Ø', 'Ú', 'Ü', 'Þ', 'ß', 'à', 'á', 'â', 'ã', 'ä', 'å', 'æ', 'ç', 'è', 'é', 'ê', 'ë', 'ì', 'í', 'î', 'ï', 'ð', 'ñ', 'ò', 'ó', 'ô', 'ö', 'ø', 'ù', 'ú', 'û', 'ü', 'ý', 'þ', 'Ā', 'ā', 'ă', 'ą', 'ć', 'Č', 'č', 'Đ', 'đ', 'Ē', 'ē', 'Ĕ', 'ĕ', 'ė', 'ę', 'ğ', 'ħ', 'ī', 'İ', 'ı', 'Ľ', 'Ł',

In [4]:
# =========================
#   2. HYPERPARAMETERS
# =========================

batch_size = 16       # how many independent sequences will we process in parallel?
block_size = 128      # context length (we can use a larger context than tiny Shakespeare)
embedding_dim = 256   # dimensionality of the character embeddings
n_heads = 8           # number of attention heads
n_layers = 8          # number of transformer blocks
dropout_rate = 0.1
learning_rate = 3e-4
max_iters = 10000      # adjust as needed for demonstration
eval_interval = 200
eval_iters = 50

In [5]:
# =========================
#   3. DATA LOADER
# =========================

def get_batch(split):
    """
    Returns a batch of inputs (x) and targets (y) of shape:
    [batch_size, block_size].

    x, y are shifted by one position:
    x[:,0] -> y[:,1]
    ...
    """
    data_split = train_data if split == 'train' else val_data
    # Random starting indices
    ix = torch.randint(len(data_split) - block_size, (batch_size,))
    # Gather block_size tokens for x, and the subsequent tokens for y
    x = torch.stack([data_split[i : i+block_size] for i in ix])
    y = torch.stack([data_split[i+1 : i+block_size+1] for i in ix])
    return x.to(device), y.to(device)

# Quick test
xb, yb = get_batch('train')
print("Input batch shape:", xb.shape)
print("Target batch shape:", yb.shape)
print("Decoded sample input:", decode(xb[0].tolist()))
print("Decoded sample target:", decode(yb[0].tolist()))

Input batch shape: torch.Size([16, 128])
Target batch shape: torch.Size([16, 128])
Decoded sample input: live version of the track featured on the JÄGERMUSIC Rarities 2004 promotional CD , given away to attendees at the Spring 2004 J
Decoded sample target: ive version of the track featured on the JÄGERMUSIC Rarities 2004 promotional CD , given away to attendees at the Spring 2004 Jä


In [6]:
# =========================
#   4. GPT MODEL COMPONENTS
# =========================

class Head(nn.Module):
    """One head of self-attention."""
    def __init__(self, head_size):
        super().__init__()
        self.key = nn.Linear(embedding_dim, head_size, bias=False)
        self.query = nn.Linear(embedding_dim, head_size, bias=False)
        self.value = nn.Linear(embedding_dim, head_size, bias=False)
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))
        self.dropout = nn.Dropout(dropout_rate)

    def forward(self, x):
        B, T, C = x.shape
        k = self.key(x)     # (B, T, head_size)
        q = self.query(x)   # (B, T, head_size)

        # Compute attention weights
        weights = q @ k.transpose(-2, -1) * (1.0 / math.sqrt(k.shape[-1]))  # (B, T, T)
        # Causal mask
        weights = weights.masked_fill(self.tril[:T, :T] == 0, float('-inf'))
        weights = F.softmax(weights, dim=-1)
        weights = self.dropout(weights)

        # Weighted sum of values
        v = self.value(x)   # (B, T, head_size)
        out = weights @ v    # (B, T, head_size)
        return out

class MultiHeadAttention(nn.Module):
    """Multiple heads of self-attention in parallel."""
    def __init__(self, num_heads, head_size):
        super().__init__()
        self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
        self.proj = nn.Linear(num_heads * head_size, embedding_dim)
        self.dropout = nn.Dropout(dropout_rate)

    def forward(self, x):
        out = torch.cat([h(x) for h in self.heads], dim=-1)
        out = self.proj(out)
        out = self.dropout(out)
        return out

class FeedFoward(nn.Module):
    """A simple position-wise feed-forward network."""
    def __init__(self, embedding_dim):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(embedding_dim, 4 * embedding_dim),
            nn.ReLU(),
            nn.Linear(4 * embedding_dim, embedding_dim),
            nn.Dropout(dropout_rate)
        )

    def forward(self, x):
        return self.net(x)

class Block(nn.Module):
    """Transformer block: Multi-Head Attention + FeedForward + LayerNorm."""
    def __init__(self, embedding_dim, n_heads):
        super().__init__()
        head_size = embedding_dim // n_heads
        self.sa = MultiHeadAttention(n_heads, head_size)
        self.ffwd = FeedFoward(embedding_dim)
        self.ln1 = nn.LayerNorm(embedding_dim)
        self.ln2 = nn.LayerNorm(embedding_dim)

    def forward(self, x):
        x = x + self.sa(self.ln1(x))
        x = x + self.ffwd(self.ln2(x))
        return x

class MiniGPT(nn.Module):
    def __init__(self):
        super().__init__()
        self.token_emb = nn.Embedding(vocab_size, embedding_dim)
        self.pos_emb = nn.Embedding(block_size, embedding_dim)
        self.blocks = nn.Sequential(
            *[Block(embedding_dim, n_heads) for _ in range(n_layers)]
        )
        self.ln_f = nn.LayerNorm(embedding_dim)
        self.head = nn.Linear(embedding_dim, vocab_size)

    def forward(self, idx, targets=None):
        B, T = idx.shape
        # token + positional embeddings
        tok_emb = self.token_emb(idx)  # (B, T, embedding_dim)
        pos = torch.arange(T, device=device).unsqueeze(0)  # shape (1, T)
        pos_emb = self.pos_emb(pos)    # (1, T, embedding_dim)
        x = tok_emb + pos_emb         # (B, T, embedding_dim)

        x = self.blocks(x)
        x = self.ln_f(x)

        logits = self.head(x)  # (B, T, vocab_size)

        loss = None
        if targets is not None:
            # Flatten for cross-entropy
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    def generate(self, idx, max_new_tokens):
        """
        Autoregressive text generation given a starting token.
        """
        for _ in range(max_new_tokens):
            idx_cond = idx[:, -block_size:]  # crop context
            logits, _ = self(idx_cond)
            logits = logits[:, -1, :]  # take last time step
            probs = F.softmax(logits, dim=-1)
            idx_next = torch.multinomial(probs, num_samples=1)
            idx = torch.cat((idx, idx_next), dim=1)
        return idx

In [7]:
# =========================
#   5. TRAINING LOOP
# =========================

model = MiniGPT().to(device)
print("Model Parameter Count:", sum(p.numel() for p in model.parameters())/1e6, "M parameters")

optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

@torch.no_grad()
def estimate_loss():
    out = {}
    model.eval()
    for split in ['train', 'val']:
        losses = []
        for _ in range(eval_iters):
            X, Y = get_batch(split)
            _, loss = model(X, Y)
            losses.append(loss.item())
        out[split] = np.mean(losses)
    model.train()
    return out

for iter in range(max_iters):

    # Evaluate periodically
    if iter % eval_interval == 0:
        losses = estimate_loss()
        print(f"step {iter}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

    # Sample batch
    xb, yb = get_batch('train')

    # Forward & Backprop
    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

print("Training complete!")

Model Parameter Count: 6.864885 M parameters
step 0: train loss 7.0556, val loss 7.0605
step 200: train loss 2.4746, val loss 2.5108
step 400: train loss 2.4146, val loss 2.4284
step 600: train loss 2.3100, val loss 2.3502
step 800: train loss 2.1691, val loss 2.1799
step 1000: train loss 2.0519, val loss 2.0740
step 1200: train loss 1.9787, val loss 1.9870
step 1400: train loss 1.9003, val loss 1.9256
step 1600: train loss 1.8363, val loss 1.8778
step 1800: train loss 1.8064, val loss 1.8408
step 2000: train loss 1.7639, val loss 1.7923
step 2200: train loss 1.7275, val loss 1.7655
step 2400: train loss 1.6760, val loss 1.7401
step 2600: train loss 1.6511, val loss 1.6846
step 2800: train loss 1.6252, val loss 1.6852
step 3000: train loss 1.6078, val loss 1.6599
step 3200: train loss 1.5917, val loss 1.6471
step 3400: train loss 1.5770, val loss 1.6311
step 3600: train loss 1.5667, val loss 1.6164
step 3800: train loss 1.5535, val loss 1.6147
step 4000: train loss 1.5358, val loss 1.5

In [8]:
# =========================
#   6. TEXT GENERATION
# =========================

# Generate text from the trained model
start_context = torch.zeros((1, 1), dtype=torch.long, device=device)  # start token
generated = model.generate(start_context, max_new_tokens=200)[0].tolist()
print("Generated text:")
print(decode(generated))

Generated text:

 The Gocto moxe damage , recent that the final for Third Reyards is importeded in the order , the remined and singles in tissed out of Liuaster Hätt , however - was per mothernmer , particle to the Ha


In [9]:
!pip install -q huggingface_hub
from huggingface_hub import notebook_login

# Log in to Hugging Face
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [10]:
import torch

# Assuming 'model' is your trained model
model_save_path = "miniGPT.pth"
torch.save(model.state_dict(), model_save_path)

# Save model config
config = {
    "vocab_size": vocab_size,
    "embedding_dim": embedding_dim,
    "n_heads": n_heads,
    "n_layers": n_layers,
    "block_size": block_size,
    "dropout_rate": dropout_rate
}
config_save_path = "config.json"

import json
with open(config_save_path, "w") as f:
    json.dump(config, f)

print("Model and config saved!")

Model and config saved!


In [11]:
from huggingface_hub import HfApi

repo_name = "miniGPT"  # Change this to your model's name
hf_username = "Santthosh"  # Replace with your Hugging Face username

# Create a new repo (only the first time)
api = HfApi()
api.create_repo(repo_id=f"{hf_username}/{repo_name}", exist_ok=True)

# Upload model and config
from huggingface_hub import upload_file

upload_file(
    path_or_fileobj=model_save_path,
    path_in_repo="miniGPT.pth",
    repo_id=f"{hf_username}/{repo_name}",
)

upload_file(
    path_or_fileobj=config_save_path,
    path_in_repo="config.json",
    repo_id=f"{hf_username}/{repo_name}",
)

print(f"Model uploaded to: https://huggingface.co/{hf_username}/{repo_name}")

miniGPT.pth:   0%|          | 0.00/31.8M [00:00<?, ?B/s]

Model uploaded to: https://huggingface.co/Santthosh/miniGPT


In [12]:
from huggingface_hub import hf_hub_download
import torch
import json

repo_id = "Santthosh/miniGPT"

# Load model weights
model_path = hf_hub_download(repo_id=repo_id, filename="miniGPT.pth")
state_dict = torch.load(model_path)

# Load model config
config_path = hf_hub_download(repo_id=repo_id, filename="config.json")
with open(config_path, "r") as f:
    config = json.load(f)

print("Loaded model and config from Hugging Face!")

miniGPT.pth:   0%|          | 0.00/31.8M [00:00<?, ?B/s]

  state_dict = torch.load(model_path)


config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

Loaded model and config from Hugging Face!


In [20]:
import torch.nn as nn
import torch

class MiniGPT(nn.Module):
    def __init__(self, config):
        super().__init__()
        embedding_dim = config["embedding_dim"]
        n_heads = config["n_heads"]
        n_layers = config["n_layers"]
        self.token_emb = nn.Embedding(config["vocab_size"], embedding_dim)
        self.pos_emb = nn.Embedding(config["block_size"], embedding_dim)
        self.blocks = nn.Sequential(
            *[Block(embedding_dim, n_heads) for _ in range(n_layers)]
        )
        self.ln_f = nn.LayerNorm(embedding_dim)
        self.head = nn.Linear(embedding_dim, config["vocab_size"])

    def forward(self, idx, targets=None):
        B, T = idx.shape
        tok_emb = self.token_emb(idx)
        pos = torch.arange(T, device=idx.device).unsqueeze(0)
        pos_emb = self.pos_emb(pos)
        x = tok_emb + pos_emb
        x = self.blocks(x)
        x = self.ln_f(x)
        logits = self.head(x)
        return logits

    def generate(self, idx, max_new_tokens):
        """Generate text autoregressively"""
        for _ in range(max_new_tokens):
            idx_cond = idx[:, -config["block_size"]:]
            logits = self(idx_cond)[:, -1, :]
            probs = torch.nn.functional.softmax(logits, dim=-1)
            idx_next = torch.multinomial(probs, num_samples=1)
            idx = torch.cat((idx, idx_next), dim=1)
        return idx

In [21]:
# Load the weights

# Create model instance
model = MiniGPT(config)

# Load saved weights
model.load_state_dict(torch.load(model_path, map_location=torch.device("cpu")))

# Set model to evaluation mode
model.eval()

print("Model loaded successfully!")

Model loaded successfully!


  model.load_state_dict(torch.load(model_path, map_location=torch.device("cpu")))


In [23]:
import torch

# Reuse your char-level dictionaries and encode/decode functions
# (defined in your data preparation code):
# ------------------------------------------------
# char_to_idx = { ch: i for i, ch in enumerate(chars) }
# idx_to_char = { i: ch for i, ch in enumerate(chars) }

def encode_char_level(text, char_to_idx):
    """Encodes a string into a list of character IDs."""
    return [char_to_idx[c] for c in text]

def decode_char_level(token_ids, idx_to_char):
    """Decodes a list of character IDs back into a string."""
    return ''.join(idx_to_char[i] for i in token_ids)

# Example prompt
prompt = "Hello world"

# Encode the prompt at the character level
input_ids = torch.tensor([encode_char_level(prompt, char_to_idx)], dtype=torch.long)  # shape: (1, seq_len)

# Now generate text with your trained model
# (Assuming 'model' is already defined and loaded)
output_ids = model.generate(
    input_ids,
    max_new_tokens=50,       # how many new tokens to generate
    # Optional parameters like temperature, top_k, etc. can go here
)

# The model outputs shape: (1, original_seq_len + new_tokens)
# Decode the output into a string
generated_text = decode_char_level(output_ids[0].tolist(), idx_to_char)

print("Generated text:", generated_text)

Generated text: Hello world old up to play develop more than 14 @i / 3 ( 1944
