# MiniGPT Colab Notebook

This notebook builds a **small GPT-style language model** from scratch (causal / autoregressive).

It first tries to read the training text from a locally uploaded file **(if present and readable as UTF-8 text)**; otherwise it downloads a public-domain book from Project Gutenberg.

**Local uploaded file path included (per your session):**

`/mnt/data/A_2D_digital_illustration_diagram_depicts_an_Artif.png`

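If the local file is missing or cannot be read as UTF-8 text (the path above points to a PNG image, so this will normally be the case), the notebook automatically falls back to downloading *Alice in Wonderland* from Project Gutenberg.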

Run the cells sequentially. Adjust the model hyperparameters in the `Config` class to scale up/down; training settings (`batch_size`, `steps`, learning rate) are set in the training cell.

In [None]:
# Install dependencies (Colab-friendly)
!pip install torch tqdm requests --quiet

import os
import torch
import torch.nn as nn
import torch.nn.functional as F
from tqdm import tqdm
import requests
print('Torch version:', torch.__version__)


In [None]:
# ---------------------------
# 1) Load text dataset
# ---------------------------
# Path of the uploaded file to try first. NOTE: the default points at a PNG
# image, which cannot be decoded as UTF-8, so the Gutenberg fallback below is
# used unless this is changed to a real text file.
LOCAL_FILE = r"/mnt/data/A_2D_digital_illustration_diagram_depicts_an_Artif.png"
GUTENBERG_URL = 'https://www.gutenberg.org/files/11/11-0.txt'  # fallback (Alice in Wonderland)

text = None
if os.path.exists(LOCAL_FILE):
    try:
        with open(LOCAL_FILE, 'r', encoding='utf-8') as f:
            text = f.read()
    except (OSError, ValueError) as e:
        # OSError: unreadable path / directory. ValueError: includes
        # UnicodeDecodeError, raised when the file is binary rather than text.
        print('Local file exists but could not be read as text (will use Gutenberg). Error:', e)
    else:
        print('Loaded local file as text from:', LOCAL_FILE)

if text is None:
    print('Downloading fallback text from Gutenberg...')
    # A timeout keeps the cell from hanging forever on a stalled connection.
    response = requests.get(GUTENBERG_URL, timeout=30)
    # Fail loudly on 4xx/5xx instead of training on an HTTP error page.
    response.raise_for_status()
    text = response.text

# Drop a leading byte-order mark so it does not become a vocabulary entry.
text = text.lstrip('\ufeff')
# Normalize line endings. CRLF must be collapsed first: replacing a bare '\r'
# alone would turn every '\r\n' into two newlines.
text = text.replace('\r\n', '\n').replace('\r', '\n')
print('\nSample (first 400 chars):\n')
print(text[:400])


In [None]:
# ---------------------------
# 2) Tokenization (character-level for learning)
# ---------------------------
# Vocabulary: every distinct character of the corpus, in sorted order.
chars = sorted(set(text))
vocab_size = len(chars)
print('Vocab size (characters):', vocab_size)

# Lookup tables between characters and integer ids (id == index in `chars`).
itos = dict(enumerate(chars))
stoi = {ch: idx for idx, ch in itos.items()}

def encode(s):
    """Return the list of integer ids for the characters of `s`, looked up in `stoi`."""
    return list(map(stoi.__getitem__, s))
def decode(l):
    """Return the string formed by looking up each id of `l` in `itos` and concatenating."""
    return ''.join(map(itos.__getitem__, l))

# Encode the whole corpus once; the first 90% of tokens are used for training
# and the remainder is held out for validation.
data = torch.tensor(encode(text), dtype=torch.long)
n = int(0.9 * data.size(0))
train_data, val_data = data[:n], data[n:]
print('Train tokens:', train_data.size(0), 'Val tokens:', val_data.size(0))


In [None]:
# ---------------------------
# 3) MiniGPT implementation (causal)
# ---------------------------
import math

class Config:
    """Hyperparameters for the MiniGPT model, stored as class attributes."""
    vocab_size = len(chars)  # number of distinct characters in the corpus
    block_size = 128  # maximum sequence length; also the size of the position-embedding table
    n_layer = 4  # number of stacked GPTBlock layers
    n_head = 4  # attention heads per block; must divide n_embd evenly
    n_embd = 256  # embedding width
    dropout = 0.1  # dropout probability used by the embeddings, attention and MLP

# Single shared configuration instance used to build the model below.
cfg = Config()

def causal_mask(T, device):
    """Build an additive causal attention mask.

    Returns a float tensor of shape (1, T, T) on `device` that holds 0.0 on
    and below the main diagonal and -inf strictly above it, so that position
    i can only attend to positions <= i once the mask is added to the scores.
    """
    blocked = torch.full((T, T), float('-inf'), device=device)
    # triu(diagonal=1) keeps the strictly-upper triangle and zeroes the rest.
    return torch.triu(blocked, diagonal=1).unsqueeze(0)

class GPTBlock(nn.Module):
    """Pre-LayerNorm transformer block: self-attention followed by an MLP.

    Each sub-layer is applied to a LayerNorm-ed copy of the input and added
    back through a residual connection.

    Args:
        n_embd: Embedding width; must be divisible by ``n_head``.
        n_head: Number of attention heads.
        dropout: Dropout probability used inside attention and after the MLP.
    """

    def __init__(self, n_embd, n_head, dropout):
        super().__init__()
        assert n_embd % n_head == 0, (
            f"n_embd ({n_embd}) must be divisible by n_head ({n_head})"
        )
        self.ln1 = nn.LayerNorm(n_embd)
        self.attn = nn.MultiheadAttention(embed_dim=n_embd, num_heads=n_head, dropout=dropout, batch_first=True)
        self.ln2 = nn.LayerNorm(n_embd)
        # Position-wise feed-forward network with the usual 4x expansion.
        self.mlp = nn.Sequential(
            nn.Linear(n_embd, 4 * n_embd),
            nn.GELU(),
            nn.Linear(4 * n_embd, n_embd),
            nn.Dropout(dropout)
        )

    def forward(self, x, attn_mask=None):
        """Apply the block to ``x``.

        Args:
            x: Batch-first input of shape (batch, seq, n_embd).
            attn_mask: Optional attention mask forwarded unchanged to
                ``nn.MultiheadAttention``.

        Returns:
            Tensor with the same shape as ``x``.
        """
        x_norm = self.ln1(x)
        # The attention weights are discarded, so do not compute them:
        # need_weights=False lets PyTorch use scaled_dot_product_attention.
        attn_out, _ = self.attn(x_norm, x_norm, x_norm, attn_mask=attn_mask, need_weights=False)
        x = x + attn_out
        x = x + self.mlp(self.ln2(x))
        return x

class MiniGPT(nn.Module):
    """Small decoder-only (causal) transformer language model.

    Token and learned position embeddings are summed, passed through
    ``cfg.n_layer`` GPTBlock layers and a final LayerNorm, then projected to
    vocabulary logits by a linear head whose weight is tied to the token
    embedding.

    Args:
        cfg: Object exposing ``vocab_size``, ``block_size``, ``n_embd``,
            ``n_head``, ``n_layer`` and ``dropout``.
    """

    def __init__(self, cfg):
        super().__init__()
        self.vocab_size = cfg.vocab_size
        self.block_size = cfg.block_size
        self.n_embd = cfg.n_embd
        self.tok_emb = nn.Embedding(self.vocab_size, self.n_embd)
        self.pos_emb = nn.Embedding(self.block_size, self.n_embd)
        self.drop = nn.Dropout(cfg.dropout)
        self.blocks = nn.ModuleList([GPTBlock(self.n_embd, cfg.n_head, cfg.dropout) for _ in range(cfg.n_layer)])
        self.ln_f = nn.LayerNorm(self.n_embd)
        self.head = nn.Linear(self.n_embd, self.vocab_size, bias=False)
        # Weight tying: the output projection shares the token-embedding matrix.
        self.head.weight = self.tok_emb.weight
        self._init_weights()

    def _init_weights(self):
        """Re-initialize every parameter with more than one dimension from N(0, 0.02^2).

        One-dimensional parameters (biases, LayerNorm weights) keep their
        PyTorch defaults.
        """
        for p in self.parameters():
            if p.dim() > 1:
                nn.init.normal_(p, mean=0.0, std=0.02)

    def forward(self, idx, targets=None):
        """Compute next-token logits and, optionally, the training loss.

        Args:
            idx: Integer token ids of shape (B, T) with T <= ``block_size``.
            targets: Optional token ids with one target per position of
                ``idx``; when given, the cross-entropy loss is also returned.

        Returns:
            ``logits`` of shape (B, T, vocab_size) when ``targets`` is None,
            otherwise the tuple ``(logits, loss)``.

        Raises:
            ValueError: If T exceeds ``block_size``.
        """
        B, T = idx.size()
        if T > self.block_size:
            raise ValueError(f"Cannot forward: sequence length T={T} > block_size={self.block_size}")
        device = idx.device
        tok = self.tok_emb(idx)
        pos = self.pos_emb(torch.arange(T, device=device)).unsqueeze(0)
        x = self.drop(tok + pos)
        # (T, T) additive mask: -inf above the diagonal hides future positions.
        attn_mask = causal_mask(T, device=device).squeeze(0)
        for block in self.blocks:
            x = block(x, attn_mask=attn_mask)
        x = self.ln_f(x)
        logits = self.head(x)
        if targets is None:
            return logits
        # reshape (rather than view) also accepts non-contiguous targets.
        loss = F.cross_entropy(logits.reshape(-1, logits.size(-1)), targets.reshape(-1))
        return logits, loss

    @torch.no_grad()
    def generate(self, idx, max_new_tokens=200, temperature=1.0, top_k=None):
        """Autoregressively sample ``max_new_tokens`` tokens after ``idx``.

        The model is put in eval mode while sampling and its previous
        train/eval mode is restored afterwards, so calling this in the middle
        of training does not silently leave dropout disabled.

        Args:
            idx: Integer token ids of shape (B, T) used as the prompt.
            max_new_tokens: Number of tokens to append.
            temperature: Divisor applied to the logits before softmax.
                Non-positive values are treated as 1.0.
            top_k: If given, sample only from the ``top_k`` highest-scoring
                tokens (clamped to the vocabulary size).

        Returns:
            Tensor of shape (B, T + max_new_tokens) holding the prompt
            followed by the sampled tokens, on the model's device.

        Raises:
            ValueError: If ``top_k`` is given but smaller than 1.
        """
        if top_k is not None and top_k < 1:
            raise ValueError(f"top_k must be a positive integer or None, got {top_k}")
        was_training = self.training
        self.eval()
        try:
            device = next(self.parameters()).device
            idx = idx.to(device)
            # Guard against division by zero / sign flips of the logits.
            scale = temperature if temperature > 0 else 1.0
            k = None if top_k is None else min(top_k, self.vocab_size)
            for _ in range(max_new_tokens):
                # Only the last block_size tokens fit in the position table.
                idx_cond = idx[:, -self.block_size:]
                logits_last = self(idx_cond)[:, -1, :] / scale
                if k is not None:
                    # Mask everything below the k-th largest logit of each row.
                    kth = torch.topk(logits_last, k).values[:, -1:]
                    logits_last = logits_last.masked_fill(logits_last < kth, float('-inf'))
                probs = F.softmax(logits_last, dim=-1)
                next_token = torch.multinomial(probs, num_samples=1)
                idx = torch.cat((idx, next_token), dim=1)
        finally:
            self.train(was_training)
        return idx


In [None]:
# ---------------------------
# 4) Training loop (small demo)
# ---------------------------
# Run on the GPU when one is available, otherwise on the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# Batch geometry: sequences per batch and tokens per sequence.
batch_size = 64
block_size = cfg.block_size

# Build the model on the chosen device and attach an AdamW optimizer.
model = MiniGPT(cfg)
model = model.to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=3e-4)

def get_batch(split):
    """Sample a random batch of (input, target) token windows.

    Uses `train_data` when `split` is 'train' and `val_data` for any other
    value. Returns `(x, y)`, both of shape (batch_size, block_size) on
    `device`, where `y` is `x` shifted one token to the right in the corpus.
    """
    source = train_data if split == 'train' else val_data
    starts = torch.randint(0, len(source) - block_size, (batch_size,))
    # Row r holds the indices starts[r], starts[r] + 1, ..., starts[r] + block_size - 1.
    window = starts.unsqueeze(1) + torch.arange(block_size)
    return source[window].to(device), source[window + 1].to(device)

steps = 1000
model.train()
for step in tqdm(range(steps)):
    # Clear gradients from the previous iteration before the new pass.
    optimizer.zero_grad()
    xb, yb = get_batch('train')
    logits, loss = model(xb, targets=yb)
    loss.backward()
    optimizer.step()
    # Report the training loss on every 100th step (including step 0).
    if not step % 100:
        print(f"step {step} loss {loss.item():.4f}")


In [None]:
# ---------------------------
# 5) Generation example
# ---------------------------
model.eval()
prompt = 'The rabbit'
# Encode the prompt as a single-row batch of token ids on `device`.
context = torch.tensor(encode(prompt), dtype=torch.long, device=device).unsqueeze(0)
out = model.generate(context, max_new_tokens=300, temperature=0.8, top_k=40)
# Decode the first (only) row: the prompt followed by the sampled continuation.
generated_text = decode(out[0].tolist())
print('\n=== Generated Text ===\n', generated_text, sep='\n')
