<a href="https://colab.research.google.com/github/snpsuen/Deep_Learning_Data/blob/main/script/MiniGPT_example03.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!wget https://www.gutenberg.org/cache/epub/1504/pg1504.txt -O corpus.txt
import torch
import torch.nn as nn
import torch.nn.functional as F
import re

# --- Config ---
device = 'cuda' if torch.cuda.is_available() else 'cpu'
batch_size, block_size = 16, 64
max_iters, eval_interval = 500, 100
lr, n_embd, n_head, n_layer = 1e-3, 128, 4, 2
dropout = 0.1

--2025-08-08 19:12:51--  https://www.gutenberg.org/cache/epub/1504/pg1504.txt
Resolving www.gutenberg.org (www.gutenberg.org)... 152.19.134.47, 2610:28:3090:3000:0:bad:cafe:47
Connecting to www.gutenberg.org (www.gutenberg.org)|152.19.134.47|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 112380 (110K) [text/plain]
Saving to: ‘corpus.txt’


2025-08-08 19:12:51 (1.36 MB/s) - ‘corpus.txt’ saved [112380/112380]



In [2]:
# --- Load and tokenize corpus (word-level) ---
with open('corpus.txt', 'r', encoding='utf-8') as f:
    text = f.read()

# Tokenize into words and punctuation
words = re.findall(r"\b\w+\b|[^\w\s]", text)
vocab = sorted(set(words))
vocab_size = len(vocab)
stoi = {w:i for i,w in enumerate(vocab)}
itos = {i:w for w,i in stoi.items()}
encode = lambda s: [stoi[w] for w in re.findall(r"\b\w+\b|[^\w\s]", s) if w in stoi]
decode = lambda idxs: ' '.join([itos[i] for i in idxs])

print(f"type(words) = {type(words)}")
print(f"Vocab size (word-level): {vocab_size}")

# Convert full corpus to token IDs
data = torch.tensor(encode(text), dtype=torch.long)
x, y = data[:-1], data[1:]

print(f"type(encode(text)) = {type(encode(text))}")
print(f"type(data) = {type(data)}")
print(f"data.shape = {data.shape}")

# --- Batching ---
def get_batch():
    ix = torch.randint(len(x) - block_size, (batch_size,))
    xb = torch.stack([x[i:i+block_size] for i in ix]).to(device)
    yb = torch.stack([y[i:i+block_size] for i in ix]).to(device)
    return ix, xb, yb

ix, xb, yb = get_batch()
print(f"ix.shape = {ix.shape}")
print(f"xb.shape = {xb.shape}")
print(f"yb.shape = {yb.shape}")
print("ix = ", ix)
print("xb = ", xb)
print("yb = ", yb)


type(words) = <class 'list'>
Vocab size (word-level): 3387
type(encode(text)) = <class 'list'>
type(data) = <class 'torch.Tensor'>
data.shape = torch.Size([24688])
ix.shape = torch.Size([16])
xb.shape = torch.Size([16, 64])
yb.shape = torch.Size([16, 64])
ix =  tensor([10056, 15887, 15479,  4715, 21567, 16465, 16945, 17684, 15124,  8206,
        10305,  3509, 16407, 21051, 16715, 19036])
xb =  tensor([[2148, 1819, 1664,  ...,   41,  180,  427],
        [ 772, 1820, 1472,  ..., 3043, 3000,    0],
        [2653,    9,  446,  ...,    9,  316, 1303],
        ...,
        [ 316,  772,  696,  ..., 3035,    7, 2015],
        [2898,    9,  375,  ...,    7,  781,  872],
        [ 952, 2141, 3036,  ..., 1738, 1921,    9]], device='cuda:0')
yb =  tensor([[1819, 1664, 1836,  ...,  180,  427,  508],
        [1820, 1472, 3381,  ..., 3000,    0,   46],
        [   9,  446,    9,  ...,  316, 1303, 2269],
        ...,
        [ 772,  696, 2982,  ...,    7, 2015, 3054],
        [   9,  375,    9,  ..., 

In [3]:
# --- Model Components ---
class Head(nn.Module):
    def __init__(self, head_size):
        super().__init__()
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        B, T, C = x.shape
        k = self.key(x)
        q = self.query(x)
        wei = q @ k.transpose(-2, -1) * C**-0.5
        wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf'))
        wei = F.softmax(wei, dim=-1)
        wei = self.dropout(wei)
        v = self.value(x)
        return wei @ v

class MultiHead(nn.Module):
    def __init__(self):
        super().__init__()
        head_size = n_embd // n_head
        self.heads = nn.ModuleList([Head(head_size) for _ in range(n_head)])
        self.proj = nn.Linear(n_embd, n_embd)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        return self.dropout(self.proj(torch.cat([h(x) for h in self.heads], dim=-1)))

class FeedForward(nn.Module):
    def __init__(self):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(n_embd, 4 * n_embd),
            nn.ReLU(),
            nn.Linear(4 * n_embd, n_embd),
            nn.Dropout(dropout)
        )

    def forward(self, x):
        return self.net(x)

class Block(nn.Module):
    def __init__(self):
        super().__init__()
        self.ln1 = nn.LayerNorm(n_embd)
        self.ln2 = nn.LayerNorm(n_embd)
        self.sa = MultiHead()
        self.ff = FeedForward()

    def forward(self, x):
        x = x + self.sa(self.ln1(x))
        x = x + self.ff(self.ln2(x))
        return x

class MiniLLM(nn.Module):
    def __init__(self):
        super().__init__()
        self.tok_emb = nn.Embedding(vocab_size, n_embd)
        self.pos_emb = nn.Embedding(block_size, n_embd)
        self.blocks = nn.Sequential(*[Block() for _ in range(n_layer)])
        self.ln_f = nn.LayerNorm(n_embd)
        self.head = nn.Linear(n_embd, vocab_size)

    def forward(self, idx, targets, debug_print):
        B, T = idx.size()
        tok = self.tok_emb(idx)
        pos = self.pos_emb(torch.arange(T, device=device))
        x = tok + pos
        x = self.blocks(x)
        x = self.ln_f(x)
        logits = self.head(x)
        loss = F.cross_entropy(logits.view(-1, vocab_size), targets.view(-1)) if targets is not None else None

        if debug_print == 1:
            print("logits.shape = ", logits.shape)
            print("targets.shape = ", targets.shape)
            print("logits.view(-1, vocab_size).shape = ", logits.view(-1, vocab_size).shape)
            print("targets.view(-1).shape = ", targets.view(-1).shape)

        return logits, loss

    def generate(self, idx, max_new):
        for _ in range(max_new):
            idx_cond = idx[:, -block_size:]
            logits, _ = self(idx_cond, None, 0)
            probs = F.softmax(logits[:, -1], dim=-1)
            idx_next = torch.multinomial(probs, num_samples=1)
            idx = torch.cat([idx, idx_next], dim=1)
        return idx

# --- Initialize model ---
model = MiniLLM().to(device)
opt = torch.optim.AdamW(model.parameters(), lr=lr)

In [4]:
# --- Training ---
first = 1
for it in range(max_iters):
    ix, xb, yb = get_batch()
    if first == 1:
        first = 0
        _, loss = model(xb, yb, 1)
    else:
        _, loss = model(xb, yb, 0)
    opt.zero_grad()
    loss.backward()
    opt.step()
    if it % eval_interval == 0:
        print(f"Iter {it} | Loss: {loss.item():.4f}")

logits.shape =  torch.Size([16, 64, 3387])
targets.shape =  torch.Size([16, 64])
logits.view(-1, vocab_size).shape =  torch.Size([1024, 3387])
targets.view(-1).shape =  torch.Size([1024])
Iter 0 | Loss: 8.2586
Iter 100 | Loss: 5.3160
Iter 200 | Loss: 4.6808
Iter 300 | Loss: 4.0098
Iter 400 | Loss: 3.6249


In [5]:
# --- Interactive Prompt ---
print("\n🎭 MiniLLM Interactive Mode (word-level) — type 'exit' to quit.")
while True:
    prompt = input("\nYou > ").strip()
    if prompt.lower() in ['exit', 'quit']:
        print("Goodbye!")
        break
    if not prompt:
        continue
    try:
        print("type(encode(prompt)) = ", type(encode(prompt)))
        print("encode(prompt) = ", encode(prompt))
        context = torch.tensor([encode(prompt)], dtype=torch.long).to(device)
        print("context.shape = ", context.shape)
    except Exception as e:
        print(f"⚠️ Error: {e}")
        continue
    out = model.generate(context, max_new=50)[0]
    result = decode(out.tolist())
    print("\nMiniLLM >", result[len(prompt.split()):])



🎭 MiniLLM Interactive Mode (word-level) — type 'exit' to quit.

You > A long long time ago
type(encode(prompt)) =  <class 'list'>
encode(prompt) =  [42, 2072, 2072, 3072]
context.shape =  torch.Size([1, 4])

MiniLLM > g long time so ourselves , or remove at death and be dry . Proceed , one the lapwing fellow U . ADRIANA . _ ] SCENE I had not feel this service what stuff ” set forth in inn . Let whom I that have the man , including legal this sympathised

You > Once upon a time
type(encode(prompt)) =  <class 'list'>
encode(prompt) =  [437, 3175, 696, 3072]
context.shape =  torch.Size([1, 4])

MiniLLM >  upon a time by by e to charge for it , and proofread them on which light . His dines , displaying the law well accepted in paragraph 1 . full Project Gutenberg ’ s copy I tell me . A needy , but . E . Let him . E . E

You > Tonight is what it means to be young
type(encode(prompt)) =  <class 'list'>
encode(prompt) =  [1951, 3281, 1952, 2146, 3078, 869]
context.shape =  torch.Size([1, 6])