In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from tqdm import tqdm
import requests


In [None]:
# ------------------------------------------
# 1️⃣ Load and prepare dataset
# ------------------------------------------
url = "https://www.gutenberg.org/files/11/11-0.txt"  # Alice in Wonderland
text = requests.get(url).text.lower()
print("Sample text:\n", text[:500])

In [5]:
# Simple character-level tokenization
chars = sorted(list(set(text)))
vocab_size = len(chars)
print(chars)
print("Vocab size:", vocab_size)

['\n', ' ', '!', '(', ')', '*', ',', '-', '.', '0', '1', '3', ':', ';', '?', '[', ']', '_', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 'ù', '—', '‘', '’', '“', '”']
Vocab size: 50


In [9]:
stoi = {ch: i for i, ch in enumerate(chars)}
itos = {i: ch for ch, i in stoi.items()}
def encode(s):
    return [stoi[c] for c in s]
def decode(l):
    return ''.join([itos[i] for i in l])
data = torch.tensor(encode(text), dtype=torch.long)
n = int(0.9 * len(data))
train_data = data[:n]
val_data = data[n:]

In [10]:
# ------------------------------------------
# 2️⃣ Define the model
# ------------------------------------------
class TinyGPT(nn.Module):
    def __init__(self, vocab_size, n_embd=64, n_head=4, n_layer=2, block_size=128):
        super().__init__()
        self.token_embedding = nn.Embedding(vocab_size, n_embd)
        self.pos_embedding = nn.Embedding(block_size, n_embd)
        self.blocks = nn.ModuleList([
            nn.TransformerEncoderLayer(d_model=n_embd, nhead=n_head)
            for _ in range(n_layer)
        ])
        self.ln_f = nn.LayerNorm(n_embd)
        self.head = nn.Linear(n_embd, vocab_size)
        self.block_size = block_size
    def forward(self, idx, targets=None):
        B, T = idx.size()
        token_emb = self.token_embedding(idx)
        pos = torch.arange(0, T, device=idx.device).unsqueeze(0)
        pos_emb = self.pos_embedding(pos)
        x = token_emb + pos_emb
        for block in self.blocks:
            x = block(x)
        x = self.ln_f(x)
        logits = self.head(x)
        if targets is not None:
            loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1))
            return logits, loss
        return logits


In [11]:
# ------------------------------------------
# 3️⃣ Initialize model & training settings
# ------------------------------------------
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = TinyGPT(vocab_size).to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=3e-4)
block_size = 128
batch_size = 64

In [13]:
# ------------------------------------------
# 4️⃣ Training loop
# ------------------------------------------
def get_batch(split):
    data_split = train_data if split == 'train' else val_data
    ix = torch.randint(len(data_split) - block_size, (batch_size,))
    x = torch.stack([data_split[i:i+block_size] for i in ix])
    y = torch.stack([data_split[i+1:i+block_size+1] for i in ix])
    return x.to(device), y.to(device)

for step in tqdm(range(1000)):
    xb, yb = get_batch('train')
    logits, loss = model(xb, yb)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    if step % 100 == 0:
        print(f"Step {step} | Loss: {loss.item():.4f}")


  0%|          | 1/1000 [00:00<16:05,  1.03it/s]

Step 0 | Loss: 4.0584


 10%|█         | 101/1000 [01:46<15:17,  1.02s/it]

Step 100 | Loss: 2.6255


 20%|██        | 201/1000 [03:34<11:18,  1.18it/s]

Step 200 | Loss: 2.5219


 30%|███       | 301/1000 [05:08<11:32,  1.01it/s]

Step 300 | Loss: 2.4799


 40%|████      | 401/1000 [06:50<13:53,  1.39s/it]

Step 400 | Loss: 2.4204


 50%|█████     | 501/1000 [08:39<09:26,  1.14s/it]

Step 500 | Loss: 2.4511


 60%|██████    | 601/1000 [10:29<07:26,  1.12s/it]

Step 600 | Loss: 2.4399


 70%|███████   | 701/1000 [12:15<05:02,  1.01s/it]

Step 700 | Loss: 2.4330


 80%|████████  | 801/1000 [14:02<03:29,  1.05s/it]

Step 800 | Loss: 2.4074


 90%|█████████ | 901/1000 [15:55<02:01,  1.22s/it]

Step 900 | Loss: 2.4240


100%|██████████| 1000/1000 [17:41<00:00,  1.06s/it]


In [17]:
# ------------------------------------------
# 5️⃣ Text generation
# ------------------------------------------
model.eval()
context = torch.zeros((1, 1), dtype=torch.long, device=device)
generated = []

for _ in range(500):

    # Keep only the last block_size tokens
    if context.size(1) > model.block_size:
        context = context[:, -model.block_size:]

    logits = model(context)

    # last time step
    logits_last = logits[:, -1, :]

    probs = F.softmax(logits_last, dim=-1)

    next_id = torch.multinomial(probs, num_samples=1)

    context = torch.cat((context, next_id), dim=1)

    generated.append(next_id.item())

print(decode(generated))

osouiin s wn woh  camin hiof  t oinaracs ito aragr th’d t psitesof lil t andy te s caiti s. tonurooup asthane,tt ad housldi d a as s arit d,oolasonds teelin tp woo t t t’t t!t s s i shetor athatero p,velid n nden  w weake do,ter maltt t ad t se no as bse w totg di sera d urerus juderaborsoutal tend in lro iman.e
 iceldu mml tm-cade rm mp, and t hith bre
n the d mkincy bee aceitrele oug tcecadriseneo y at owerer,  oeis w wh st o h so at ng santseu as lo yo o or h ore o se tw te m le id ur touond 
