<a href="https://colab.research.google.com/github/safaet/mufti-llm/blob/main/LLM_PathwayCode.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Text -> Tokenize -> Embed -> Transformer -> Predict next token**

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F

# ------------------------------
# 1. TEXT & TOKENIZATION
# ------------------------------

text = "hello world Chattogram bangladesh"
# The vocabulary is every distinct character of the corpus, in sorted order.
chars = sorted(set(text))
vocab_size = len(chars)

# Mappings: char <-> index
stoi = dict(zip(chars, range(vocab_size)))
itos = dict(enumerate(chars))

def encode(s):
    """Return the list of vocabulary indices for the characters of ``s``.

    Raises ``KeyError`` if ``s`` contains a character that is not in ``stoi``.
    """
    return [stoi[symbol] for symbol in s]

def decode(ids):
    """Return the string spelled by the vocabulary indices in ``ids``.

    Raises ``KeyError`` if an index is not present in ``itos``.
    """
    return ''.join(itos[token_id] for token_id in ids)

# Example data: the whole corpus as a 1-D tensor of token indices.
data = torch.tensor(encode(text), dtype=torch.long)

# ------------------------------
# 2. MODEL DEFINITION
# ------------------------------

class MiniTransformerBlock(nn.Module):
    """A single post-LayerNorm transformer block with causal self-attention.

    The block applies multi-head self-attention followed by a two-layer
    feed-forward network, each wrapped in a residual connection and a
    LayerNorm. Attention is causally masked so that position ``t`` can only
    attend to positions ``<= t``; without the mask a next-token model could
    read the very token it is asked to predict.

    Args:
        embed_dim: Size of the feature dimension of the input and output.
        num_heads: Number of attention heads (must divide ``embed_dim``).
    """

    def __init__(self, embed_dim, num_heads):
        super().__init__()
        self.attn = nn.MultiheadAttention(embed_dim, num_heads, batch_first=True)
        self.ln1 = nn.LayerNorm(embed_dim)
        self.ff = nn.Sequential(
            nn.Linear(embed_dim, embed_dim),
            nn.ReLU(),
            nn.Linear(embed_dim, embed_dim)
        )
        self.ln2 = nn.LayerNorm(embed_dim)

    def forward(self, x):
        """Run the block on ``x`` and return a tensor of the same shape.

        ``x`` is laid out as ``(batch, seq, embed_dim)`` because the attention
        layer is built with ``batch_first=True`` (an unbatched
        ``(seq, embed_dim)`` input also works).
        """
        # The sequence axis is second-to-last for batched and unbatched input.
        seq_len = x.size(-2)
        # Boolean mask: True marks (query, key) pairs that must NOT attend,
        # i.e. every key strictly in the future of the query.
        causal_mask = torch.triu(
            torch.ones(seq_len, seq_len, dtype=torch.bool, device=x.device),
            diagonal=1,
        )
        attn_output, _ = self.attn(
            x, x, x, attn_mask=causal_mask, need_weights=False
        )
        x = self.ln1(x + attn_output)
        x = self.ln2(x + self.ff(x))
        return x

class MiniLLM(nn.Module):
    """A minimal language model: embeddings -> one transformer block -> logits.

    Args:
        vocab_size: Number of distinct tokens; also the size of the output
            logits' last dimension.
        embed_dim: Width of the token and position embeddings.
        block_size: Maximum sequence length, i.e. the number of rows in the
            learned position-embedding table.
        num_heads: Number of attention heads in the transformer block.
    """

    def __init__(self, vocab_size, embed_dim=32, block_size=8, num_heads=2):
        super().__init__()
        self.embed_dim = embed_dim
        self.block_size = block_size

        self.token_embedding = nn.Embedding(vocab_size, embed_dim)
        self.position_embedding = nn.Embedding(block_size, embed_dim)

        self.transformer = MiniTransformerBlock(embed_dim, num_heads)
        self.lm_head = nn.Linear(embed_dim, vocab_size)

    def forward(self, idx):
        """Return next-token logits for a batch of token indices.

        Args:
            idx: Integer tensor of shape ``(batch, seq)`` holding token ids.

        Returns:
            Tensor of shape ``(batch, seq, vocab_size)`` with unnormalized
            scores for the token following each position.

        Raises:
            ValueError: If ``seq`` exceeds ``block_size``; there is no
                position embedding for positions beyond that length.
        """
        _, seq_len = idx.shape
        if seq_len > self.block_size:
            # Fail early with a clear message instead of an opaque
            # out-of-range lookup inside the position-embedding table.
            raise ValueError(
                f"sequence length {seq_len} exceeds block_size {self.block_size}"
            )

        token_embed = self.token_embedding(idx)
        pos = torch.arange(seq_len, device=idx.device)
        pos_embed = self.position_embedding(pos)
        # pos_embed is (seq, embed_dim) and broadcasts over the batch axis.
        x = token_embed + pos_embed

        x = self.transformer(x)
        logits = self.lm_head(x)
        return logits

# ------------------------------
# 3. TRAINING SETUP
# ------------------------------

# Character-level model sized to the corpus vocabulary (other sizes default).
model = MiniLLM(vocab_size=vocab_size)
# Adam over every model parameter, with learning rate 0.01.
optimizer = torch.optim.Adam(params=model.parameters(), lr=0.01)
# Cross-entropy between predicted logits and the true next-token ids.
loss_fn = nn.CrossEntropyLoss()

def get_batch(seq_len=8):
    """Sample one random training window from ``data``.

    Args:
        seq_len: Number of tokens in the input window.

    Returns:
        A pair ``(x, y)`` of tensors with shape ``(1, seq_len)``, where ``y``
        is ``x`` shifted one token to the right (the next-token targets).

    Raises:
        ValueError: If ``data`` has fewer than ``seq_len + 1`` tokens, so no
            input/target pair of that length exists.
    """
    # Valid starts are 0 .. len(data) - seq_len - 1 inclusive, because the
    # target slice needs one extra token after the input window.
    num_windows = len(data) - seq_len
    if num_windows < 1:
        raise ValueError(
            f"data has {len(data)} tokens; need at least {seq_len + 1} "
            f"for seq_len={seq_len}"
        )
    # torch.randint's upper bound is exclusive, so passing num_windows makes
    # the last valid window reachable.
    start = torch.randint(0, num_windows, (1,)).item()
    x = data[start:start+seq_len]
    y = data[start+1:start+seq_len+1]
    return x.unsqueeze(0), y.unsqueeze(0)  # (B=1, T)

# ------------------------------
# 4. TRAINING LOOP
# ------------------------------

for step in range(500):
    # Clear gradients left over from the previous iteration before computing
    # new ones.
    optimizer.zero_grad()

    # One random window; y holds the next-token target for each position of x.
    x, y = get_batch()
    logits = model(x)

    # Flatten (B, T, vocab) -> (B*T, vocab) and (B, T) -> (B*T,) for the loss.
    loss = loss_fn(logits.reshape(-1, vocab_size), y.reshape(-1))
    loss.backward()
    optimizer.step()

    # Report progress every 50 steps.
    if not step % 50:
        print(f"Step {step}: loss = {loss.item():.4f}")

# ------------------------------
# 5. TEXT GENERATION
# ------------------------------

def generate(model, start_text='h', max_new_tokens=20):
    """Sample a continuation of ``start_text`` from ``model``.

    The model is switched to eval mode (and left there). Tokens are sampled
    one at a time from the softmax of the logits at the last position, and
    only the most recent ``model.block_size`` tokens are fed back as context.

    Args:
        model: Module mapping a ``(1, T)`` tensor of token ids to logits of
            shape ``(1, T, vocab)``; must expose a ``block_size`` attribute.
        start_text: Non-empty prompt; every character must be encodable.
        max_new_tokens: Number of tokens to append to the prompt.

    Returns:
        The prompt followed by the sampled tokens, decoded to a string.

    Raises:
        ValueError: If ``start_text`` is empty (there is no last position to
            sample from).
    """
    if not start_text:
        raise ValueError("start_text must contain at least one character")

    model.eval()
    # Create the prompt on the same device as the model's parameters so that
    # generation also works when the model has been moved off the CPU.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cpu")
    idx = torch.tensor([encode(start_text)], dtype=torch.long, device=device)

    # Sampling needs no gradients; disabling autograd avoids building a graph
    # on every forward pass.
    with torch.no_grad():
        for _ in range(max_new_tokens):
            idx_crop = idx[:, -model.block_size:]
            logits = model(idx_crop)
            logits = logits[:, -1, :]  # last time step
            probs = F.softmax(logits, dim=-1)
            next_id = torch.multinomial(probs, num_samples=1)
            idx = torch.cat([idx, next_id], dim=1)

    return decode(idx[0].tolist())

print("\nGenerated Text:")
print(generate(model, start_text="h"))


Step 0: loss = 2.5907
Step 50: loss = 0.4702
Step 100: loss = 0.4357
Step 150: loss = 0.0509
Step 200: loss = 0.2567
Step 250: loss = 0.0602
Step 300: loss = 0.3255
Step 350: loss = 0.3148
Step 400: loss = 0.2262
Step 450: loss = 0.8030

Generated Text:
hanglattttograd Chatt
