In [2]:
import torch
import torch.nn as nn
import torch.optim as optim

from torch.nn import functional as F 

In [3]:
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [4]:
import sentencepiece as spm

spm.SentencePieceTrainer.train(
    input='input.txt',
    model_prefix='mymodel',
    vocab_size=10000,
    model_type='bpe'  # or 'unigram', 'char', 'word'
)


sentencepiece_trainer.cc(78) LOG(INFO) Starts training with : 
trainer_spec {
  input: input.txt
  input_format: 
  model_prefix: mymodel
  model_type: BPE
  vocab_size: 10000
  self_test_sample_size: 0
  character_coverage: 0.9995
  input_sentence_size: 0
  shuffle_input_sentence: 1
  seed_sentencepiece_size: 1000000
  shrinking_factor: 0.75
  max_sentence_length: 4192
  num_threads: 16
  num_sub_iterations: 2
  max_sentencepiece_length: 16
  split_by_unicode_script: 1
  split_by_number: 1
  split_by_whitespace: 1
  split_digits: 0
  pretokenization_delimiter: 
  treat_whitespace_as_suffix: 0
  allow_whitespace_only_pieces: 0
  required_chars: 
  byte_fallback: 0
  vocabulary_output_piece_score: 1
  train_extremely_large_corpus: 0
  seed_sentencepieces_file: 
  hard_vocab_limit: 1
  use_all_vocab: 0
  unk_id: 0
  bos_id: 1
  eos_id: 2
  pad_id: -1
  unk_piece: <unk>
  bos_piece: <s>
  eos_piece: </s>
  pad_piece: <pad>
  unk_surface:  ⁇ 
  enable_differential_privacy: 0
  differential

e_model_trainer.cc(268) LOG(INFO) Added: freq=3 size=8620 all=21531 active=1065 piece=▁rabble
bpe_model_trainer.cc(268) LOG(INFO) Added: freq=3 size=8640 all=21516 active=1050 piece=▁sheets
bpe_model_trainer.cc(268) LOG(INFO) Added: freq=3 size=8660 all=21500 active=1034 piece=▁talked
bpe_model_trainer.cc(268) LOG(INFO) Added: freq=3 size=8680 all=21489 active=1023 piece=▁verity
bpe_model_trainer.cc(268) LOG(INFO) Added: freq=3 size=8700 all=21483 active=1017 piece=▁Bristol
bpe_model_trainer.cc(159) LOG(INFO) Updating active symbols. max_freq=3 min_freq=2
bpe_model_trainer.cc(268) LOG(INFO) Added: freq=3 size=8720 all=21463 active=1055 piece=▁Tarquin
bpe_model_trainer.cc(268) LOG(INFO) Added: freq=3 size=8740 all=21444 active=1036 piece=▁bewitch
bpe_model_trainer.cc(268) LOG(INFO) Added: freq=3 size=8760 all=21428 active=1020 piece=▁crowned
bpe_model_trainer.cc(268) LOG(INFO) Added: freq=3 size=8780 all=21417 active=1009 piece=▁fitting
bpe_model_trainer.cc(268) LOG(INFO) Added: freq=3 

In [5]:
import sentencepiece as spm

sp = spm.SentencePieceProcessor()
sp.load("mymodel.model")

True

In [6]:
text = "I love natural language processing."
ids = sp.encode(text, out_type=int)
pieces = sp.encode(text, out_type=str)

print("IDs:", ids)
print("Pieces:", pieces)

IDs: [19, 277, 3901, 3915, 5397, 47, 9965]
Pieces: ['▁I', '▁love', '▁natural', '▁language', '▁process', 'ing', '.']


In [7]:
sp.decode(19)

'I'

In [8]:
file = open("input.txt","r")
text = file.read()

In [9]:
data = torch.tensor(sp.encode(text), dtype=torch.long)
n = int(0.9 * len(data))  # first 90% will be train, rest val
train_data = data[:n]
val_data = data[n:]

In [85]:
# data loading
batch_size = 64
block_size = 60
vocab_size = 10000
embed_dim = 128
n_head = 4
n_layer = 4
dropout=0.2

In [68]:
def get_batch(split):
    # generate a small batch of data of inputs x and targets y
    data = train_data if split == "train" else val_data
    ix = torch.randint(len(data) - block_size, (batch_size,))
    x = torch.stack([data[i : i + block_size] for i in ix])
    y = torch.stack([data[i + 1 : i + block_size + 1] for i in ix])
    x, y = x.to(DEVICE), y.to(DEVICE)
    return x, y

In [69]:
e = nn.Embedding(vocab_size, embed_dim)
e(train_data[:10])

tensor([[-0.2773, -1.8308,  2.1722,  0.5854,  1.8425, -0.7005, -0.5679, -0.2311,
          0.0426,  1.8937, -1.8211, -1.3948,  0.7405, -0.3848, -0.4306, -0.1600,
         -0.3474, -0.2074,  0.4960, -0.1172, -1.4797,  0.3060,  2.7808, -1.5915,
         -1.1642, -0.5405,  0.1390, -1.1464, -0.1799,  1.8082, -0.2283,  0.5431,
         -0.8814, -0.2258, -1.2358, -1.1219,  0.0656,  0.3867, -0.2485,  0.4815,
          0.7332, -1.4192, -1.3743, -2.3098,  1.1624,  1.6285, -0.4187,  0.1205,
         -0.5820, -0.2939, -0.6509, -0.7882, -1.0821,  0.4707, -0.4898,  1.3751,
          1.5797,  1.4576,  1.1939, -0.2845,  0.2849,  1.1436,  2.1554, -0.1997],
        [ 0.2601,  0.3294, -0.5075,  0.6640,  1.3708,  0.1140,  0.7763, -1.8856,
          1.4043,  1.1260, -0.9977,  0.7759, -1.2463, -0.7464, -2.2910, -0.0657,
          0.4381, -0.3729,  0.9118, -0.5077, -1.3499,  1.1041, -1.0829,  0.8855,
          1.9238, -0.4603,  0.1698, -1.0827,  0.5084,  0.0404, -0.3700, -1.2923,
          0.1278, -1.6385, 

## 📘 What is `exp`?

`exp(x)` stands for the **exponential function**:

\[
\exp(x) = e^x
\]

Where:
- \( e \) is **Euler's number**, approximately **2.71828**
- So:
  - \(\exp(1) = e^1 \approx 2.718\)
  - \(\exp(0) = e^0 = 1\)
  - \(\exp(-1) = e^{-1} \approx 0.367\)

---

## ✅ Why is `exp` important?

The exponential function is essential in both math and machine learning. Here are some key reasons:

### 1. **Inverse of log**:

\[
\exp(\log(x)) = x \quad\text{and}\quad \log(\exp(x)) = x
\]

It helps convert between **logarithmic space** and **linear space**.

---

### 2. **Used in Softmax**:

Softmax uses `exp(x)` to turn numbers into probabilities:

\[
\text{Softmax}(x_i) = \frac{\exp(x_i)}{\sum_j \exp(x_j)}
\]

---

### 3. **Used in Positional Encoding (Transformers)**:

In the Transformer model, we scale the position by frequency using:

\[
\frac{\text{pos}}{10000^{\frac{2i}{d_{\text{model}}}}}
\]

This can be rewritten using `exp`:

\[
10000^{-\frac{2i}{d_{\text{model}}}} = \exp\left(-\log(10000) \cdot \frac{2i}{d_{\text{model}}} \right)
\]

This is more numerically stable in code.

---

## 🔢 Python Example

```python
import math

print(math.exp(1))   # ≈ 2.718
print(math.exp(0))   # = 1
print(math.exp(-1))  # ≈ 0.367


In [70]:
position = torch.arange(0, 30, dtype=torch.float).unsqueeze(0)

print(position)

tensor([[ 0.,  1.,  2.,  3.,  4.,  5.,  6.,  7.,  8.,  9., 10., 11., 12., 13.,
         14., 15., 16., 17., 18., 19., 20., 21., 22., 23., 24., 25., 26., 27.,
         28., 29.]])


In [71]:
import math
torch.exp(torch.arange(0, 10, 2).float() * (-math.log(10000.0) / 64))

tensor([1.0000, 0.7499, 0.5623, 0.4217, 0.3162])

In [72]:
import torch
import torch.nn as nn
import math


class PositionalEncoding(nn.Module):
    def __init__(self, d_model=64, max_len=30):
        super(PositionalEncoding, self).__init__()

        # Create a matrix of shape (max_len, d_model)
        pe = torch.zeros(max_len, d_model)

        # Position indices (0 to max_len-1)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)

        # Compute the div term: 10000^(2i/d_model)
        div_term = torch.exp(
            torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model)
        )

        # Apply sine to even indices in the array; cosine to odd indices
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)

        # Add a batch dimension and register as buffer so it won't update during training
        pe = pe.unsqueeze(0)  # Shape: (1, max_len, d_model)
        self.register_buffer("pe", pe)

    def forward(self, x):
        """
        Args:
            x: Tensor of shape (batch_size, seq_len, d_model)
        Returns:
            Tensor with positional encoding added
        """
        x = x + self.pe[:, : x.size(1), :]
        return x

In [73]:
x = torch.arange(10).float()

In [74]:
torch.tril(torch.ones(8,8))

tensor([[1., 0., 0., 0., 0., 0., 0., 0.],
        [1., 1., 0., 0., 0., 0., 0., 0.],
        [1., 1., 1., 0., 0., 0., 0., 0.],
        [1., 1., 1., 1., 0., 0., 0., 0.],
        [1., 1., 1., 1., 1., 0., 0., 0.],
        [1., 1., 1., 1., 1., 1., 0., 0.],
        [1., 1., 1., 1., 1., 1., 1., 0.],
        [1., 1., 1., 1., 1., 1., 1., 1.]])

In [75]:
torch.cat([a,a],dim=-1)

tensor([[1., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.],
        [1., 1., 0., 0., 0., 0., 0., 0., 1., 1., 0., 0., 0., 0., 0., 0.],
        [1., 1., 1., 0., 0., 0., 0., 0., 1., 1., 1., 0., 0., 0., 0., 0.]])

In [77]:
class Head(nn.Module):
    def __init__(self,head_size):
        super().__init__()
        self.q = nn.Linear(embed_dim,head_size,bias=False)
        self.k = nn.Linear(embed_dim,head_size,bias=False)
        self.v = nn.Linear(embed_dim,head_size,bias=False)
        self.dropout = nn.Dropout(dropout)
        self.register_buffer("tril", torch.tril(torch.ones(block_size, block_size)))

    def forward(self,x):
        q = self.q(x)
        k = self.k(x)
        v = self.v(x)

        B,T,C = x.shape

        wei:torch.tensor = q @ k.transpose(-2,-1) / block_size**0.5
        wei = wei.masked_fill(self.tril[:T,:T]==0,float('-inf'))
        wei = F.softmax(wei,-1)
        wei = self.dropout(wei)
        out = wei @ v
        return out

        


In [78]:
class MultiHeadAttention(nn.Module):
    def __init__(self,n_head,head_size):
        super().__init__()
        self.head = nn.ModuleList([Head(head_size) for _ in range(n_head)])
        self.proj = nn.Linear(n_head*head_size,embed_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        out = torch.cat([h(x) for h in self.head], dim=-1)
        out = self.dropout(self.proj(out))
        return out


In [79]:
class FeedFoward(nn.Module):
    """a simple linear layer followed by a non-linearity"""

    def __init__(self, n_embd):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(n_embd, 4 * n_embd),
            nn.ReLU(),
            nn.Linear(4 * n_embd, n_embd),
            nn.Dropout(dropout)
        )

    def forward(self, x):
        return self.net(x)

In [80]:
class Block(nn.Module):
    """Transformer block: communication followed by computation"""

    def __init__(self, n_embd, n_head):
        # n_embd: embedding dimension, n_head: the number of heads we'd like
        super().__init__()
        head_size = n_embd // n_head
        self.sa = MultiHeadAttention(n_head, head_size)
        self.ffwd = FeedFoward(n_embd)
        self.ln1 = nn.LayerNorm(n_embd)
        self.ln2 = nn.LayerNorm(n_embd)

    def forward(self, x):
        x = x + self.sa(self.ln1(x))
        x = x + self.ffwd(self.ln2(x))
        return x

In [81]:
class DecoderBlock(nn.Module):
    def __init__(self):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size,embed_dim)
        self.position_encoding = PositionalEncoding(d_model=embed_dim,max_len=block_size)
        self.blocks = nn.Sequential(
            *[Block(embed_dim, n_head=n_head) for _ in range(n_layer)]
        )
        self.ln_f = nn.LayerNorm(embed_dim)  # final layer norm
        self.lm_head = nn.Linear(embed_dim, vocab_size)

    def forward(self,idx,targets=None):
        B, T = idx.shape

        # idx and targets are both (B,T) tensor of integers
        tok_emb = self.embedding(idx)  # (B,T,C)
        pos_emb = self.position_encoding(tok_emb)  # (T,C)
        x = self.blocks(pos_emb)  # (B,T,C)
        x = self.ln_f(x)  # (B,T,C)
        logits = self.lm_head(x)  # (B,T,vocab_size)

        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B * T, C)
            targets = targets.view(B * T)
            loss = F.cross_entropy(logits, targets)


        return logits, loss
    def generate(self, idx, max_new_tokens):
        # idx is (B, T) array of indices in the current context
        for _ in range(max_new_tokens):
            # crop idx to the last block_size tokens
            idx_cond = idx[:, -block_size:]
            # get the predictions
            logits, loss = self(idx_cond)
            # focus only on the last time step
            logits = logits[:, -1, :] # becomes (B, C)
            # apply softmax to get probabilities
            probs = F.softmax(logits, dim=-1) # (B, C)
            # sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)
            # append sampled index to the running sequence
            idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)
        return idx

In [82]:
@torch.no_grad()
def estimate_loss():
    out = {}
    model.eval()
    for split in ["train", "val"]:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            X, Y = get_batch(split)
            logits, loss = model(X, Y)
            losses[k] = loss.item()
        out[split] = losses.mean()
    model.train()
    return out

In [86]:
model = DecoderBlock()
m = model.to(DEVICE)
# print the number of parameters in the model
print(sum(p.numel() for p in m.parameters()) / 1e6, "M parameters")

# create a PyTorch optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=3e-4)
max_iters = 5000
eval_interval = 500
eval_iters = 200

for iter in range(max_iters):

    # every once in a while evaluate the loss on train and val sets
    if iter % eval_interval == 0 or iter == max_iters - 1:
        losses = estimate_loss()
        print(
            f"step {iter}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}"
        )

    # sample a batch of data
    xb, yb = get_batch("train")

    # evaluate the loss
    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

3.361808 M parameters
step 0: train loss 9.3624, val loss 9.3583
step 500: train loss 5.4727, val loss 5.8072
step 1000: train loss 5.0214, val loss 5.6553
step 1500: train loss 4.7014, val loss 5.5833
step 2000: train loss 4.4574, val loss 5.6121
step 2500: train loss 4.2367, val loss 5.6394
step 3000: train loss 4.0446, val loss 5.7057
step 3500: train loss 3.8765, val loss 5.7682
step 4000: train loss 3.7323, val loss 5.8592
step 4500: train loss 3.5989, val loss 5.9348
step 4999: train loss 3.4724, val loss 6.0415


In [None]:
sp.decode(0)

''

In [None]:
sp.Encode("First Citizen:")

[423, 807, 9964]

In [None]:
context = torch.zeros((1, 1), dtype=torch.long, device=DEVICE)
context = torch.tensor(sp.Encode("SAM:"),device=DEVICE).unsqueeze(0)
print(sp.decode(m.generate(context, max_new_tokens=500)[0].tolist()))

SAM: and in our special friends; And make us therefore burn too; but that doth we will find at all your number: but mildly beat you words when as being mortal, being Proty to you must confess that, as you have heard do nature, I that love me any other witness; There doth a parcel were so! But old, more is forerun one in peace and blunt with the house to dry on fool. FRIAR LAURENCE: My eye, passing counsel you all o' the story of he resort: what title to the bear your form? Officer: Perhaps you is'll do't cannot do: but greets yet seldom but we go humbly oppose. SICINIUS: Away to his past my name. SICINIUS: Has he must take him A torch to him, Now would no further than he changed: All that loves him before his noble life that knew, But he wept, No looks was a true that seem wise by the Volscian lord of this while after itself thrust on his offence where his wife was. You are now the wondrous qualities, Lest she is rich in those storm to a man; We have braved to adorn And say aside the m