# Tokenizer based LM

Train the LM instead of character level, we will now train a token level LM

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import matplotlib.pyplot as plt
from IPython.display import display, Markdown
import tiktoken
%matplotlib inline

In [2]:
with open("../data/tiny-shakespeare/input.txt") as file:
    data = file.read()

len(data)

1115394

In [4]:
# load gpt-2 tokenizer
gpt2_tokenizer = tiktoken.encoding_for_model("gpt-2")
gpt2_tokenizer.n_vocab

50257

In [8]:
tokens = gpt2_tokenizer.encode(data)
len(tokens)

338025

In [15]:
type(tokens)

list

In [9]:
1115394 / 338025

3.2997381850454848

In [11]:
# prepare train and test data
split_idx = int(0.80 * len(tokens))
train_tokens = tokens[:split_idx]
val_tokens = tokens[split_idx:]
len(train_tokens), len(val_tokens)

(270420, 67605)

In [14]:
assert data == gpt2_tokenizer.decode(tokens)

In [16]:
# create the model

class MHA(nn.Module):
    def __init__(self, emb_dim, block_size, n_heads, head_dim, dropout):
        super().__init__()

        self.n_heads = n_heads
        self.head_dim = head_dim

        # 1st LayerNorm
        self.ln1 = nn.LayerNorm(emb_dim)

        # first Linear to get from emb_dim --> 3 * n_heads*head_dim, to get k,q,v, then proj back to emb_dim
        self.c_proj = nn.Linear(emb_dim, 3 * n_heads * head_dim, bias=False)
        self.proj = nn.Linear(n_heads * head_dim, emb_dim)

        # 2nd LayerNorm
        self.ln2 = nn.LayerNorm(emb_dim)

        # finally thinking layer
        self.ffn = nn.Sequential(
            nn.Linear(emb_dim, 4 * emb_dim),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(4 * emb_dim, emb_dim)
        )

        self.dropout1 = nn.Dropout(dropout)

        # finally register the tril matrix
        self.register_buffer("mask", torch.tril(torch.ones(block_size, block_size)).view(1, 1, block_size, block_size))

    def forward(self, x):
        # get the shape
        B, T, C = x.shape

        # Layer norm
        ln_x = self.ln1(x)

        # Project and extract k,q,v
        c = self.c_proj(ln_x) # (B,T,C)  --> (B,T,3*nh*H)
        c = c.view(B, T, self.n_heads, 3 * self.head_dim) # (B,T,nh,3*H)
        k, q, v = torch.split(c, self.head_dim, dim=-1) # each of shape B,T,nh,H
        k, q, v = k.transpose(-3, -2), q.transpose(-3, -2), v.transpose(-3, -2) # B, nh, T, H

        # Get the attention weights
        wei = q @ k.transpose(-2, -1) * (self.head_dim**-0.50) # (B,nh,T,H) @ (B,nh,H,T) -> (B,nh,T,T)
        wei = wei.masked_fill(self.mask[:, :, :T, :T] == 0, -float("inf"))
        wei = torch.softmax(wei, dim=-1)
        wei = self.dropout1(wei)

        # Apply to v
        act = wei @ v # (B,nh,T,T) @ (B,nh,T,H) -> (B,nh,T,H)
        act = act.transpose(-3, -2) # B,T,nh,H
        act = act.contiguous().view(B, T, self.n_heads * self.head_dim)

        # Transform to emb_dim and skip connection
        act = self.proj(act) # (B, T,C)
        act = x + act

        # Think and skip connections
        ln_act = self.ln2(act)
        out = self.ffn(ln_act) # (B,T,C)
        out = x + out # x shape (B,T,C)

        return out


class NanoGPT(nn.Module):
    def __init__(self, vocab_size, block_size, emb_dim, n_layers, n_heads, head_dim, dropout, device):
        super().__init__()

        # helper variables
        self.block_size = block_size
        self.device = device

        # Embedding lookup table
        self.token_embbeding_table = nn.Embedding(vocab_size, emb_dim)
        self.position_embedding_table = nn.Embedding(block_size, emb_dim)

        # MHA head
        self.MHA = nn.Sequential(*[MHA(emb_dim, block_size, n_heads, head_dim, dropout) for _ in range(n_layers)])

        # Layernorm
        self.ln = nn.LayerNorm(emb_dim)

        # final linear layer
        self.lm_layer = nn.Linear(emb_dim, vocab_size)

        # init weights
        self.apply(self._init_weights)

        print(f"Number of parameters: {sum([p.numel() for p in self.parameters()])}")

    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
        elif isinstance(module, nn.LayerNorm):
            torch.nn.init.zeros_(module.bias)
            torch.nn.init.ones_(module.weight)

    def forward(self, x, targets=None):
        # x shape (B, T)
        B, T = x.shape

        token_emb = self.token_embbeding_table(x)
        pos_emb = self.position_embedding_table(torch.arange(0, T).to(self.device))
        emb = token_emb + pos_emb

        emb = self.MHA(emb)
        emb = self.ln(emb)
        logits = self.lm_layer(emb) # (B, T, V)

        loss = None

        if targets is not None:
            B, T, V = logits.shape
            loss = F.cross_entropy(logits.view(B*T, V), targets.view(B*T))

        return logits, loss

    def generate(self, max_tokens=1000):
        with torch.no_grad():
            cur_window, idx_list = torch.LongTensor([[0]]).to(self.device), [0] # (1, 1)

            for i in range(max_tokens):
                cur_window = cur_window[:, -self.block_size:] # (1, B)
                logits, _ = self.forward(cur_window) # (1,B,V)
                probs = torch.softmax(logits, dim=-1).squeeze(dim=0) # (B,V)
                idx = torch.multinomial(probs, num_samples=1, replacement=True)[-1].item()
                cur_window = torch.concat([cur_window, torch.LongTensor([[idx]]).view(1, 1).to(self.device)], dim=-1)
                idx_list.append(idx)

            generated_text = gpt2_tokenizer.decode(idx_list)

            return generated_text

In [17]:
def get_batch(tokens, block_size, batch_size):
    batch = torch.randint(0, len(tokens)-block_size, (batch_size,)) # B dimension array of random indices
    Xb = torch.stack([torch.LongTensor(tokens[i:i+block_size]) for i in batch], dim=0) # Create (B, T) dimension array
    yb = torch.stack([torch.LongTensor(tokens[i+1:i+block_size+1]) for i in batch], dim=0) # Create (B, T) dimension array
    return Xb, yb

In [18]:
@torch.no_grad()
def compute_loss(tokens, block_size, batch_size, model, device):
    loss_values = []
    for _ in range(100):
        Xb, yb = get_batch(tokens, block_size, batch_size)
        Xb, yb = Xb.to(device), yb.to(device)

        _, loss = model(Xb, yb)
        loss_values.append(loss.item())

    mean_loss = torch.FloatTensor(loss_values).mean().item()
    return mean_loss

In [25]:
def train(train_tokens, val_tokens, model, optimizer, device, block_size, batch_size, n_iters, eval_interval):
    train_lossi, val_lossi = [], []

    for i in range(n_iters):
        model.train()
        Xb, yb = get_batch(train_tokens, block_size, batch_size)
        Xb, yb = Xb.to(device), yb.to(device)

        # forward
        _, loss = model(Xb, yb)

        # set grads to zero
        optimizer.zero_grad(set_to_none=True)

        # do backward
        loss.backward()

        # optimizer step
        optimizer.step()

        if (i % eval_interval == 0) or (i == n_iters - 1):
            model.eval()
            train_loss = compute_loss(train_tokens, block_size, batch_size, model, device)
            val_loss = compute_loss(val_tokens, block_size, batch_size, model, device)

            train_lossi.append(train_loss)
            val_lossi.append(val_loss)

            print(f"Step {i}/{n_iters} --> Train: {train_loss:.4f} | Val: {val_loss:.4f}")

        # break

    return train_lossi, val_lossi

In [26]:
batch_size = 64 # how many independent sequences will we process in parallel?
block_size = 8 # what is the maximum context length for predictions?
n_iters = 5000
eval_interval = n_iters//10
lr = 3e-4
device = torch.device('cuda' if torch.cuda.is_available() else 'mps' if torch.backends.mps.is_available() else 'cpu')
emb_dim = 32
n_heads = 4
head_dim = emb_dim // n_heads
n_layers = 1
dropout = 0.2
vocab_size = gpt2_tokenizer.n_vocab

In [27]:
model = NanoGPT(emb_dim=emb_dim, vocab_size=vocab_size, block_size=block_size, n_heads=n_heads,\
                 n_layers=n_layers, head_dim=head_dim, device=device, dropout=dropout)
model = model.to(device)

Number of parameters: 3279633


In [28]:
optimizer = optim.AdamW(model.parameters(), lr=lr)

In [29]:
train_lossi, val_lossi = train(train_tokens=train_tokens, val_tokens=val_tokens, model=model, optimizer=optimizer,\
      device=device, block_size=block_size, batch_size=batch_size, n_iters=n_iters, eval_interval=eval_interval)

Step 0/5000 --> Train: 10.8293 | Val: 10.8278
Step 500/5000 --> Train: 6.3177 | Val: 6.5101
Step 1000/5000 --> Train: 5.5742 | Val: 5.9234
Step 1500/5000 --> Train: 5.1121 | Val: 5.5991
Step 2000/5000 --> Train: 4.7973 | Val: 5.4119
Step 2500/5000 --> Train: 4.5589 | Val: 5.3498
Step 3000/5000 --> Train: 4.3892 | Val: 5.2783
Step 3500/5000 --> Train: 4.3030 | Val: 5.2720
Step 4000/5000 --> Train: 4.2083 | Val: 5.2964
Step 4500/5000 --> Train: 4.1260 | Val: 5.2672
Step 4999/5000 --> Train: 4.0519 | Val: 5.2912


In [23]:
# Expected first loss:
import math
math.log(vocab_size)

10.82490511970208

In [30]:
model.eval();

In [31]:
display(Markdown(model.generate()))

! O, give, so excuse with some silk, by thy leader;
For God! be wine,
Or so ill.
Of the fisher 'pulled from that the fortune'OPSA:
Yet prismuath.

First Gentleman:
Your brother's likely thou keep but smooth'd, consider'd: you, by my feel their means,

Tope their found the crown with yonder grief,
That hate it scroll!'
I rail her, he manners home to appeurt Laura to sleep as;
And all the people cannot be was four'd for him:
Pray my lord.
Therefore revenge a neighbour right.

BUCKINGHAM:
Unless you might wear the babe, has not off.

First T clamew violent.

This day is good from Rome was?
I voice the first die; and as part of you,
Meant decay. Youued the counsel
When my shame honour shall do twenty thousand-t,
 push to hear them gave thee.

WARWICK:
A soldier, and soon a king's frown,
If't had there in the poor sword of many-morrow,
Or go of the great law. We will--

CORIOLANUS:
I neither good to lay it draws pick'd and their Barninail to all both:
Ay,
And struck best at form; as revenge him seeming were young, let; I I may have prevail'd
Whether the king as dull?

KING RICHARD: but the eye, repent,
Where gracious night, that land that oath could wish me here.
After it is for the hand''s side?
Toe'st means to have quite all it toid eachest!

LEONTES:
The poor man is above chks Castle.

BUCKINGHAM:
And go: go, at my heart naked; he is.

KING RICHARDOUCALUS:
No unman Leontes speak.

CORIOLANUS:
But in them AUMNurse:
Able upon't
And, our well: herenone, then is heavens
GE:
And Clarence will you coming withouthest night: I brother, by their worship from your guest
Provost:
And fright it as nays but to Romeo! who's suit of his weapon
Eness them.

LADY CAPULET:
What, that is I'll none done the fair;
Thy flower, do I will hold his life:
Thou follow you, I would--

ROMEO:
Come, to have home: when the happy Worth is submission.

KING RICHARD II:
He makes lived there: Ie thy flattland is--

GLOUCALUS:
That lies ch herb of this.

Pray a feeling, he was wrong?

NIA:
But thou, first with their Party, play indeed;
But by our horse of roses, on it: to me
If it not this fair my fellows:
Here hath some ground that be
and confretch are in blood
s, after Warwick's name:
And smell to England was black assured I see
ook more more, I'll give you all a pupil every lesser.

Second Lord:
Ogrowing and the house of
Some tumultuous articulate of York. throne:
How know, how he is noble as
Comm'd the care to Elils.


CLIFFORD:
ld hide him?

GLOUCAMILLO:
I therefore, she'reeling on this army spent
Fare thee Buckingham.
Now, their provostCIUS:
And she was; as you that I plant love do quickly please your grace that? prison, rather
The moon of every hand of one in the nextbury,
That ever general live with us i' the sweet butcher of this great trick
And their influences, God-scient: I might would reconcil!
I beg in MOWY:
Never and look months inize with a world?

CATESBY:
Parest they would be the target to my lord, and that cried anon to crave England.

Yre I may devise to be
Thouite Mercutoth we can, and their death?
But Juliet, and teach thee the hell,
Which would have at't in this one,
Thy condition doth seem from thy worthy brother?
Well, he
Not both:
For she was my father; therefore, Warwick?
Because he vengeance hath received:ILLman:
Will to protest, Suffolk, they that our right's happy fellest