# Tokenizer based LM

Train the LM instead of character level, we will now train a token level LM

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from IPython.display import display, Markdown
import tiktoken
import wandb
%matplotlib inline

In [2]:
with open("../data/tiny-shakespeare/input.txt") as file:
    data = file.read()

len(data)

1115394

In [3]:
# load gpt-2 tokenizer
gpt2_tokenizer = tiktoken.encoding_for_model("gpt-2")
gpt2_tokenizer.n_vocab

50257

In [4]:
tokens = gpt2_tokenizer.encode(data)
len(tokens)

338025

In [5]:
type(tokens)

list

In [6]:
1115394 / 338025

3.2997381850454848

In [7]:
# prepare train and test data
split_idx = int(0.80 * len(tokens))
train_tokens = tokens[:split_idx]
val_tokens = tokens[split_idx:]
len(train_tokens), len(val_tokens)

(270420, 67605)

In [8]:
assert data == gpt2_tokenizer.decode(tokens)

In [9]:
# create the model

class MHA(nn.Module):
    def __init__(self, emb_dim, block_size, n_heads, head_dim, dropout):
        super().__init__()

        self.n_heads = n_heads
        self.head_dim = head_dim

        # 1st LayerNorm
        self.ln1 = nn.LayerNorm(emb_dim)

        # first Linear to get from emb_dim --> 3 * n_heads*head_dim, to get k,q,v, then proj back to emb_dim
        self.c_proj = nn.Linear(emb_dim, 3 * n_heads * head_dim, bias=False)
        self.proj = nn.Linear(n_heads * head_dim, emb_dim)

        # 2nd LayerNorm
        self.ln2 = nn.LayerNorm(emb_dim)

        # finally thinking layer
        self.ffn = nn.Sequential(
            nn.Linear(emb_dim, 4 * emb_dim),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(4 * emb_dim, emb_dim)
        )

        self.dropout1 = nn.Dropout(dropout)

        # finally register the tril matrix
        self.register_buffer("mask", torch.tril(torch.ones(block_size, block_size)).view(1, 1, block_size, block_size))

    def forward(self, x):
        # get the shape
        B, T, C = x.shape

        # Layer norm
        ln_x = self.ln1(x)

        # Project and extract k,q,v
        c = self.c_proj(ln_x) # (B,T,C)  --> (B,T,3*nh*H)
        c = c.view(B, T, self.n_heads, 3 * self.head_dim) # (B,T,nh,3*H)
        k, q, v = torch.split(c, self.head_dim, dim=-1) # each of shape B,T,nh,H
        k, q, v = k.transpose(-3, -2), q.transpose(-3, -2), v.transpose(-3, -2) # B, nh, T, H

        # Get the attention weights
        wei = q @ k.transpose(-2, -1) * (self.head_dim**-0.50) # (B,nh,T,H) @ (B,nh,H,T) -> (B,nh,T,T)
        wei = wei.masked_fill(self.mask[:, :, :T, :T] == 0, -float("inf"))
        wei = torch.softmax(wei, dim=-1)
        wei = self.dropout1(wei)

        # Apply to v
        act = wei @ v # (B,nh,T,T) @ (B,nh,T,H) -> (B,nh,T,H)
        act = act.transpose(-3, -2) # B,T,nh,H
        act = act.contiguous().view(B, T, self.n_heads * self.head_dim)

        # Transform to emb_dim and skip connection
        act = self.proj(act) # (B, T,C)
        act = x + act

        # Think and skip connections
        ln_act = self.ln2(act)
        out = self.ffn(ln_act) # (B,T,C)
        out = x + out # x shape (B,T,C)

        return out


class NanoGPT(nn.Module):
    def __init__(self, vocab_size, block_size, emb_dim, n_layers, n_heads, head_dim, dropout, device):
        super().__init__()

        # helper variables
        self.block_size = block_size
        self.device = device

        # Embedding lookup table
        self.token_embbeding_table = nn.Embedding(vocab_size, emb_dim)
        self.position_embedding_table = nn.Embedding(block_size, emb_dim)

        # MHA head
        self.MHA = nn.Sequential(*[MHA(emb_dim, block_size, n_heads, head_dim, dropout) for _ in range(n_layers)])

        # Layernorm
        self.ln = nn.LayerNorm(emb_dim)

        # final linear layer
        self.lm_layer = nn.Linear(emb_dim, vocab_size)

        # init weights
        self.apply(self._init_weights)

        print(f"Number of parameters: {sum([p.numel() for p in self.parameters()])}")

    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
        elif isinstance(module, nn.LayerNorm):
            torch.nn.init.zeros_(module.bias)
            torch.nn.init.ones_(module.weight)

    def forward(self, x, targets=None):
        # x shape (B, T)
        B, T = x.shape

        token_emb = self.token_embbeding_table(x)
        pos_emb = self.position_embedding_table(torch.arange(0, T).to(self.device))
        emb = token_emb + pos_emb

        emb = self.MHA(emb)
        emb = self.ln(emb)
        logits = self.lm_layer(emb) # (B, T, V)

        loss = None

        if targets is not None:
            B, T, V = logits.shape
            loss = F.cross_entropy(logits.view(B*T, V), targets.view(B*T))

        return logits, loss

    def generate(self, max_tokens=1000):
        with torch.no_grad():
            cur_window, idx_list = torch.LongTensor([[0]]).to(self.device), [0] # (1, 1)

            for i in range(max_tokens):
                cur_window = cur_window[:, -self.block_size:] # (1, B)
                logits, _ = self.forward(cur_window) # (1,B,V)
                probs = torch.softmax(logits, dim=-1).squeeze(dim=0) # (B,V)
                idx = torch.multinomial(probs, num_samples=1, replacement=True)[-1].item()
                cur_window = torch.concat([cur_window, torch.LongTensor([[idx]]).view(1, 1).to(self.device)], dim=-1)
                idx_list.append(idx)

            generated_text = gpt2_tokenizer.decode(idx_list)

            return generated_text

In [10]:
def get_batch(tokens, block_size, batch_size):
    batch = torch.randint(0, len(tokens)-block_size, (batch_size,)) # B dimension array of random indices
    Xb = torch.stack([torch.LongTensor(tokens[i:i+block_size]) for i in batch], dim=0) # Create (B, T) dimension array
    yb = torch.stack([torch.LongTensor(tokens[i+1:i+block_size+1]) for i in batch], dim=0) # Create (B, T) dimension array
    return Xb, yb

In [11]:
@torch.no_grad()
def compute_loss(tokens, block_size, batch_size, model, device):
    loss_values = []
    for _ in range(100):
        Xb, yb = get_batch(tokens, block_size, batch_size)
        Xb, yb = Xb.to(device), yb.to(device)

        _, loss = model(Xb, yb)
        loss_values.append(loss.item())

    mean_loss = torch.FloatTensor(loss_values).mean().item()
    return mean_loss

In [17]:
def train(train_tokens, val_tokens, model, optimizer, device, block_size, batch_size, n_iters, eval_interval):
    train_lossi, val_lossi = [], []

    for i in range(n_iters):
        model.train()
        Xb, yb = get_batch(train_tokens, block_size, batch_size)
        Xb, yb = Xb.to(device), yb.to(device)

        # forward
        _, loss = model(Xb, yb)

        # set grads to zero
        optimizer.zero_grad(set_to_none=True)

        # do backward
        loss.backward()

        # optimizer step
        optimizer.step()

        if (i % eval_interval == 0) or (i == n_iters - 1):
            model.eval()
            train_loss = compute_loss(train_tokens, block_size, batch_size, model, device)
            val_loss = compute_loss(val_tokens, block_size, batch_size, model, device)

            train_lossi.append(train_loss)
            val_lossi.append(val_loss)

             # log metrics to wandb
            wandb.log({"train_loss": train_loss, "val_loss": val_loss})
        # break

    return train_lossi, val_lossi

In [13]:
batch_size = 64 # how many independent sequences will we process in parallel?
block_size = 8 # what is the maximum context length for predictions?
n_iters = 5000
eval_interval = n_iters//10
lr = 3e-4
device = torch.device('cuda' if torch.cuda.is_available() else 'mps' if torch.backends.mps.is_available() else 'cpu')
emb_dim = 32
n_heads = 4
head_dim = emb_dim // n_heads
n_layers = 1
dropout = 0.2
vocab_size = gpt2_tokenizer.n_vocab

In [14]:
# Wandb init
wandb.init(
    # set the wandb project where this run will be logged
    project="nano-gpt-token-small",

    # track hyperparameters and run metadata
    config={
    "batch_size": batch_size,
    "block_size": block_size,
    "n_iters": n_iters,
    "learning_rate": lr,    
    "emb_dim": 32,
    "n_heads": 4,
    "head_dim": emb_dim // n_heads,
    "n_layers": 1,
    "dropout": 0.2,
    "vocab_size": gpt2_tokenizer.n_vocab
    }
)

[34m[1mwandb[0m: Currently logged in as: [33msoham07-mistri12[0m ([33msoham-mistri-personal[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011141871766368341, max=1.0…

In [15]:
model = NanoGPT(emb_dim=emb_dim, vocab_size=vocab_size, block_size=block_size, n_heads=n_heads,\
                 n_layers=n_layers, head_dim=head_dim, device=device, dropout=dropout)
model = model.to(device)

Number of parameters: 3279633


In [16]:
optimizer = optim.AdamW(model.parameters(), lr=lr)

In [18]:
train_lossi, val_lossi = train(train_tokens=train_tokens, val_tokens=val_tokens, model=model, optimizer=optimizer,\
      device=device, block_size=block_size, batch_size=batch_size, n_iters=n_iters, eval_interval=eval_interval)

In [23]:
# Expected first loss:
import math
math.log(vocab_size)

10.82490511970208

In [19]:
model.eval();

In [20]:
display(Markdown(model.generate()))

! her hate your eyes task the mother's ghostly,
To know, a weather!

GLOUCESTER:
Then then at a greaterDevl:
When it dukaunt news is in the blood in the train:
WICK:
So now he but ask'd you to his Lord Angelo;
The city are amended.

GLOUCY:
What all the field;man ay, and woo'd him our father
Unay order Agman to the same trunk, when I'll make some our unlawful.

Yoe'st thou would stale even all;
I thought of yet it be thrive:
First Watchman:
What word I will seek a poor day's blood food,
And glanced hunger lingering simply our knowledge: you
To be flood.

COMINIUS:
I shall not have broinating tinstalled wouldp to this envious lock'd.

QUEEN is at't not from senators
Lord comes the man! Come, I use to laugh, let's ladies
MOPSA:
 slandering to love, I
By favours from so Gaunt, Pompey, that he will be climb,
And not bless such gracious lie dead!

ROMEO:
An sovereign, my lord, my sovereign plays your liege, then I suming than wind'd, and, man is my friends:
Ay to puts valiant, and at mine neighbours, with heaven speaks
I never have very three-day;

To know the sun, in the lantern,
She had hath done to this:
Come, if him will; we could you bears,
Of unilies of the most noble.
Thou, i' the oath, nor would not the king.

ANTIGONUS:
That thou art visibleurer is he wuscle, whilst I am indeed to the aines.

Dok masend me to use that they ope the oldest and!--
When you princes that and desperate one doth your speech: therefore speak.

KING RICHARD III:
It shall know thy shining much; no swither all, dancesey my noble for going
To fly theurden'd and ask,
If you do find a thousand sword
pacedTRESS OF YORK:
Pately equal much husband against her within the pierce fiCIUS:
Thy banish'd in the Romans,
As ourque to the bloody soul,
Were at sorrow. O, that had he is the wind:
How so is most Hastings, strike, and the pleasure of a friendly name, and more than it takes of Richard's very limbs: this is a king
With nothing admit stampeded, cousin! shortly should
We keep me like protest's Rosley.

LEONTES:
Then I loves the bowues I am stay
If been man of your esolsce:
Lady:
O:
Why call it in thy love, no more indeed, sir;
What is the impat'd or strike. You,
Enough as whose caps on, exceeds the manner.

First Senator:
Ripp'd, and neglected achieved there
But I came:
For fame are at now;ick, thou art serves it else,
 conclude friends and death
A danger and your faith, on grief would he hathillo to the w Adrian's crown
MAMILLO:
Call his witness and murder less p sol Citizen:
Therefore your queen to thee, thatily stands,
Thechingthankful than you have all too hungry?

DUKEYea, you bait import by the weary head.

QUEENRY VI:
Why, my lords; thou allegiance maystable.

B goose, which ne'er hadst revive my tyranny;
The friend, what that
 Tingbroke when
Which they are; pleasing expedience?
Your monarchPeace is made:
Wash and so drew, if my Romeo's woe, somethingicious trowhood,
Vereth to justice will lose the king.
man! that this is nothing, and serve on a my gracious lord
Awade me be vade it will tell him hold me: I cry you now.

Clown:
going to be reven me on behind;
Have blessed authority married action
ines in spectators; you haveNurse:
That I gasby that lies, my frost and odd gh Forrest, and my lord's love, great Servingmen,
To this inevitable DF, if the blessings in thy sword, how told doth there now high's troth
 banished against the head,
We cannot not we must bear ask you,
The sweet knee, if the praises fail:

In [21]:
wandb.finish()

0,1
train_loss,█▃▃▂▂▂▁▁▁▁▁
val_loss,█▃▂▁▁▁▁▁▁▁▁

0,1
train_loss,4.05224
val_loss,5.28553
