In [1]:
import torch
import torch.nn as nn
from torch.nn import functional as F
import random
from tqdm import tqdm

Tokenizers

In [2]:
class CharTokenizer:
    def __init__(self, text):
        self.text = text
        self.vocab = sorted(list(set(text)))
        self.vocab_size = len(self.vocab)
        self.char_to_int = { ch: i for i, ch in enumerate(self.vocab) }
        self.int_to_char = { i: ch for i, ch in enumerate(self.vocab) }

    def encode(self, s):
        return [self.char_to_int[ch] for ch in s]

    def decode(self, ints):
        return ''.join(self.int_to_char[i] for i in ints)

class TikTokenizer:
    def __init__(self):
        import tiktoken
        self.enc = tiktoken.get_encoding('gpt2')
        self.vocab = [self.enc.decode([i]) for i in range(self.enc.n_vocab)]
        self.vocab_size = self.enc.n_vocab

    def encode(self, s):
        return self.enc.encode(s)

    def decode(self, ints):
        return self.enc.decode(ints)

Splitting Methods

In [3]:
class BasicSplit:
    def __init__(self, val_fraction=0.1):
        self.val_fraction = val_fraction

    def split(self, data):
        n = int((1-self.val_fraction) * len(data))
        train_data = data[:n]
        val_data   = data[n:]
        return train_data, val_data

class ChunkSplit:
    def __init__(self, val_fraction=0.1, num_chunks=12, encode=lambda x: x):

        self.val_fraction = val_fraction
        self.num_chunks   = num_chunks
        self.encode       = encode

    def split(self, data):
        newline_token = self.encode("\n")

        val_fraction = self.val_fraction
        num_chunks   = self.num_chunks

        total_len = data.size(0)
        val_len   = int(val_fraction * total_len)
        chunk_len = val_len // num_chunks
        remainder = val_len % num_chunks

        # split the entire dataset into chunks
        chunks = []
        idx = 0
        while idx + chunk_len <= total_len:
            chunks.append(data[idx : idx + chunk_len])
            idx += chunk_len

        # the leftover chunk (if any) after slicing out as many `chunk_len` blocks as possible. We always put this leftover into training by default
        leftover = data[idx:]

        val_indices = sorted(random.sample(range(len(chunks)), num_chunks))

        # build the validation data (insert newline_token only between non-consecutive chunks)
        val_data = []
        prev_idx = -2  # something not adjacent to first pick 
        for i in val_indices:
            # if the current chunk is NOT directly after the previous one and val_data already has content,
            # then insert a newline token
            if i != prev_idx + 1 and len(val_data) > 0:
                val_data.append(torch.tensor(newline_token, dtype=torch.int64))
            val_data.append(chunks[i])
            prev_idx = i

        # concatenate chosen val chunks
        val_data = torch.cat(val_data) if val_data else torch.empty(0, dtype=torch.int64)

        # if we have a remainder that is significant, tack data from train onto the end. This is ugly and bad and should be changed
        # this ensures total validation tokens == val_len but can also result in a small amount of overlap (up to chunk_length -1 tokens) between val and train
        if remainder > 100:
            val_data = torch.cat((val_data, data[-remainder:]))

        # everything else is training: the unchosen chunks plus leftover
        train_indices = sorted(set(range(len(chunks))) - set(val_indices))
        train_data_list = []
        prev_idx = -2  # so first chunk won't auto-insert a newline
        for i in train_indices:
            # if this chunk is not consecutive to the previous one, insert a newline
            if i != prev_idx + 1 and len(train_data_list) > 0:
                train_data_list.append(torch.tensor(newline_token, dtype=torch.int64))
            train_data_list.append(chunks[i])
            prev_idx = i

        train_data_list.append(torch.tensor(newline_token, dtype=torch.int64))
        train_data_list.append(leftover)
        train_data = torch.cat(train_data_list) if train_data_list else torch.empty(0, dtype=torch.int64)
        return train_data, val_data


Load Data

In [4]:
# input_file_path = "datasets/tinyshakespeare.txt"
input_file_path = "datasets/big.txt"
with open(input_file_path, 'r', encoding='utf-8') as f:
    train_text = f.read()

Hyper Params

In [5]:
# tokenizer = CharTokenizer(train_text)
tokenizer = TikTokenizer()
encode = tokenizer.encode
decode = tokenizer.decode
vocab = tokenizer.vocab
vocab_size = tokenizer.vocab_size
val_fraction = 0.1
splitter = ChunkSplit(val_fraction=0.1, num_chunks=16, encode = encode)

In [6]:
batch_size = 32 # how many independent sequences will we process in parallel?
block_size = 256
max_iters = 5000
eval_interval = 500
learning_rate = 2e-4
device = 'cuda' if torch.cuda.is_available() else 'cpu'
eval_iters = 200
n_embd = 512
n_layer = 8
n_head = 8
dropout = 0.25

In [7]:
print(device)

cuda


Make Training Data

In [8]:
data = torch.tensor(encode(train_text), dtype=torch.long)
train_data, val_data = splitter.split(data)

Data Loading

In [9]:
def get_batch(split):
    data = train_data if split == 'train' else val_data
    ix = torch.randint(len(data) - block_size, (batch_size,))
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
    x, y = x.to(device), y.to(device)
    return x, y

In [10]:
class Head(nn.Module):
    """ one head of self-attention """

    def __init__(self, head_size):
        super().__init__()
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)

        # lower-triangular mask for future tokens (causal attention)
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))

        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        B, T, C = x.shape
        # project x to key, query, value
        k = self.key(x)    # (B, T, head_size)
        q = self.query(x)  # (B, T, head_size)

        # compute attention scores ("affinities")
        wei = q @ k.transpose(-2, -1) * C**-0.5  # (B, T, T)
        # mask out future positions for causal language modeling
        wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf'))  # (B, T, T)
        # softmax to get attention weights
        wei = F.softmax(wei, dim=-1)  # (B, T, T)
        # dropout layer
        wei = self.dropout(wei)
        # weighted aggregation of the values
        v = self.value(x)  # (B, T, head_size)
        out = wei @ v       # (B, T, T) @ (B, T, head_size) -> (B, T, head_size)
        return out

In [11]:
class MultiHeadAttention(nn.Module):
    """ multiple heads of self-attention in parallel """

    def __init__(self, num_heads, head_size):
        super().__init__()
        self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
        self.proj = nn.Linear(n_embd, n_embd)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        out = torch.cat([h(x) for h in self.heads], dim=-1)
        out = self.dropout(self.proj(out))
        return out

In [12]:
class FeedForward(nn.Module):
    """ a simple linear layer followed by a non-linearity """

    def __init__(self, n_embd):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(n_embd, 4 * n_embd),
            nn.ReLU(),
            nn.Linear(4 * n_embd, n_embd),
            nn.Dropout(dropout),
        )

    def forward(self, x):
        return self.net(x)

In [13]:
class Block(nn.Module):
    """ Transformer block: communication followed by computation """

    def __init__(self, n_embd, n_head):
        # n_embd: embedding dimension, n_head: the number of heads we'd like
        super().__init__()
        head_size = n_embd // n_head
        self.ln1 = nn.LayerNorm(n_embd)
        self.sa = MultiHeadAttention(n_head, head_size)
        self.ln2 = nn.LayerNorm(n_embd)
        self.ffwd = FeedForward(n_embd)

    def forward(self, x):
        x = x + self.sa(self.ln1(x))
        x = x + self.ffwd(self.ln2(x))
        return x

Bigram Model

In [14]:
class BigramLanguageModel(nn.Module):
    def __init__(self):
        super().__init__()
        # each token directly reads off the logits for the next token from a lookup table
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        self.position_embedding_table = nn.Embedding(block_size, n_embd)
        
        layers = []
        for _ in range(n_layer):
            layers.append(Block(n_embd, n_head=n_head))
        
        # Put all blocks in a Sequential container
        self.blocks = nn.Sequential(*layers)
        
        # final layer norm and output head
        self.ln_f = nn.LayerNorm(n_embd)    # final layer norm
        self.lm_head = nn.Linear(n_embd, vocab_size)

    def forward(self, idx, targets=None):
        B, T = idx.shape

        tok_emb = self.token_embedding_table(idx) # (B, T, C) or (B, T, n_embd)
        pos_emb = self.position_embedding_table(torch.arange(T, device=device)) # (T, C)
        x = tok_emb + pos_emb # (B, T, C)
        x = self.blocks(x) # apply a head of self attention
        x = self.ln_f(x)
        logits = self.lm_head(x) # (B, T, vocab_size)

        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss
    
    def generate(self, idx, max_new_tokens):
        # idx is (B, T) array of indices in the current context
        for _ in range(max_new_tokens):
            # crop idx to the last block_size tokens
            idx_cond = idx[:, -block_size:]
            # get the predictions
            logits, loss = self(idx_cond)
            # focus only on the last time step
            logits = logits[:, -1, :] # becomes (B, C)
            # apply softmax to get probabilities
            probs = F.softmax(logits, dim=-1) # (B, C)
            # sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)
            # append sampled index to the running sequence
            idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)
        return idx

In [15]:
model = BigramLanguageModel()
model = model.to(device)

Training

In [16]:
@torch.no_grad()
def estimate_loss():
    out = {}
    model.eval()
    for split in ['train', 'val']:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            X, Y = get_batch(split)
            logits, loss = model(X, Y)
            losses[k] = loss.item()
        out[split] = losses.mean()
    model.train()
    return out

In [17]:
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

In [18]:
for iter in tqdm(range(max_iters), desc="Training"):

    if iter % eval_interval == 0 or iter == max_iters - 1:
        losses = estimate_loss()
        print(f"step {iter}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

    # sample a batch of data
    xb, yb = get_batch('train')

    # evaluate the loss
    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

Training:   0%|          | 0/5000 [00:00<?, ?it/s]

step 0: train loss 11.0000, val loss 11.0028


Training:  10%|█         | 500/5000 [02:41<18:53,  3.97it/s] 

step 500: train loss 5.2896, val loss 5.5251


Training:  20%|██        | 1000/5000 [05:18<16:34,  4.02it/s]  

step 1000: train loss 4.6263, val loss 4.9979


Training:  30%|███       | 1500/5000 [07:54<14:27,  4.03it/s]   

step 1500: train loss 4.2385, val loss 4.7655


Training:  40%|████      | 2000/5000 [10:35<13:18,  3.76it/s]  

step 2000: train loss 3.9811, val loss 4.6446


Training:  50%|█████     | 2500/5000 [13:19<10:41,  3.90it/s]  

step 2500: train loss 3.7816, val loss 4.5798


Training:  60%|██████    | 3000/5000 [16:03<09:12,  3.62it/s]  

step 3000: train loss 3.5705, val loss 4.5191


Training:  70%|███████   | 3500/5000 [18:47<06:30,  3.84it/s]  

step 3500: train loss 3.4328, val loss 4.5014


Training:  80%|████████  | 4000/5000 [21:29<04:26,  3.75it/s]  

step 4000: train loss 3.2461, val loss 4.4994


Training:  90%|█████████ | 4500/5000 [24:09<02:07,  3.94it/s]  

step 4500: train loss 3.1029, val loss 4.5082


Training: 100%|█████████▉| 4999/5000 [26:52<00:00,  4.00it/s]  

step 4999: train loss 2.9519, val loss 4.5355


Training: 100%|██████████| 5000/5000 [27:25<00:00,  3.04it/s]


Generate From the Model

In [19]:
context = torch.zeros((1, 1), dtype=torch.long, device=device)
print(decode(model.generate(context, max_new_tokens=2000)[0].tolist()))

!
This oak is bowed and it if someone were talking eagerly because they had a
written, to crush her mistakes, her heart ever so before.

The old count saw Frenchmen and still close on the table, which she
steading a coquickerent beneath the
marrowsy, several Count Bezukhov runghed, wrapped up her hands through the
house. The child's hand timidly and she jumped
and pressed it on the madiour either side of Peterearing the deep woman
bounds. The footmen got ready yet the tea table. Anisya was Bogdanich
asantly Baron.

Anti-in sat, standing in a thirdident with a restrained
tressmen approached the gold Preparer. Toward suddenly with prominent
it Berg on his lipsore back, Nicholas, said, that it was not necessary to
command under his excellency itself, nor should go back, to sleep about
Pierre and now it all his movements.

He looked round, as a scarf passed in a piece of blood without the cap
suspending firing at the root of a small dog-blue scarf on
round the table.

"Yes, General-diarrh!