### Read the dataset

In [None]:
# Load the whole corpus into memory as a single string.
with open('/content/input.txt', mode='r', encoding='utf-8') as corpus_file:
    text = corpus_file.read()

### Data exploration and preprocessing of dataset

In [None]:
# Report the size of the corpus in characters.
print("Length of the text is: {}".format(len(text)))

Length of the text is: 1115394


In [None]:
# Peek at the first 1000 characters of the corpus.
print(text[:1000])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.

All:
We know't, we know't.

First Citizen:
Let us kill him, and we'll have corn at our own price.
Is't a verdict?

All:
No more talking on't; let it be done: away, away!

Second Citizen:
One word, good citizens.

First Citizen:
We are accounted poor citizens, the patricians good.
What authority surfeits on would relieve us: if they
would yield us but the superfluity, while it were
wholesome, we might guess they relieved us humanely;
but they think we are too dear: the leanness that
afflicts us, the object of our misery, is as an
inventory to particularise their abundance; our
sufferance is a gain to them Let us revenge this with
our pikes, ere we become rakes: for the gods know I
speak this in hunger for bread, not in thirst for revenge.



In [None]:
# Vocabulary: every distinct character that occurs in the corpus, in sorted order.
chars = sorted(set(text))
print("".join(chars))


 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz


In [None]:
# Number of distinct characters; this is the number of token ids the model predicts over.
vocab_size = len(chars)
print("Vocabulary size is: {}".format(vocab_size))

Vocabulary size is: 65


### Tokenization

There are different ways to tokenize the text. Google uses the SentencePiece tokenizer [Link](https://github.com/google/sentencepiece), which splits sentences into subword units. OpenAI, which created ChatGPT, uses its own tiktoken library [Link](https://github.com/openai/tiktoken), which tokenizes with BPE (byte-pair encoding).

For this work we will be using character-level encoding (mapping each character to its integer index in the sorted vocabulary).

There is a tradeoff between the vocabulary size (here only 65 character codes) and the length of the encoded sequence (here one code per character): a larger vocabulary gives shorter sequences, and a smaller vocabulary gives longer ones.

In [None]:
# Lookup tables between characters and integer ids (an id is the character's position in `chars`).
stoi = dict(zip(chars, range(len(chars))))
itos = dict(enumerate(chars))


def encode(s):
    """Return the list of integer ids for the characters of string `s`."""
    return [stoi[c] for c in s]


def decode(l):
    """Return the string spelled by the sequence of integer ids `l`."""
    return "".join(itos[i] for i in l)


print("Encoding for the string \"hi there\" is: {}".format(encode("hii there")))
print("Decoding for the string \"hi there\" is: {}".format(decode(encode("hii there"))))

Encoding for the string "hi there" is: [46, 47, 47, 1, 58, 46, 43, 56, 43]
Decoding for the string "hi there" is: hii there


### Encoding the entire dataset and storing it as a tensor

In [None]:
import torch

In [None]:
# Encode the whole corpus as one 1-D tensor of int64 token ids.
data = torch.tensor(encode(text), dtype = torch.long)
print(data.shape, data.dtype)
# Show the ids of the first 1000 characters.
print(data[:1000])

torch.Size([1115394]) torch.int64
tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 14, 43, 44,
        53, 56, 43,  1, 61, 43,  1, 54, 56, 53, 41, 43, 43, 42,  1, 39, 52, 63,
         1, 44, 59, 56, 58, 46, 43, 56,  6,  1, 46, 43, 39, 56,  1, 51, 43,  1,
        57, 54, 43, 39, 49,  8,  0,  0, 13, 50, 50, 10,  0, 31, 54, 43, 39, 49,
         6,  1, 57, 54, 43, 39, 49,  8,  0,  0, 18, 47, 56, 57, 58,  1, 15, 47,
        58, 47, 64, 43, 52, 10,  0, 37, 53, 59,  1, 39, 56, 43,  1, 39, 50, 50,
         1, 56, 43, 57, 53, 50, 60, 43, 42,  1, 56, 39, 58, 46, 43, 56,  1, 58,
        53,  1, 42, 47, 43,  1, 58, 46, 39, 52,  1, 58, 53,  1, 44, 39, 51, 47,
        57, 46, 12,  0,  0, 13, 50, 50, 10,  0, 30, 43, 57, 53, 50, 60, 43, 42,
         8,  1, 56, 43, 57, 53, 50, 60, 43, 42,  8,  0,  0, 18, 47, 56, 57, 58,
         1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 18, 47, 56, 57, 58,  6,  1, 63,
        53, 59,  1, 49, 52, 53, 61,  1, 15, 39, 47, 59, 57,  1, 25, 39, 56, 41,
      

### Train/validation split

In [None]:
# Number of tokens in the training portion: the first 90% of the corpus.
train_data_len = int(0.9 * len(data))

In [None]:
# Contiguous split: the leading 90% is used for training, the remaining tail for validation.
train_data = data[:train_data_len]
val_data = data[train_data_len:]

### Batch size and batches for training

In [None]:
# Hyperparameters and run configuration shared by the batching, model and training cells below.

batch_size = 64 # how many independent sequences will we process in parallel?
block_size = 256 # context length: number of tokens in each training sequence (also the size of the position table)
device = 'cuda' if torch.cuda.is_available() else 'cpu'
eval_iters = 200 # number of batches averaged per split in each loss estimate
eval_interval = 500 # number of training steps between loss estimates
max_iters = 5000 # total number of training steps
n_embed = 384 # embedding width
n_head = 6 # attention heads per block
n_layer = 6 # number of stacked blocks
dropout = 0.2 # dropout probability used throughout the model
# if learning rate is lower, number of iterations should be higher
lr = 3e-4
# Display the first block_size+1 training tokens: one full context plus the extra token needed for the shifted targets.
train_data[:block_size+1]

tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 14, 43, 44,
        53, 56, 43,  1, 61, 43,  1, 54, 56, 53, 41, 43, 43, 42,  1, 39, 52, 63,
         1, 44, 59, 56, 58, 46, 43, 56,  6,  1, 46, 43, 39, 56,  1, 51, 43,  1,
        57, 54, 43, 39, 49,  8,  0,  0, 13, 50, 50, 10,  0, 31, 54, 43, 39, 49,
         6,  1, 57, 54, 43, 39, 49,  8,  0,  0, 18, 47, 56, 57, 58,  1, 15, 47,
        58, 47, 64, 43, 52, 10,  0, 37, 53, 59,  1, 39, 56, 43,  1, 39, 50, 50,
         1, 56, 43, 57, 53, 50, 60, 43, 42,  1, 56, 39, 58, 46, 43, 56,  1, 58,
        53,  1, 42, 47, 43,  1, 58, 46, 39, 52,  1, 58, 53,  1, 44, 39, 51, 47,
        57, 46, 12,  0,  0, 13, 50, 50, 10,  0, 30, 43, 57, 53, 50, 60, 43, 42,
         8,  1, 56, 43, 57, 53, 50, 60, 43, 42,  8,  0,  0, 18, 47, 56, 57, 58,
         1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 18, 47, 56, 57, 58,  6,  1, 63,
        53, 59,  1, 49, 52, 53, 61,  1, 15, 39, 47, 59, 57,  1, 25, 39, 56, 41,
        47, 59, 57,  1, 47, 57,  1, 41, 

In [None]:
# One training example: x is a context window, y is the same window shifted one token ahead,
# so y[i] is the token that follows the context x[:i+1].
x = train_data[:block_size]
y = train_data[1:block_size+1]

# for i,c in enumerate(x):
#     print("When the context is {}, the most probable next data is {}".format(x[:i+1], y[i]))

In [None]:
torch.manual_seed(1337)

def get_batch(split):
    """Sample a random mini-batch of (input, target) windows.

    `split` selects the source: "train" reads from `train_data`, any other
    value reads from `val_data`. Returns two (batch_size, block_size) tensors
    on `device`; the targets are the inputs shifted one token ahead.
    """
    source = train_data if split == "train" else val_data
    # Random window starts; the upper bound leaves room for the one-token shift of the targets.
    starts = torch.randint(len(source) - block_size, (batch_size,))

    inputs, targets = [], []
    for start in starts:
        inputs.append(source[start:start + block_size])
        targets.append(source[start + 1:start + block_size + 1])

    return torch.stack(inputs).to(device), torch.stack(targets).to(device)

In [None]:
# Draw one training batch and inspect it; both tensors have shape (batch_size, block_size).
xb, yb = get_batch("train")
print(f"Inputs are {xb} with shape {xb.shape}")
print(f"Outputs are {yb} with shape {yb.shape}")

Inputs are tensor([[ 0, 26, 53,  ..., 56, 43, 47],
        [60, 43, 56,  ..., 56,  1, 41],
        [26, 21, 33,  ..., 26, 21, 13],
        ...,
        [ 5, 57,  1,  ...,  1, 35, 47],
        [56, 53, 53,  ..., 59, 50, 42],
        [42, 47, 56,  ..., 39, 56,  1]], device='cuda:0') with shape torch.Size([64, 256])
Outputs are tensor([[26, 53, 58,  ..., 43, 47, 45],
        [43, 56,  1,  ...,  1, 41, 53],
        [21, 33, 31,  ..., 21, 13, 10],
        ...,
        [57,  1, 52,  ..., 35, 47, 50],
        [53, 53, 58,  ..., 50, 42,  1],
        [47, 56, 43,  ..., 56,  1, 51]], device='cuda:0') with shape torch.Size([64, 256])


In [None]:
# Illustration only: every position t of every sequence is its own training example, with
# context xb[b, :t+1] and target yb[b, t]. The print is commented out, so this loop
# produces no output and leaves only the last `context` / `target` values behind.
for b in range(batch_size):
    for t in range(block_size):
        context = xb[b, :t+1]
        target = yb[b,t]
#         print(f"When input is {context.tolist()} then most probable output is {target.tolist()}")

### Baseline model - Bigram model

### Attention

1. Attention is a communication mechanism
2. Attention has no notion of space: it acts on a set of vectors with no idea of their position, which is why we add positional encoding.
3. Every example in a batch is processed independently; examples across the batch dimension never talk to each other.
4. "Self attention" - The key, query and value matrix are all coming from the same source.
5. "Cross attention" - When we pull matrix key and value from different nodes, this is called cross attention.
6. If the inputs are unit Gaussian, the variance of the raw attention scores (query · key) is of the order of head_size, which is far from the unit variance we started with. If we multiply the scores by 1/sqrt(head_size), the variance is brought back to about 1. This is important so that the result after softmax is not sharpened towards the maximum value.

In [None]:
import torch
import torch.nn as nn
from torch.nn import functional as F
torch.manual_seed(1337)

<torch._C.Generator at 0x7edf18081c50>

In [None]:
@torch.no_grad()
def estimate_loss():
    """Estimate the mean loss of `model` on the train and validation splits.

    For each split, `eval_iters` random batches are drawn with `get_batch`
    and their losses averaged. The model is put in eval mode for the
    measurement and switched back to train mode before returning.

    Returns a dict with keys 'train' and 'val' holding 0-dim tensors.
    """
    model.eval()
    averages = {}
    for split in ('train', 'val'):
        batch_losses = torch.zeros(eval_iters)
        for step in range(eval_iters):
            inputs, targets = get_batch(split)
            _, batch_loss = model(inputs, targets)
            batch_losses[step] = batch_loss.item()
        averages[split] = batch_losses.mean()
    model.train()
    return averages

In [None]:
class Head(nn.Module):
    """One head of causal (masked) scaled dot-product self-attention.

    The input of shape (B, T, n_embed) is projected to keys, queries and
    values of width `head_size`. Each position attends only to itself and
    earlier positions, and the output has shape (B, T, head_size).

    Args:
        head_size: width of the key/query/value projections.
    """

    def __init__(self, head_size):
        super().__init__()
        self.key = nn.Linear(n_embed, head_size, bias=False)
        self.query = nn.Linear(n_embed, head_size, bias=False)
        self.value = nn.Linear(n_embed, head_size, bias=False)
        # Lower-triangular causal mask, registered as a buffer so it is not a trainable parameter.
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Return the attention output for `x` of shape (B, T, n_embed), with T <= block_size."""
        _, T, _ = x.shape

        k = self.key(x) # (B,T,head_size)
        q = self.query(x) # (B,T,head_size)

        # compute attention scores ("affinities to other tokens around").
        # The scale is 1/sqrt(head_size) -- the width of the vectors in the dot product --
        # not 1/sqrt(n_embed), so the scores keep roughly unit variance before the softmax.
        wei = q @ k.transpose(-2,-1) * k.shape[-1]**-0.5 # (B,T,hs) @ (B,hs,T) => (B,T,T)
        # hide future positions so each token only attends to the past and itself
        wei = wei.masked_fill(self.tril[:T,:T]==0, float('-inf')) # (B,T,T)
        wei = F.softmax(wei, dim=-1) # (B,T,T)
        wei = self.dropout(wei)

        # perform the weighted aggregation of the values
        v = self.value(x) # (B,T,head_size)
        out = wei @ v # (B,T,T) @ (B,T,hs) => (B,T,hs)

        return out

In [None]:
class MultiHeadAttention(nn.Module):
    """Several attention heads run in parallel, concatenated and projected back to n_embed.

    Args:
        num_heads: number of `Head` modules to run in parallel.
        head_size: output width of each head.
    """

    def __init__(self, num_heads, head_size):
        super().__init__()
        self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
        # The concatenated heads have width num_heads * head_size, which only equals
        # n_embed when n_embed is divisible by num_heads; size the projection from the
        # actual concatenated width so both cases work.
        self.proj = nn.Linear(num_heads * head_size, n_embed)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Map `x` of shape (B, T, n_embed) to an output of shape (B, T, n_embed)."""
        out = torch.cat([h(x) for h in self.heads], dim=-1) # (B,T,num_heads*head_size)
        out = self.dropout(self.proj(out)) # (B,T,n_embed)

        return out

In [None]:
class FeedForward(nn.Module):
    """Position-wise MLP: expand to 4x the embedding width, ReLU, project back, dropout."""

    def __init__(self, n_embed):
        super().__init__()
        hidden_width = 4 * n_embed
        layers = [
            nn.Linear(n_embed, hidden_width),
            nn.ReLU(),
            nn.Linear(hidden_width, n_embed),
            nn.Dropout(dropout),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Apply the MLP over the last dimension of `x`."""
        return self.net(x)

In [None]:
class Block(nn.Module):
    """Transformer block: self-attention (communication) followed by a feed-forward layer (computation).

    Args:
        n_embed: embedding dimension.
        n_head: number of attention heads; each head gets width n_embed // n_head.
    """

    def __init__(self,n_embed, n_head):
        super().__init__()
        per_head_width = n_embed // n_head
        self.sa = MultiHeadAttention(n_head, per_head_width)
        self.ffwd = FeedForward(n_embed)
        self.ln1 = nn.LayerNorm(n_embed)
        self.ln2 = nn.LayerNorm(n_embed)

    def forward(self, x):
        # Pre-norm variant: LayerNorm is applied to the input of each sub-layer (the original
        # paper normalises the sub-layer output instead); each result is added back residually.
        attended = self.sa(self.ln1(x))
        x = x + attended
        transformed = self.ffwd(self.ln2(x))
        return x + transformed

In [None]:
class BiGramLanguageModel(nn.Module):
    """Character-level transformer language model.

    Token and position embeddings are summed, passed through `n_layer`
    transformer blocks and a final LayerNorm, then projected to logits over
    the vocabulary. The class name is kept so existing references keep working.
    """

    def __init__(self):
        super().__init__()

        # token id -> embedding vector of width n_embed
        self.token_embedding_table = nn.Embedding(vocab_size, n_embed)
        # position (0..block_size-1) -> embedding vector of width n_embed
        self.position_embedding_table = nn.Embedding(block_size, n_embed)

        # stack of transformer blocks
        self.blocks = nn.Sequential(*[Block(n_embed, n_head=n_head) for _ in range(n_layer)])
        self.ln_f = nn.LayerNorm(n_embed) # final layer norm

        self.lm_head = nn.Linear(n_embed, vocab_size)

    def forward(self, idx, targets=None):
        """Compute next-token logits and, when `targets` is given, the loss.

        Args:
            idx: (B, T) integer token ids, with T <= block_size.
            targets: optional (B, T) integer token ids to predict.

        Returns:
            (logits, loss). Without targets, logits is (B, T, vocab_size) and
            loss is None. With targets, logits is flattened to
            (B*T, vocab_size) and loss is the mean cross-entropy.
        """
        B,T = idx.shape

        tok_emb = self.token_embedding_table(idx) # (B,T,n_embed)
        # Build the positions on the same device as the input rather than a global one.
        pos_emb = self.position_embedding_table(torch.arange(T, device=idx.device)) # (T,n_embed)

        x = tok_emb + pos_emb # (B,T,n_embed), pos_emb broadcasts over the batch
        x = self.blocks(x) # (B,T,n_embed)
        x = self.ln_f(x) # (B,T,n_embed) -- the final norm was defined but previously never applied
        logits = self.lm_head(x) # (B,T,vocab_size)

        if targets is None:
            return logits, None

        # cross_entropy wants (N, C) logits and (N,) targets, so merge batch and time
        B, T, C = logits.shape
        logits = logits.view(B*T, C)
        targets = targets.view(B*T)
        loss = F.cross_entropy(logits, targets)

        return logits, loss

    @torch.no_grad()
    def generate(self, idx, max_new_tokens):
        """Autoregressively sample `max_new_tokens` tokens after the context `idx`.

        Args:
            idx: (B, T) integer token ids forming the current context.
            max_new_tokens: number of tokens to append.

        Returns:
            (B, T + max_new_tokens) tensor: the context followed by the samples.

        Sampling runs without gradient tracking and in eval mode (dropout
        off); the module's previous train/eval mode is restored afterwards.
        """
        context_limit = self.position_embedding_table.num_embeddings
        was_training = self.training
        self.eval()
        try:
            for _ in range(max_new_tokens):
                # crop to the last positions the position table can represent
                idx_cropped = idx[:, -context_limit:]
                logits, _ = self(idx_cropped)
                # focus only on the last time step
                logits = logits[:, -1, :] # (B,vocab_size)
                probs = F.softmax(logits, dim=-1) # (B,vocab_size)
                # sample from the distribution
                idx_next = torch.multinomial(probs, num_samples=1) # (B,1)
                # append sampled index to running sequence
                idx = torch.cat((idx, idx_next), dim=1) # (B,T+1)
        finally:
            self.train(was_training)
        return idx

# Build the model and move it to the selected device. `Module.to` returns the module
# itself, so `m` and `model` are two names for the same object.
model = BiGramLanguageModel()
m = model.to(device)
# logits, loss = m(xb,yb)
# print(logits.shape)
# print(loss)

We can predict the expected initial loss (the negative log-likelihood of an untrained model that guesses uniformly) if we know the vocab size: it is `-ln(1/vocab_size)`, i.e. `ln(65) ≈ 4.17` here.

#### Now let us create the model and train it

In [None]:
# create a PyTorch optimizer (AdamW over all model parameters, using the learning rate `lr`)
optimizer = torch.optim.AdamW(m.parameters(), lr=lr)

In [None]:
from tqdm import tqdm

In [None]:
# Training loop. The loop variable is called `step` rather than `iter` so the builtin
# `iter` is not shadowed in the notebook namespace for later cells.
for step in tqdm(range(max_iters)):

    # every once in a while, and on the final step, evaluate the loss on train and val sets
    if step % eval_interval == 0 or step == max_iters - 1:
        losses = estimate_loss()
        print(f"step {step}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

    # sample a batch of data
    xb, yb = get_batch('train')

    # forward pass, then clear stale gradients, backpropagate and update the weights
    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

  0%|          | 0/5000 [00:00<?, ?it/s]

step 0: train loss 2.1455, val loss 2.1991


 10%|█         | 500/5000 [05:15<36:51,  2.03it/s]

step 500: train loss 1.6804, val loss 1.8427


 20%|██        | 1000/5000 [10:29<32:35,  2.05it/s]

step 1000: train loss 1.4990, val loss 1.6905


 30%|███       | 1500/5000 [15:43<28:39,  2.04it/s]

step 1500: train loss 1.3918, val loss 1.6107


 40%|████      | 2000/5000 [20:57<24:26,  2.05it/s]

step 2000: train loss 1.3228, val loss 1.5605


 50%|█████     | 2500/5000 [26:11<20:20,  2.05it/s]

step 2500: train loss 1.2641, val loss 1.5290


 60%|██████    | 3000/5000 [31:24<16:17,  2.05it/s]

step 3000: train loss 1.2193, val loss 1.5142


 70%|███████   | 3500/5000 [36:37<12:12,  2.05it/s]

step 3500: train loss 1.1813, val loss 1.4960


 80%|████████  | 4000/5000 [41:51<08:08,  2.05it/s]

step 4000: train loss 1.1421, val loss 1.4920


 90%|█████████ | 4500/5000 [47:04<04:03,  2.05it/s]

step 4500: train loss 1.1045, val loss 1.4863


100%|██████████| 5000/5000 [52:17<00:00,  1.59it/s]


In [None]:
# Sample 1000 new tokens starting from a single token with id 0, then decode the first
# (only) sequence of the batch back to text.
print(decode(m.generate(idx=torch.zeros((1,1), dtype = torch.long, device = device), max_new_tokens=1000)[0].tolist()))


ROMEO:
But believe thy faith intertaints
Even leave in post, blows, revengeant
To this do tempt adversart death.

MERCUTIO:
I am glieve with Fourth, great, tender'd Rates from the poor
And bear honour!

ROMEO:

MERCUTIO:
Apon that will we may be your to arm this lave
And beards.
This three word gnarly thine: hark you go'd,--what liers?
How dost thou art thou wife! I see her merving blood
By now his name with use and linking,
And believed; I here hurrand. Loverly then
I did empt you to do him poison with thy master's heavy:
That every are made's own gentleman
To there oratom'd willieves an eyes, he loved at his
general intent, they had as to-day; to take it.

BENVOLIO:
This my life, and my society as we are judged means,
I have died together'd interror herewit.
O, thus, say to-day.

ANGELO:
Barehold:
Then lie to the breath, and more.

First Gentleman:
She lady; I singly come the army thing: the nurse of veiolenes
Masters madely bury in the beams treadful bow.
How faresh receives? why f

# Random testing (Can ignore this section)

In [None]:
# Toy random input for the averaging experiments below: B sequences, T time steps, C channels.
torch.manual_seed(1337)
B,T,C = 4,8,2
x = torch.randn(B,T,C)
x.shape

torch.Size([4, 8, 2])

In [None]:
# version 1
# xbow is x bag of words
# For every position t, xbow[b,t] is the mean of x[b,0..t] (the token itself and everything before it),
# computed with explicit Python loops.
xbow = torch.zeros((B,T,C))
for b in range(B):
    for t in range(T):
        xprev = x[b,:t+1] #(t+1,C)
        xbow[b,t] = torch.mean(xprev, 0)

In [None]:
# First sequence of the raw input, for comparison with its running mean in xbow.
x[0]

tensor([[ 0.1808, -0.0700],
        [-0.3596, -0.9152],
        [ 0.6258,  0.0255],
        [ 0.9545,  0.0643],
        [ 0.3612,  1.1679],
        [-1.3499, -0.5102],
        [ 0.2360, -0.2398],
        [-0.9211,  1.5433]])

In [None]:
# Running mean of the first sequence: row t averages rows 0..t of x[0].
xbow[0]

tensor([[ 0.1808, -0.0700],
        [-0.0894, -0.4926],
        [ 0.1490, -0.3199],
        [ 0.3504, -0.2238],
        [ 0.3525,  0.0545],
        [ 0.0688, -0.0396],
        [ 0.0927, -0.0682],
        [-0.0341,  0.1332]])

The above method is inefficient because it computes the averages with nested Python loops; we can compute the same result with a single matrix multiplication. We will use a lower-triangular matrix to make it more efficient.

In [None]:
# version 2

# Row t of the lower-triangular matrix has ones in columns 0..t; dividing each row by its
# sum turns it into uniform averaging weights over positions 0..t.
wei = torch.tril(torch.ones(T,T))
wei = wei / wei.sum(1, keepdim=True)
# (T,T) @ (B,T,C): the weight matrix is broadcast over the batch dimension => (B,T,C)
xbow2 = wei @ x
# both the torch vectors are same, this is the more efficient way
torch.allclose(xbow, xbow2)

True

In [None]:
# version 3

# Same averaging, written the way attention does it: start from all-zero scores, set the
# "future" entries (above the diagonal) to -inf, and let softmax turn each row into weights.
# Equal scores give uniform weights over positions 0..t, so the result matches version 1.
tril = torch.tril(torch.ones(T,T))
wei = torch.zeros((T,T))
wei = wei.masked_fill(tril==0, float('-inf'))
wei = F.softmax(wei, dim=-1)
xbow3 = wei @ x
torch.allclose(xbow, xbow3)

True

In [None]:
# version 4
# Self attention !
# Same masked-softmax aggregation as version 3, but the scores now come from the data
# (query . key) instead of being all zeros, so the weights differ per token.

torch.manual_seed(1337)
B,T,C = 4,8,32
x = torch.randn(B,T,C)
x.shape

# single head perform self attention
head_size = 16
key = nn.Linear(C, head_size, bias=False)
query = nn.Linear(C, head_size, bias=False)
value = nn.Linear(C, head_size, bias=False)

k = key(x) # (B,T,16)
q = query(x) # (B,T,16)

# NOTE: this demo applies no 1/sqrt(head_size) scaling to the scores.
wei = q @ k.transpose(-2,-1) # (B,T,16) @ (B,16,T) => (B,T,T)
tril = torch.tril(torch.ones(T,T))

# wei is equivalent to the np.dot(Q,K)
#wei comes from Q.K_transpose
# wei = torch.zeros((T,T))

# this line of code prevents future nodes to communicate information to past nodes. If removed every node(token)
# will interact with each other. This is removed on the encoder side of the transformers.
wei = wei.masked_fill(tril==0, float('-inf'))
wei = F.softmax(wei, dim=-1)

# aggregate the value projections (not the raw x) with the attention weights
v = value(x)

out = wei @ v

out.shape

torch.Size([4, 8, 16])

In [None]:
# Attention weights of the first sequence: each row sums to 1 and is zero above the diagonal.
wei[0]

tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.1574, 0.8426, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2088, 0.1646, 0.6266, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.5792, 0.1187, 0.1889, 0.1131, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.0294, 0.1052, 0.0469, 0.0276, 0.7909, 0.0000, 0.0000, 0.0000],
        [0.0176, 0.2689, 0.0215, 0.0089, 0.6812, 0.0019, 0.0000, 0.0000],
        [0.1691, 0.4066, 0.0438, 0.0416, 0.1048, 0.2012, 0.0329, 0.0000],
        [0.0210, 0.0843, 0.0555, 0.2297, 0.0573, 0.0709, 0.2423, 0.2391]],
       grad_fn=<SelectBackward0>)