In [20]:
#Download as input.txt: https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
# !pip install tiktoken #https://github.com/openai/tiktoken

In [1]:
import tiktoken
import torch
import torch.nn as nn
from torch.nn import functional as F
torch.manual_seed(1337)
from itertools import chain #Deal with nested list


In [16]:
num_embeddings=32

## Reading text data

In [17]:
# Reading file 
with open('input.txt', 'r', encoding='utf-8') as f:
    text = f.read()

In [18]:
print(text[:100])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You


In [19]:
vocab_size_manual=len(list(set(text)))

## Tokenization (Tiktoken, char level)

#### Tiktoken

In [20]:
# Tokenize: Convert raw string into set of integers according to some vocabulary
# Small vocab means large set of integers and vice versa
enc = tiktoken.encoding_for_model("gpt-4")
print(enc.encode('hi there'))
print(enc.decode([6151,1070]))

[6151, 1070]
hi there


In [21]:
enc.n_vocab

100277

In [22]:
vocab_size_tiktoken = len(set(enc.encode(text)))
vocab_size_tiktoken

12111

In [23]:
max(set(enc.encode(text)))

100252

#### Manually build vocab and encode it

In [24]:
# All the unique characters that appear in the text
chars=sorted(list(set(text)))
print(''.join(chars))
vocab_size=len(chars)
print(vocab_size)


 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
65


In [25]:
stoi={}
for i,ch in enumerate(chars):
    stoi[ch]=i

itos={}
for i,ch in enumerate(chars):
    itos[i]=ch

encode = lambda s: [stoi[c] for c in s] #Creating a function to output a list of integers for encoding
decode = lambda s: ''.join([itos[c] for c in s]) #Creating a function to output a list of integers for decoding

In [26]:
print(encode('hi there'))
print(decode([46, 47, 1, 58, 46, 43, 56, 43]))

[46, 47, 1, 58, 46, 43, 56, 43]
hi there


## Building torch tensor

In [27]:
# data=torch.tensor(encode(text),dtype=torch.long)
# print(data.shape,data.dtype)

data = torch.tensor(encode(text),dtype=torch.long)
print(data.shape,data.dtype)

torch.Size([1115394]) torch.int64


In [28]:
data[:10]

tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47])

In [29]:
decode(data[:10].tolist())

'First Citi'

## Train Test Split

In [30]:
n=int(0.9*len(data))
train_data=data[:n]
test_data=data[n:]

In [31]:
block_size=8
train_data[:block_size+1]

tensor([18, 47, 56, 57, 58,  1, 15, 47, 58])

## Transformer Training
* While trainig we dont pass all the data at once, we train using block (or chunks of data). Context_length/block size
* Train it to predict at every position word
* In the context of 5451, 47317 comes next. In the context of 5451 and 47317, 512 comes next. So we have 8 positions.
* When we train we pass in multiple chunks of text stacked up, just so we keep GPUs busy (leverage parallel processing)


In [32]:
x=train_data[:block_size]
y=train_data[1:block_size+1]

for i in range(block_size):
    context = decode(x[:i+1].tolist())
    target = decode([y[i].tolist()])
    print(f''' Context: {context}    Target: {target} ''')


 Context: F    Target: i 
 Context: Fi    Target: r 
 Context: Fir    Target: s 
 Context: Firs    Target: t 
 Context: First    Target:   
 Context: First     Target: C 
 Context: First C    Target: i 
 Context: First Ci    Target: t 


In [33]:
batch_size =  4
block_size =  8
def get_batch(split):
    data = train_data if split=='train' else test_data
    ix = torch.randint(len(data)-block_size,(batch_size,))  #Gives us random indexes
    x = torch.stack([data[i:i+block_size] for i in ix]) #Get consecutive characters of block_size, for each batch
    y = torch.stack([data[i+1:i+block_size+1] for i in ix]) 
    return x, y

In [34]:
print(torch.randint(len(data)-block_size,(batch_size,)))
print(max(data.tolist()))
print(len(data))
print()

tensor([263419, 948456, 596075, 570489])
64
1115394



In [35]:
xb, yb=  get_batch('train')

print('Context',xb) #block_size is context
print('Trarget',yb)

for j in range(batch_size):
    for i in range(block_size):
        # context=enc.decode(list(chain.from_iterable([xb[i][:i+1]].tolist())))
        # target= enc.decode([yb[j,i].tolist()]) 
        print('Context:',decode(xb[j][:i+1].tolist()))
        #print('Target:',decode([yb[j][i]])) When using tiktoken
        print('Target:',decode([yb[j][i].tolist()]))


Context tensor([[ 1, 57, 47, 45, 46, 58, 11,  0],
        [ 1, 58, 46, 43,  1, 40, 59, 57],
        [46, 43, 52,  1, 53, 59, 56,  1],
        [46, 43,  1, 42, 47, 57, 42, 39]])
Trarget tensor([[57, 47, 45, 46, 58, 11,  0, 13],
        [58, 46, 43,  1, 40, 59, 57, 47],
        [43, 52,  1, 53, 59, 56,  1, 52],
        [43,  1, 42, 47, 57, 42, 39, 47]])
Context:  
Target: s
Context:  s
Target: i
Context:  si
Target: g
Context:  sig
Target: h
Context:  sigh
Target: t
Context:  sight
Target: ;
Context:  sight;
Target: 

Context:  sight;

Target: A
Context:  
Target: t
Context:  t
Target: h
Context:  th
Target: e
Context:  the
Target:  
Context:  the 
Target: b
Context:  the b
Target: u
Context:  the bu
Target: s
Context:  the bus
Target: i
Context: h
Target: e
Context: he
Target: n
Context: hen
Target:  
Context: hen 
Target: o
Context: hen o
Target: u
Context: hen ou
Target: r
Context: hen our
Target:  
Context: hen our 
Target: n
Context: h
Target: e
Context: he
Target:  
Context: he 
Ta

### Why we build models when we can leverage tik token for embeddings?
    * Embedding are not talking to themselves unless we train them. Currently they can just see themselves       

### Changes made to v2 of Bi-Gram model
- nn.Embedding(vocab_size, num_embeddings)
- so we will not get logits from self.token_embedding_table, we'll get token embeddings
- To go from token embeddings to logits we need a linear layer
- Get positional endcoding
- Add positional encoding and token embedding

In [36]:
class BigramLanguageModel(nn.Module):

    def __init__(self, vocab_size):
        super().__init__()
        # each token directly reads off the logits for the next token from a lookup table
        self.token_embedding_table = nn.Embedding(vocab_size, num_embeddings) # Builds an embedding of vocab size with some weights initialized
        self.lm_head = nn.Linear(num_embeddings,vocab_size) #Linear layer to convert embeddings into vocab size
        self.position_embedding = nn.Embedding(block_size,num_embeddings) #For each block, we'll get position embedding

    def forward(self, idx, targets=None):
        B, T = idx.shape    
        # idx and targets are both (B,T) tensor of integers
        #logits = self.token_embedding_table(idx) # (B,T,C), Batch Time Channels: 4, 8, vocab size # Filtering embedding for specific indices, pluck out a row of token(integer) from embedding space
        tok_embeddings = self.token_embedding_table(idx) #(B,T,C) C = emdedding size
        pos_embedding = self.position_embedding(torch.arange(T)) #(T,C)
        x = tok_embeddings + pos_embedding
        logits = self.lm_head(x) #(B,T,Vocab_Size)

        if targets==None:
            loss=None
        else:
            B, T, C = logits.shape
            logits=logits.view(B*T, C)
            targets=targets.view(B*T) #Stretch out the tensor
            loss = F.cross_entropy(logits,targets)

        return logits, loss
    
    def generate(self, idx, max_new_tokens):
        #idx is current B X T 
        for _ in range(max_new_tokens):
            idx_updated = idx[:,-block_size:] # Due to pos embedding we can never have more than block size
            logits, loss = self(idx_updated)
            logits = logits[:,-1,:] # Focusing on last character, this makes the model bi-gram model
            probs = F.softmax(logits, dim=-1) # Converting logits to prob
            idx_next = torch.multinomial(probs,num_samples=1)
            idx = torch.cat((idx,idx_next),dim=1)
        return idx


m = BigramLanguageModel(vocab_size_manual)
logits, loss = m(xb, yb)

In [37]:
print(loss)
print(decode(m.generate(torch.zeros((1,1), dtype=torch.long),max_new_tokens=20)[0].tolist()))
optimizer = torch.optim.AdamW(m.parameters(), lr=1e-3)

batch_size = 32
for steps in range(4000): # increase number of steps for good results... 
    
    # sample a batch of data
    xb, yb = get_batch('train')

    # evaluate the loss
    logits, loss = m(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()
    print(loss.item())

print(loss.item())
print(decode(m.generate(idx = torch.zeros((1, 1), dtype=torch.long), max_new_tokens=20)[0].tolist()))


tensor(4.6512, grad_fn=<NllLossBackward0>)

gYq:WWXfuQDak&y YXjp
4.469480037689209
4.456344127655029
4.371490478515625
4.352011680603027
4.360913276672363
4.408318519592285
4.334517478942871
4.35773229598999
4.390896320343018
4.324420928955078
4.326447486877441
4.275121688842773
4.2977986335754395
4.27813196182251
4.267948150634766
4.298491477966309
4.22177267074585
4.148321151733398
4.222843647003174
4.239603519439697
4.270653247833252
4.214629650115967
4.126403331756592
4.173102378845215
4.1600871086120605
4.160513401031494
4.105600357055664
4.1306352615356445
4.167458534240723
4.1413984298706055
4.062288761138916
4.041853427886963
4.059492588043213
3.992574691772461
4.07106351852417
4.080291748046875
4.002075672149658
4.0639424324035645
3.9826977252960205
3.957973003387451
4.046245098114014
3.9853479862213135
3.949650526046753
3.9036288261413574
3.8979005813598633
3.9800541400909424
3.884122848510742
3.8955624103546143
3.9999353885650635
3.9912776947021484
3.8702306747436523
3.85882

In [38]:
print(decode(m.generate(torch.zeros((1,1), dtype=torch.long),max_new_tokens=20)[0].tolist()))


Tong, ch thas y tit!


## Self Attention
- Mathematical trick in self-attention

- B = Batch, T = Context, C = Embedding size

In [39]:
torch.manual_seed(1337)
B, T, C = 4, 8, 2 #Embedding size
x = torch.randn(B,T,C)
print(x)
print(x.shape)

tensor([[[ 0.1808, -0.0700],
         [-0.3596, -0.9152],
         [ 0.6258,  0.0255],
         [ 0.9545,  0.0643],
         [ 0.3612,  1.1679],
         [-1.3499, -0.5102],
         [ 0.2360, -0.2398],
         [-0.9211,  1.5433]],

        [[ 1.3488, -0.1396],
         [ 0.2858,  0.9651],
         [-2.0371,  0.4931],
         [ 1.4870,  0.5910],
         [ 0.1260, -1.5627],
         [-1.1601, -0.3348],
         [ 0.4478, -0.8016],
         [ 1.5236,  2.5086]],

        [[-0.6631, -0.2513],
         [ 1.0101,  0.1215],
         [ 0.1584,  1.1340],
         [-1.1539, -0.2984],
         [-0.5075, -0.9239],
         [ 0.5467, -1.4948],
         [-1.2057,  0.5718],
         [-0.5974, -0.6937]],

        [[ 1.6455, -0.8030],
         [ 1.3514, -0.2759],
         [-1.5108,  2.1048],
         [ 2.7630, -1.7465],
         [ 1.4516, -1.5103],
         [ 0.8212, -0.2115],
         [ 0.7789,  1.5333],
         [ 1.6097, -0.4032]]])
torch.Size([4, 8, 2])


- Tokens (8) are not talking to each other, and a token can only look backward. Info flows from previous context to future.
- Easiest way to communicate (from perspective of token) is to average past information. That becomes feature vector that sumarized me in the context of history. 
- Sum is weak form of interaction, it's extremely lossy

In [40]:
xbow = torch.zeros((B,T,C))

### Version 1 (Calculating Average)

In [41]:
for b in range(B):
    for t in range(T):
        xprev = x[b,:t+1]
        xbow[b,t] = torch.mean(xprev,0)


In [42]:
x[0] #Input

tensor([[ 0.1808, -0.0700],
        [-0.3596, -0.9152],
        [ 0.6258,  0.0255],
        [ 0.9545,  0.0643],
        [ 0.3612,  1.1679],
        [-1.3499, -0.5102],
        [ 0.2360, -0.2398],
        [-0.9211,  1.5433]])

In [43]:
xbow[0] # Mean

tensor([[ 0.1808, -0.0700],
        [-0.0894, -0.4926],
        [ 0.1490, -0.3199],
        [ 0.3504, -0.2238],
        [ 0.3525,  0.0545],
        [ 0.0688, -0.0396],
        [ 0.0927, -0.0682],
        [-0.0341,  0.1332]])

### Mathematical trick is that this calculation of mean can be done using code below

In [44]:
torch.manual_seed(42)
a=torch.tril(torch.ones(3,3))
a=a/torch.sum(a,1,keepdim=True)
b=torch.randint(0,10,(3,2)).float()
c = a @ b
print(a)
print(b)
print(c) #Mean of previous contexts

tensor([[1.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000],
        [0.3333, 0.3333, 0.3333]])
tensor([[2., 7.],
        [6., 4.],
        [6., 5.]])
tensor([[2.0000, 7.0000],
        [4.0000, 5.5000],
        [4.6667, 5.3333]])


### Version 2 (Matrix Multiplication using @)

In [45]:
a = torch.tril(torch.ones(T,T))
a=a/torch.sum(a,1,keepdim=True)

In [46]:
xbow2 = torch.zeros((B,T,C))
xbow2= a @ x # (T,T) @ (B, T, C) ---> (B, T, T) @ (B,T,C), for each each it'll multiply T,T by T,C

In [47]:
torch.allclose(xbow2,xbow)

True

### Version 3 (Using softmax)

In [48]:
tril = torch.tril(torch.ones(T,T))
weights = torch.zeros(T,T)
weights = weights.masked_fill(tril==0,float('-inf'))
print(weights)
weights = F.softmax(weights,dim=-1) #1 for row wise operation, 0 for column wise
print(weights)
xbow3=weights @ x
print(xbow3)


tensor([[0., -inf, -inf, -inf, -inf, -inf, -inf, -inf],
        [0., 0., -inf, -inf, -inf, -inf, -inf, -inf],
        [0., 0., 0., -inf, -inf, -inf, -inf, -inf],
        [0., 0., 0., 0., -inf, -inf, -inf, -inf],
        [0., 0., 0., 0., 0., -inf, -inf, -inf],
        [0., 0., 0., 0., 0., 0., -inf, -inf],
        [0., 0., 0., 0., 0., 0., 0., -inf],
        [0., 0., 0., 0., 0., 0., 0., 0.]])
tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.3333, 0.3333, 0.3333, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2500, 0.2500, 0.2500, 0.2500, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2000, 0.2000, 0.2000, 0.2000, 0.2000, 0.0000, 0.0000, 0.0000],
        [0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.0000, 0.0000],
        [0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.0000],
        [0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250]])
tensor([[[ 0.1

## Version 4
- Query = What am I looking for?
- Key = What do I contain?
- value = If you're interesting to me, here's the value
- Query dot products with all the keys, and dot product becomes weight. This is where communication comes into play
- weights are no longer independent/uniform
- Softmax will make sure, we are pulling more information from a token that has higher weight #Attention
- Cross Attention: Generate queries, and pull information from keys and values
- Scaled Dot-Product Attention: weights are fed into softmax, so we want those to be fairly difused. Otherwise softmax will be way too peaky. Scaling is used to control variance while initialization

In [49]:
torch.manual_seed(1337)
B,T,C = 4,8,32
x=torch.randn(B,T,C)

In [50]:
head_size = 16
key = nn.Linear(C, head_size, bias=False)
query = nn.Linear(C, head_size, bias=False)
value = nn.Linear(C, head_size, bias=False)
k=key(x)
q=query(x)
v = value(x)
weights = q @ k.transpose(-2,-1) # (B,T,16) @ (B,16,T) -----> (B,T,T)

In [51]:
weights[0]

tensor([[-1.7629, -1.3011,  0.5652,  2.1616, -1.0674,  1.9632,  1.0765, -0.4530],
        [-3.3334, -1.6556,  0.1040,  3.3782, -2.1825,  1.0415, -0.0557,  0.2927],
        [-1.0226, -1.2606,  0.0762, -0.3813, -0.9843, -1.4303,  0.0749, -0.9547],
        [ 0.7836, -0.8014, -0.3368, -0.8496, -0.5602, -1.1701, -1.2927, -1.0260],
        [-1.2566,  0.0187, -0.7880, -1.3204,  2.0363,  0.8638,  0.3719,  0.9258],
        [-0.3126,  2.4152, -0.1106, -0.9931,  3.3449, -2.5229,  1.4187,  1.2196],
        [ 1.0876,  1.9652, -0.2621, -0.3158,  0.6091,  1.2616, -0.5484,  0.8048],
        [-1.8044, -0.4126, -0.8306,  0.5898, -0.7987, -0.5856,  0.6433,  0.6303]],
       grad_fn=<SelectBackward0>)

In [52]:
tril = torch.tril(torch.ones(T,T))
weights = weights.masked_fill(tril==0,float('-inf')) # Future words are not allowed to communicate. If we delete this, Encoder block.
weights = F.softmax(weights,dim=-1) #1 for row wise operation, 0 for column wise. Normalize because of negative weights
print(weights[0])
xbow4=weights @ v
print(xbow4[0])


tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.1574, 0.8426, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2088, 0.1646, 0.6266, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.5792, 0.1187, 0.1889, 0.1131, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.0294, 0.1052, 0.0469, 0.0276, 0.7909, 0.0000, 0.0000, 0.0000],
        [0.0176, 0.2689, 0.0215, 0.0089, 0.6812, 0.0019, 0.0000, 0.0000],
        [0.1691, 0.4066, 0.0438, 0.0416, 0.1048, 0.2012, 0.0329, 0.0000],
        [0.0210, 0.0843, 0.0555, 0.2297, 0.0573, 0.0709, 0.2423, 0.2391]],
       grad_fn=<SelectBackward0>)
tensor([[-0.1571,  0.8801,  0.1615, -0.7824, -0.1429,  0.7468,  0.1007, -0.5239,
         -0.8873,  0.1907,  0.1762, -0.5943, -0.4812, -0.4860,  0.2862,  0.5710],
        [ 0.6764, -0.5477, -0.2478,  0.3143, -0.1280, -0.2952, -0.4296, -0.1089,
         -0.0493,  0.7268,  0.7130, -0.1164,  0.3266,  0.3431, -0.0710,  1.2716],
        [ 0.4823, -0.1069, -0.4055,  0.1770,  0

### Scaled Attention Weights

In [53]:
weights = q @ k.transpose(-2,-1) * head_size**-0.5 #Scaled: To get difused values
print(weights[0])
tril = torch.tril(torch.ones(T,T))
weights = weights.masked_fill(tril==0,float('-inf')) # Future words are not allowed to communicate. If we delete this, Encoder block.
weights = F.softmax(weights,dim=-1) #1 for row wise operation, 0 for column wise. Normalize because of negative weights
print(weights[0])
xbow4=weights @ v
print(xbow4[0])

tensor([[-0.4407, -0.3253,  0.1413,  0.5404, -0.2668,  0.4908,  0.2691, -0.1132],
        [-0.8334, -0.4139,  0.0260,  0.8446, -0.5456,  0.2604, -0.0139,  0.0732],
        [-0.2557, -0.3152,  0.0191, -0.0953, -0.2461, -0.3576,  0.0187, -0.2387],
        [ 0.1959, -0.2004, -0.0842, -0.2124, -0.1401, -0.2925, -0.3232, -0.2565],
        [-0.3142,  0.0047, -0.1970, -0.3301,  0.5091,  0.2160,  0.0930,  0.2314],
        [-0.0782,  0.6038, -0.0276, -0.2483,  0.8362, -0.6307,  0.3547,  0.3049],
        [ 0.2719,  0.4913, -0.0655, -0.0789,  0.1523,  0.3154, -0.1371,  0.2012],
        [-0.4511, -0.1031, -0.2077,  0.1475, -0.1997, -0.1464,  0.1608,  0.1576]],
       grad_fn=<SelectBackward0>)
tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.3966, 0.6034, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.3069, 0.2892, 0.4039, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.3233, 0.2175, 0.2443, 0.2149, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.14

## Self Attention Head (Sub-classing)

### Hyper Parameters

In [54]:
head_size=16
num_embeddings=32

In [55]:
class Head(nn.Module):
    """ one head of self attention layer """
    def __init__(self, head_size):
        super().__init__()
        self.key=nn.Linear(num_embeddings,head_size,bias=False)
        self.query=nn.Linear(num_embeddings,head_size,bias=False)
        self.value=nn.Linear(num_embeddings,head_size,bias=False)
        self.register_buffer('tril',torch.tril(torch.ones(block_size,block_size)))
    
    def forward(self,x):
        B,T,C=x.shape
        k=self.key(x)
        q=self.query(x)
        weights=q @ k.transpose(-2,-1) * C**-0.5
        weights = weights.masked_fill(self.tril[:T, :T]==0, float('-inf'))
        weights = F.softmax(weights,dim=-1)
        v=self.value(x)
        out = weights @ v
        return out

In [56]:
class BigramLanguageModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.token_embedding = nn.Embedding(vocab_size,num_embeddings)
        self.positional_embedding = nn.Embedding(block_size,num_embeddings)
        self.sa_head = Head(num_embeddings) #Constructor
        self.lm_head = nn.Linear(num_embeddings,vocab_size)

        def forward(self,idx,targets=None):
            B, T = idx.shape
            tok_embedding = self.token_embedding(idx)
            pos_embedding = self.positional_embedding(torch.arange(T))
            x = tok_embedding + pos_embedding
            x = self.sa_head(x)
            logits = self.lm_head(x)

            if targets is None:
                loss = None
            else:
                B, T, C = logits.shape
                logits = logits.view(B*T,C)
                targets = targets.view(B*T)
                loss = F.cros_entropy(logits,targets)

            return logits, loss
        
        def generate(self, idx, max_new_token):
            for _ in range(max_new_token):
                idx_context = idx[:,-block_size:]
                logits, loss = self(idx_context)
                logits = logits[:,-1,:]
                probs = F.softmax(logits,dim=-1)
                idx_next = torch.multinomial(probs, num_samples=1)
                idx = torch.cat([idx, idx_next],dim=1)
            return idx
        
model = BigramLanguageModel()


In [57]:
logits, loss = m(xb, yb)
print(loss)
print(decode(m.generate(torch.zeros((1,1), dtype=torch.long),max_new_tokens=20)[0].tolist()))
optimizer = torch.optim.AdamW(m.parameters(), lr=1e-3)

batch_size = 32
for steps in range(4000): # increase number of steps for good results... 
    
    # sample a batch of data
    xb, yb = get_batch('train')

    # evaluate the loss
    logits, loss = m(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()
    print(loss.item())

print(loss.item())
print(decode(m.generate(idx = torch.zeros((1, 1), dtype=torch.long), max_new_tokens=20)[0].tolist()))


tensor(2.4304, grad_fn=<NllLossBackward0>)

IAngrealo s the o tl
2.5001907348632812
2.446779489517212
2.6089789867401123
2.4678077697753906
2.4629228115081787
2.5546624660491943
2.510660171508789
2.475168228149414
2.4440133571624756
2.469287633895874
2.4038920402526855
2.402040719985962
2.3709256649017334
2.384351968765259
2.400057315826416
2.460076093673706
2.441765546798706
2.6365668773651123
2.3732516765594482
2.667811155319214
2.4769575595855713
2.388383388519287
2.438300371170044
2.545863151550293
2.46177077293396
2.335648536682129
2.415360927581787
2.466243267059326
2.5460896492004395
2.45493745803833
2.3612048625946045
2.4779245853424072
2.5149528980255127
2.4667410850524902
2.460874080657959
2.5323891639709473
2.5018887519836426
2.40777850151062
2.435865640640259
2.516979694366455
2.486631393432617
2.2959179878234863
2.464867353439331
2.408228874206543
2.4675991535186768
2.5777292251586914
2.514890193939209
2.5968196392059326
2.5113494396209717
2.537510395050049
2.4656081199645

## Multi Head Attention (Sub-classing)

In [60]:
num_head = 4
head_size = num_embeddings

In [61]:
class MultiHeadAttention(nn.Module):
    """ Multiple heads of self-attention at the same time (parallel) """
    def __init__(self, num_heads, head_size):
        super().__init__()
        self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])

    def forward(self, x):
        out = torch.cat([h(x) for h in self.heads], dim=-1)
        return out

In [62]:
class BigramLanguageModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.token_embedding = nn.Embedding(vocab_size,num_embeddings)
        self.positional_embedding = nn.Embedding(block_size,num_embeddings)
        self.sa_head = MultiHeadAttention(4,num_embeddings//4) #Constructor, dimensions /num_heads
        self.lm_head = nn.Linear(num_embeddings,vocab_size)

        def forward(self,idx,targets=None):
            B, T = idx.shape
            tok_embedding = self.token_embedding(idx)
            pos_embedding = self.positional_embedding(torch.arange(T))
            x = tok_embedding + pos_embedding
            x = self.sa_head(x)
            logits = self.lm_head(x)

            if targets is None:
                loss = None
            else:
                B, T, C = logits.shape
                logits = logits.view(B*T,C)
                targets = targets.view(B*T)
                loss = F.cros_entropy(logits,targets)

            return logits, loss
        
        def generate(self, idx, max_new_token):
            for _ in range(max_new_token):
                idx_context = idx[:,-block_size:]
                logits, loss = self(idx_context)
                logits = logits[:,-1,:]
                probs = F.softmax(logits,dim=-1)
                idx_next = torch.multinomial(probs, num_samples=1)
                idx = torch.cat([idx, idx_next],dim=1)
            return idx
        
model = BigramLanguageModel()


In [63]:
logits, loss = m(xb, yb)
print(loss)
print(decode(m.generate(torch.zeros((1,1), dtype=torch.long),max_new_tokens=20)[0].tolist()))
optimizer = torch.optim.AdamW(m.parameters(), lr=1e-3)

batch_size = 32
for steps in range(40000): # increase number of steps for good results... 
    
    # sample a batch of data
    xb, yb = get_batch('train')

    # evaluate the loss
    logits, loss = m(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()
    print(loss.item())

print(loss.item())
print(decode(m.generate(idx = torch.zeros((1, 1), dtype=torch.long), max_new_tokens=20)[0].tolist()))


tensor(2.4823, grad_fn=<NllLossBackward0>)

Angh n, os teateu, i
2.5148730278015137
2.5037457942962646
2.4023752212524414
2.51910400390625
2.315217971801758
2.439984083175659
2.480412006378174
2.547316789627075
2.5549817085266113
2.4816479682922363
2.4519951343536377
2.406818151473999
2.4232616424560547
2.5144143104553223
2.4414560794830322
2.5442087650299072
2.490612268447876
2.348327398300171
2.4988369941711426
2.390486240386963
2.458831310272217
2.3356435298919678
2.519507646560669
2.414645195007324
2.4130520820617676
2.3581736087799072
2.4398281574249268
2.4384095668792725
2.4893157482147217
2.5111920833587646
2.295835018157959
2.492094039916992
2.3793435096740723
2.445437431335449
2.4736921787261963
2.5052874088287354
2.4767332077026367
2.4940052032470703
2.4632699489593506
2.4631905555725098
2.3230979442596436
2.4137001037597656
2.326563596725464
2.5626132488250732
2.48193621635437
2.5058658123016357
2.5005829334259033
2.5128095149993896
2.567938804626465
2.5164120197296143
2.532

## Addind Non-Linearity Layer (Feed Forward/MLP) to improve on performance

In [64]:
class FeedFoward(nn.Module):
    """ a simple linear layer followed by a non-linearity """

    def __init__(self, num_embeddings):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(num_embeddings,num_embeddings),
            nn.ReLU()
        )

    def forward(self, x):
        return self.net(x)

In [65]:
class BigramLanguageModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.token_embedding = nn.Embedding(vocab_size,num_embeddings)
        self.positional_embedding = nn.Embedding(block_size,num_embeddings)
        self.sa_head = MultiHeadAttention(4,num_embeddings//4) #Constructor, dimensions /num_heads
        self.ffwl = FeedFoward(num_embeddings)
        self.lm_head = nn.Linear(num_embeddings,vocab_size)

        def forward(self,idx,targets=None):
            B, T = idx.shape
            tok_embedding = self.token_embedding(idx)
            pos_embedding = self.positional_embedding(torch.arange(T))
            x = tok_embedding + pos_embedding
            x = self.sa_head(x)
            x=self.ffwl(x) #Self attention gathers data, now each token needs to think on gethered data. FFWL is at token level
            logits = self.lm_head(x)

            if targets is None:
                loss = None
            else:
                B, T, C = logits.shape
                logits = logits.view(B*T,C)
                targets = targets.view(B*T)
                loss = F.cros_entropy(logits,targets)

            return logits, loss
        
        def generate(self, idx, max_new_token):
            for _ in range(max_new_token):
                idx_context = idx[:,-block_size:]
                logits, loss = self(idx_context)
                logits = logits[:,-1,:]
                probs = F.softmax(logits,dim=-1)
                idx_next = torch.multinomial(probs, num_samples=1)
                idx = torch.cat([idx, idx_next],dim=1)
            return idx
        
model = BigramLanguageModel()


In [66]:
logits, loss = m(xb, yb)
print(loss)
print(decode(m.generate(torch.zeros((1,1), dtype=torch.long),max_new_tokens=20)[0].tolist()))
optimizer = torch.optim.AdamW(m.parameters(), lr=1e-3)

batch_size = 32
for steps in range(4000): # increase number of steps for good results... 
    
    # sample a batch of data
    xb, yb = get_batch('train')

    # evaluate the loss
    logits, loss = m(xb, yb)
    
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()
    print(loss.item())

print(loss.item())
print(decode(m.generate(idx = torch.zeros((1, 1), dtype=torch.long), max_new_tokens=20)[0].tolist()))


tensor(2.4631, grad_fn=<NllLossBackward0>)




ther, e osttinn a
2.3576393127441406
2.3644776344299316
2.4896316528320312
2.5599255561828613
2.384725332260132
2.5885391235351562
2.51776123046875
2.410203218460083
2.3905303478240967
2.4440183639526367
2.5589675903320312
2.496825933456421
2.525472402572632
2.403090000152588
2.412121295928955
2.494861125946045
2.3655622005462646
2.380577325820923
2.4818429946899414
2.434269905090332
2.4044203758239746
2.585071563720703
2.400428056716919
2.3552184104919434
2.4701974391937256
2.3806746006011963
2.380993127822876
2.4083235263824463
2.4597513675689697
2.44577956199646
2.4955389499664307
2.3348934650421143
2.5718767642974854
2.4544308185577393
2.539985418319702
2.4572970867156982
2.5474677085876465
2.417982578277588
2.449031352996826
2.5039923191070557
2.514106512069702
2.4353091716766357
2.480503797531128
2.3282968997955322
2.5426812171936035
2.4714019298553467
2.445408344268799
2.46502423286438
2.408963441848755
2.591815948486328
2.439187765

## Adding Layer to perform multi head attention several times

In [67]:
class Block(nn.Module):
    """ Transformer block: communication followed by computation """

    def __init__(self, num_embeddings, num_head):
        # n_embd: embedding dimension, n_head: the number of heads we'd like
        super().__init__()
        head_size = num_embeddings // num_head
        self.sa = MultiHeadAttention(num_head, head_size)
        self.ffwd = FeedFoward(num_embeddings)

    def forward(self, x):
        x = self.sa(x)
        x = self.ffwd(x)
        return x

In [68]:
class BigramLanguageModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.token_embedding = nn.Embedding(vocab_size,num_embeddings)
        self.positional_embedding = nn.Embedding(block_size,num_embeddings)
        self.blocks = nn.Sequential(
            Block(num_embeddings,num_head),
            Block(num_embeddings,num_head),
            Block(num_embeddings,num_head),
            Block(num_embeddings,num_head)
        )
        self.lm_head = nn.Linear(num_embeddings,vocab_size)

        def forward(self,idx,targets=None):
            B, T = idx.shape
            tok_embedding = self.token_embedding(idx)
            pos_embedding = self.positional_embedding(torch.arange(T))
            x = tok_embedding + pos_embedding
            x = self.sa_head(x)
            x=self.ffwl(x) #Self attention gathers data, now each token needs to think on gethered data. FFWL is at token level
            logits = self.lm_head(x)

            if targets is None:
                loss = None
            else:
                B, T, C = logits.shape
                logits = logits.view(B*T,C)
                targets = targets.view(B*T)
                loss = F.cros_entropy(logits,targets)

            return logits, loss
        
        def generate(self, idx, max_new_token):
            for _ in range(max_new_token):
                idx_context = idx[:,-block_size:]
                logits, loss = self(idx_context)
                logits = logits[:,-1,:]
                probs = F.softmax(logits,dim=-1)
                idx_next = torch.multinomial(probs, num_samples=1)
                idx = torch.cat([idx, idx_next],dim=1)
            return idx
        
model = BigramLanguageModel()


In [69]:
logits, loss = m(xb, yb)
print(loss)
print(decode(m.generate(torch.zeros((1,1), dtype=torch.long),max_new_tokens=20)[0].tolist()))
optimizer = torch.optim.AdamW(m.parameters(), lr=1e-3)

batch_size = 32
for steps in range(4000): # increase number of steps for good results... 
    
    # sample a batch of data
    xb, yb = get_batch('train')

    # evaluate the loss
    logits, loss = m(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()
    print(loss.item())

print(loss.item())
print(decode(m.generate(idx = torch.zeros((1, 1), dtype=torch.long), max_new_tokens=20)[0].tolist()))


tensor(2.3963, grad_fn=<NllLossBackward0>)


S:
H:

Vicat ownd i
2.4699761867523193
2.5386221408843994
2.549417734146118
2.4646809101104736
2.4192025661468506
2.357398271560669
2.5470240116119385
2.4918713569641113
2.4001433849334717
2.585144519805908
2.366675615310669
2.4441452026367188
2.352658748626709
2.528549909591675
2.5191266536712646
2.3911261558532715
2.4600749015808105
2.4602162837982178
2.3826444149017334
2.5652859210968018
2.549144983291626
2.4342777729034424
2.465576410293579
2.4035484790802
2.376861333847046
2.4362664222717285
2.420112133026123
2.5330231189727783
2.410273313522339
2.3498494625091553
2.4909143447875977
2.467827558517456
2.395599365234375
2.3996307849884033
2.360292434692383
2.5348892211914062
2.405832052230835
2.5210373401641846
2.465214252471924
2.699907064437866
2.3596761226654053
2.424995183944702
2.60159969329834
2.3411331176757812
2.3077232837677
2.3688700199127197
2.533541202545166
2.4155426025390625
2.4072463512420654
2.498931884765625
2.55109286308

## Adding Residual & Layer Norm connections in the mix
- How's layer norm different from Batch norm. Layer norm happend along columns, whereas batch norm is along rows

In [70]:
class Block(nn.Module):
    """ Transformer block: communication followed by computation """

    def __init__(self, num_embeddings, num_head):
        # n_embd: embedding dimension, n_head: the number of heads we'd like
        super().__init__()
        head_size = num_embeddings // num_head
        self.sa = MultiHeadAttention(num_head, head_size)
        self.ffwd = FeedFoward(num_embeddings)
        self.ln1 = nn.LayerNorm(num_embeddings)
        self.ln2 = nn.LayerNorm(num_embeddings)

    def forward(self, x):
        x = x+self.sa(self.ln1(x)) #Residual connection x+
        x = x+self.ffwd(self.ln2(x))
        return x

In [71]:
class MultiHeadAttention(nn.Module):
    """ Multiple heads of self-attention at the same time (parallel) """
    def __init__(self, num_heads, head_size):
        super().__init__()
        self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
        self.proj = nn.Linear(num_embeddings,num_embeddings)
    def forward(self, x):
        out = torch.cat([h(x) for h in self.heads], dim=-1)
        out = self.proj(out)
        return out

In [72]:
dropout=0.2

In [209]:
class FeedFoward(nn.Module):
    """ a simple linear layer followed by a non-linearity """

    def __init__(self, num_embeddings):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(num_embeddings,4*num_embeddings), # 4 beacuse that's how in the paper
            nn.ReLU(),
            nn.Linear(4*num_embeddings,num_embeddings),
            nn.Dropout(dropout)
        )

    def forward(self, x):
        return self.net(x)

In [73]:
class BigramLanguageModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.token_embedding = nn.Embedding(vocab_size,num_embeddings)
        self.positional_embedding = nn.Embedding(block_size,num_embeddings)
        self.blocks = nn.Sequential(
            Block(num_embeddings,num_head),
            Block(num_embeddings,num_head),
            Block(num_embeddings,num_head),
            Block(num_embeddings,num_head)
        )
        self.lm_head = nn.Linear(num_embeddings,vocab_size)

        def forward(self,idx,targets=None):
            B, T = idx.shape
            tok_embedding = self.token_embedding(idx)
            pos_embedding = self.positional_embedding(torch.arange(T))
            x = tok_embedding + pos_embedding
            x = self.sa_head(x)
            x=self.ffwl(x) #Self attention gathers data, now each token needs to think on gethered data. FFWL is at token level
            logits = self.lm_head(x)

            if targets is None:
                loss = None
            else:
                B, T, C = logits.shape
                logits = logits.view(B*T,C)
                targets = targets.view(B*T)
                loss = F.cros_entropy(logits,targets)

            return logits, loss
        
        def generate(self, idx, max_new_token):
            for _ in range(max_new_token):
                idx_context = idx[:,-block_size:]
                logits, loss = self(idx_context)
                logits = logits[:,-1,:]
                probs = F.softmax(logits,dim=-1)
                idx_next = torch.multinomial(probs, num_samples=1)
                idx = torch.cat([idx, idx_next],dim=1)
            return idx
        
model = BigramLanguageModel()


In [74]:
logits, loss = m(xb, yb)
print(loss)
print(decode(m.generate(torch.zeros((1,1), dtype=torch.long),max_new_tokens=20)[0].tolist()))
optimizer = torch.optim.AdamW(m.parameters(), lr=1e-3)

batch_size = 32
for steps in range(10000): # increase number of steps for good results... 
    
    # sample a batch of data
    xb, yb = get_batch('train')

    # evaluate the loss
    logits, loss = m(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()
    print(loss.item())

print(loss.item())
print(decode(m.generate(idx = torch.zeros((1, 1), dtype=torch.long), max_new_tokens=200)[0].tolist()))


tensor(2.4665, grad_fn=<NllLossBackward0>)

LI and Bu oviend ake
2.431062936782837
2.3889424800872803
2.4903175830841064
2.50071382522583
2.481351137161255
2.4524643421173096
2.5830066204071045
2.4423720836639404
2.5145955085754395
2.420123815536499
2.434779405593872
2.5226848125457764
2.394036293029785
2.4915456771850586
2.5232489109039307
2.3850514888763428
2.355703592300415
2.45180344581604
2.460207939147949
2.3460607528686523
2.409930467605591
2.4229493141174316
2.5467936992645264
2.407788038253784
2.4095852375030518
2.5175864696502686
2.408353090286255
2.3472936153411865
2.5125503540039062
2.462489604949951
2.42582368850708
2.4433670043945312
2.389559268951416
2.339508056640625
2.4712581634521484
2.5304007530212402
2.4072940349578857
2.4529004096984863
2.509228229522705
2.400325059890747
2.515897035598755
2.30935001373291
2.450500249862671
2.4181442260742188
2.558894634246826
2.4667859077453613
2.534528970718384
2.490715742111206
2.4568188190460205
2.455155372619629
2.481343507766