In [127]:
# imports
import torch
import torch.nn as nn
from torch.nn import functional as F

In [128]:
#loading master txt file with lyrics
with open('input.txt', 'r', encoding='utf-8') as f:
    text = f.read()

In [129]:
print(len(text))
# print(text[:1000])

457793


# Bigram


In [130]:
# extracting vocabulary (tokens)
chars = sorted(list(set(text)))
vocab_size = len(chars)
print(''.join(chars))
print(vocab_size)


 !"$&'(),-.0123456789:;?ABCDEFGHIJKLMNOPQRSTUVWXYZ[]abcdefghijklmnopqrstuvwxyzéíïóе ​–—‘’” 
92


In [131]:
# writing encoder and decoder
stoi = {ch:i for i,ch in enumerate(chars)}
itos = {i:ch for i,ch in enumerate(chars)}
encode = lambda s: [stoi[c] for c in s]
decode = lambda l: ''.join([itos[i] for i in l])

print(encode('Taylor Swift'))
print(decode(encode('Taylor Swift')))

[44, 53, 77, 64, 67, 70, 1, 43, 75, 61, 58, 72]
Taylor Swift


In [132]:
# generating encoding for entire dataset
data = torch.tensor(encode(text), dtype = torch.long)
print(data.shape, data.dtype)
# print(data[:1000])

torch.Size([457793]) torch.int64


In [133]:
# split into training and validation set
train_val_split_index = int(0.9*len(data))
train_data = data[:train_val_split_index]
val_data = data[train_val_split_index:]
print(train_val_split_index)

412013


In [134]:
# defining block size
block_size = 8
train_data[:block_size+1]

tensor([33,  1, 70, 57, 65, 57, 65, 54, 57])

In [135]:
# context and target (time dimension)
x = train_data[:block_size]
y = train_data[1:block_size+1]
for t in range(block_size):
    context = x[:t+1]
    target = y[t]
    print(f'context: {context}, target: {target}')


context: tensor([33]), target: 1
context: tensor([33,  1]), target: 70
context: tensor([33,  1, 70]), target: 57
context: tensor([33,  1, 70, 57]), target: 65
context: tensor([33,  1, 70, 57, 65]), target: 57
context: tensor([33,  1, 70, 57, 65, 57]), target: 65
context: tensor([33,  1, 70, 57, 65, 57, 65]), target: 54
context: tensor([33,  1, 70, 57, 65, 57, 65, 54]), target: 57


In [136]:
torch.manual_seed(1337)
block_size = 8
batch_size = 4

def get_batch(split:bool):
    data = train_data if split else val_data
    ix = torch.randint(len(data)-block_size, (batch_size,))
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
    return x,y

xb, yb = get_batch(True)
print('inputs')
print(xb.shape)
print(xb)
print('targets')
print(yb.shape)
print(yb)

print('------')

for b in range(batch_size):
    for t in range(block_size):
        context = xb[b, :t+1]
        target = yb[b, t]
        print(f'context: {context.tolist()}, target: {target}')

inputs
torch.Size([4, 8])
tensor([[65, 67, 71, 72,  1, 53, 66, 77],
        [33,  1, 56, 67, 66,  6, 72,  1],
        [71, 77, 71, 72, 57, 65,  6, 71],
        [57,  1, 72, 60, 57,  1, 64, 73]])
targets
torch.Size([4, 8])
tensor([[67, 71, 72,  1, 53, 66, 77, 72],
        [ 1, 56, 67, 66,  6, 72,  1, 64],
        [77, 71, 72, 57, 65,  6, 71,  1],
        [ 1, 72, 60, 57,  1, 64, 73, 55]])
------
context: [65], target: 67
context: [65, 67], target: 71
context: [65, 67, 71], target: 72
context: [65, 67, 71, 72], target: 1
context: [65, 67, 71, 72, 1], target: 53
context: [65, 67, 71, 72, 1, 53], target: 66
context: [65, 67, 71, 72, 1, 53, 66], target: 77
context: [65, 67, 71, 72, 1, 53, 66, 77], target: 72
context: [33], target: 1
context: [33, 1], target: 56
context: [33, 1, 56], target: 67
context: [33, 1, 56, 67], target: 66
context: [33, 1, 56, 67, 66], target: 6
context: [33, 1, 56, 67, 66, 6], target: 72
context: [33, 1, 56, 67, 66, 6, 72], target: 1
context: [33, 1, 56, 67, 66, 6, 

In [137]:
class BigramLanguageModel(nn.Module):
    def __init__(self, vocab_size):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, idx, targets=None):
        logits = self.token_embedding_table(idx)
        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)
        return logits, loss
    
    def generate(self, idx, max_new_tokens):
        for _ in range(max_new_tokens):
            logits, loss = self(idx)
            logits = logits[:, -1, :]   # (B, C)
            probs = F.softmax(logits, dim=1)    # (B, C)
            idx_next = torch.multinomial(probs, num_samples=1)  # (B, 1)
            idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)
        return idx
    
m = BigramLanguageModel(vocab_size)
logits, loss = m(xb, yb)
print(logits.shape)
print(loss)

print(decode(m.generate(idx = torch.zeros((1, 1), dtype=torch.long), max_new_tokens=100)[0].tolist()))

torch.Size([32, 92])
tensor(4.9328, grad_fn=<NllLossBackward0>)

?--KGt;ïTIExTBYw8s4-3?M’1w(c&IrTfJíqi0B?! 10K$Ed6nMdo)v,n?Je2Dp&sW;MzL
Z9jVpE”J;TN–Pfq ?BA ?w5B?gói(


In [138]:
# Pytorch optimizer
optimizer = torch.optim.AdamW(m.parameters(), lr=1e-3)

In [143]:
batch_size = 32
for steps in range(1000):
    xb, yb = get_batch(True)
    logits, loss = m(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()
print(loss.item())

2.4300053119659424


In [144]:
print(decode(m.generate(idx = torch.zeros((1, 1), dtype=torch.long), max_new_tokens=500)[0].tolist()))



Ié hock
AfVíve h5ht​2]
Thep, t be chmidóJfu w I ftJGith​4d llis nghthtightinLovió9é[”Pód,
I'zlancos at
Weve con'st ytt ve’r t "ODzGalimsoKiu tid t be iM6teddour dBis

I'ms,I' my I cod owo t e-Curs ise”2L2ve"
Iz ct on kowayom m hel "
Aved--reU
Yo t cicoome cameyou 0I jïDI nouprd bromit ystil ouhair c]at?

Busck"trn'pumwaraï—‘Rofo, (Dl8ve pre s ore
Jd hitrs ow
IrT yotit Lorither ankyOoMndPKBuinengw f felberschist fongelfu aslks, by


Myeenaryod)
I usl 1Sinf Ser V5ve d upesemy”0Pl c r sedfou e mih


# Self Attention

In [145]:
# toy example illustrating how matrix multiplication can be used for a "weighted aggregation"
torch.manual_seed(42)
a = torch.tril(torch.ones(3, 3))
a = a / torch.sum(a, 1, keepdim=True)
b = torch.randint(0,10,(3,2)).float()
c = a @ b
print('a=')
print(a)
print('--')
print('b=')
print(b)
print('--')
print('c=')
print(c)

a=
tensor([[1.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000],
        [0.3333, 0.3333, 0.3333]])
--
b=
tensor([[2., 7.],
        [6., 4.],
        [6., 5.]])
--
c=
tensor([[2.0000, 7.0000],
        [4.0000, 5.5000],
        [4.6667, 5.3333]])


In [146]:
torch.manual_seed(1337)
B,T,C = 4,8,2 # batch, time, channels
x = torch.randn(B,T,C)
x.shape

torch.Size([4, 8, 2])

In [147]:
# version 1: x[b,t] = mean_{i<=t} x[b,i]
xbow = torch.zeros((B,T,C))
for b in range(B):
    for t in range(T):
        xprev = x[b, :t+1]
        xbow[b,t] = torch.mean(xprev,0)

print(x[0])
print(xbow[0])

tensor([[ 0.1808, -0.0700],
        [-0.3596, -0.9152],
        [ 0.6258,  0.0255],
        [ 0.9545,  0.0643],
        [ 0.3612,  1.1679],
        [-1.3499, -0.5102],
        [ 0.2360, -0.2398],
        [-0.9211,  1.5433]])
tensor([[ 0.1808, -0.0700],
        [-0.0894, -0.4926],
        [ 0.1490, -0.3199],
        [ 0.3504, -0.2238],
        [ 0.3525,  0.0545],
        [ 0.0688, -0.0396],
        [ 0.0927, -0.0682],
        [-0.0341,  0.1332]])


In [148]:
# version 2: using matrix multiply for a weighted aggregation
wei = torch.tril(torch.ones(T, T))
wei = wei / wei.sum(1, keepdim=True)
xbow2 = wei @ x # (B, T, T) @ (B, T, C) ----> (B, T, C)
torch.allclose(xbow, xbow2)
# print(xbow[0])
# print(xbow2[0])

False

In [149]:
# version 3: use Softmax
tril = torch.tril(torch.ones(T, T))
wei = torch.zeros((T,T))
wei = wei.masked_fill(tril == 0, float('-inf'))
wei = F.softmax(wei, dim=-1)
xbow3 = wei @ x
torch.allclose(xbow, xbow3)

False

In [151]:
# version 4: self-attention!
torch.manual_seed(1337)
B,T,C = 4,8,32 # batch, time, channels
x = torch.randn(B,T,C)

# let's see a single Head perform self-attention
head_size = 16
key = nn.Linear(C, head_size, bias=False)
query = nn.Linear(C, head_size, bias=False)
value = nn.Linear(C, head_size, bias=False)
k = key(x)   # (B, T, 16)
q = query(x) # (B, T, 16)
wei =  q @ k.transpose(-2, -1) # (B, T, 16) @ (B, 16, T) ---> (B, T, T)

tril = torch.tril(torch.ones(T, T))
#wei = torch.zeros((T,T))
wei = wei.masked_fill(tril == 0, float('-inf'))
wei = F.softmax(wei, dim=-1)

v = value(x)
out = wei @ v
#out = wei @ x

out.shape

torch.Size([4, 8, 16])

In [126]:
wei[0]

tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.1574, 0.8426, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2088, 0.1646, 0.6266, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.5792, 0.1187, 0.1889, 0.1131, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.0294, 0.1052, 0.0469, 0.0276, 0.7909, 0.0000, 0.0000, 0.0000],
        [0.0176, 0.2689, 0.0215, 0.0089, 0.6812, 0.0019, 0.0000, 0.0000],
        [0.1691, 0.4066, 0.0438, 0.0416, 0.1048, 0.2012, 0.0329, 0.0000],
        [0.0210, 0.0843, 0.0555, 0.2297, 0.0573, 0.0709, 0.2423, 0.2391]],
       grad_fn=<SelectBackward0>)

In [152]:
k = torch.randn(B,T,head_size)
q = torch.randn(B,T,head_size)
wei = q @ k.transpose(-2, -1) * head_size**-0.5

In [153]:
k.var()

tensor(1.0449)

In [154]:
q.var()

tensor(1.0700)

In [155]:
wei.var()

tensor(1.0918)

In [156]:
torch.softmax(torch.tensor([0.1, -0.2, 0.3, -0.2, 0.5]), dim=-1)

tensor([0.1925, 0.1426, 0.2351, 0.1426, 0.2872])

In [157]:
torch.softmax(torch.tensor([0.1, -0.2, 0.3, -0.2, 0.5])*8, dim=-1) # gets too peaky, converges to one-hot

tensor([0.0326, 0.0030, 0.1615, 0.0030, 0.8000])

In [158]:
class LayerNorm1d: # (used to be BatchNorm1d)

  def __init__(self, dim, eps=1e-5, momentum=0.1):
    self.eps = eps
    self.gamma = torch.ones(dim)
    self.beta = torch.zeros(dim)

  def __call__(self, x):
    # calculate the forward pass
    xmean = x.mean(1, keepdim=True) # batch mean
    xvar = x.var(1, keepdim=True) # batch variance
    xhat = (x - xmean) / torch.sqrt(xvar + self.eps) # normalize to unit variance
    self.out = self.gamma * xhat + self.beta
    return self.out

  def parameters(self):
    return [self.gamma, self.beta]

torch.manual_seed(1337)
module = LayerNorm1d(100)
x = torch.randn(32, 100) # batch size 32 of 100-dimensional vectors
x = module(x)
x.shape

torch.Size([32, 100])

In [159]:
x[:,0].mean(), x[:,0].std() # mean,std of one feature across all batch inputs

(tensor(0.1469), tensor(0.8803))

In [160]:
x[0,:].mean(), x[0,:].std() # mean,std of a single input from the batch, of its features

(tensor(-9.5367e-09), tensor(1.0000))