## Log

Bigram: train 2.3679




In [70]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import math

In [2]:
txt = open('shakespeare.txt', 'r').read()
len(txt)

5447119

In [67]:
txt[0:500]

"                     1\n  From fairest creatures we desire increase,\n  That thereby beauty's rose might never die,\n  But as the riper should by time decease,\n  His tender heir might bear his memory:\n  But thou contracted to thine own bright eyes,\n  Feed'st thy light's flame with self-substantial fuel,\n  Making a famine where abundance lies,\n  Thy self thy foe, to thy sweet self too cruel:\n  Thou that art now the world's fresh ornament,\n  And only herald to the gaudy spring,\n  Within thine own bud"

In [73]:
chars = list(set(txt))
chars.sort()

ctoi = {c:i for i, c in enumerate(chars)}
itoc = {i:c for i, c in enumerate(chars)}
vocab_size = len(chars)

print("".join(chars))
print(vocab_size)


 !"&'(),-.0123456789:;<>?ABCDEFGHIJKLMNOPQRSTUVWXYZ[]_`abcdefghijklmnopqrstuvwxyz|}
84


In [72]:
i =  math.floor(0.9 * len(txt))
train_txt = txt[0:i]
valid_txt = txt[i+1:]

len(train_txt), len(valid_txt)

(4902407, 544711)

In [76]:
train_tkns = [ctoi[c] for c in train_txt]
valid_tkns = [ctoi[c] for c in valid_txt]

In [78]:
block_size = 8
batch_size = 32

def txt_to_token(t):
    return [ctoi[c] for c in t]
    
# (B, L)
def random_batch():
    xi = torch.randint(0, len(train_tkns)-block_size, (batch_size,))
    x = torch.tensor([train_tkns[i:i+block_size] for i in xi])
    y = torch.tensor([train_tkns[i+1:i+block_size+1] for i in xi])
    
    return x, y

x, y = random_batch()
x.shape

torch.Size([32, 8])

In [7]:
x[0]

tensor([ 1, 31, 73, 56, 69, 58, 60,  8])

In [21]:
class BigramModel(nn.Module):    
    
    def __init__(self):
        super().__init__()
        self.embed = nn.Embedding(vocab_size, vocab_size)

    # (B, L) -> (B, L, C)
    def forward(self, x):
        y = self.embed(x)
        return y
            
model = BigramModel()
optim = torch.optim.Adam(model.parameters())
model(x).shape

torch.Size([32, 8, 84])

In [94]:
def eval_split(split, model):
    split = "train"

    tkn = train_tkns if split == "train" else valid_tkns
    tkn = torch.tensor(tkn)

    bsize = len(tkn) // block_size
    x = tkn[0:bsize*block_size]
    y = tkn[1:bsize*block_size+1]
    x = x.view(bsize, block_size) # (B, L)
    y = y.view(bsize, block_size) # (B, L)
    
    logits = model(x) # (B, L, C)
    B, L, C = logits.shape
    loss = F.cross_entropy(logits.view(B*L, C), y.view(B*L))
    
    return loss.item()

In [22]:
def sample():
    max_len = 500
    tks = [0]*block_size

    for i in range(max_len):
        ctx = torch.tensor(tks[i:i+block_size]) # (L)
        ctx = ctx.view(1, -1) # (B, L)

        logits = model(ctx) # (B, L, C)
        probs = F.softmax(logits, dim=2) # (B, L, C)
        probs = probs[0,-1,:] # (C), # the last in the sequence is the newly generated
        yi = torch.multinomial(probs, 1)
        tks.append(yi.item())

    chars = [itoc[t] for t in tks]
    return "".join(chars)

In [23]:
print(sample())









(U`p!HF3yYP5<<gS,3(V!Xsh
&
krziPoKoKUOKQP[a7-tK6NO2aAK;'l[?(LYhyJyU7jrj<,W1W FG_;S< :`X]69Wq)})0}FriwX3sYa)jK8;1D,(xaF52Y
2f(]>ar 5((eH`DJyp}(:nC0Mdpvgy(mVpz
_0?dQM!i!WpG&
o|6!j(oeOh>aqBucl`&'Wxt]>FGd'e)m8.LpYT [Vvg&o;uw72W|knl8&aZEe[b74p]>49y?COP>G,QYS ulf_9uOnZwbY>eaI!]CZpV1RBrO0]Uqqt4kMXR|g-;&bfNt]dkjTCIVln[
1b0UUDKoIQ8<|XgZtuOyvY[iXfsgy)c; wQ4R`Eh<N3<-Rx5E:Vw3yxzl8d2Pr0GDt]
T;S}ehHNrjX1b]>7bj;tvGXiZRQ?;J&f`<NduS&}V7biZ:tn648UB:-A24E-k0scYTWJ -
R9r&;eCEstbN7mpcA1xcZi0x U[cYPS|a0-ZtI]_VucYTC07


In [26]:
for i in range(50000):
    optim.zero_grad()

    xb, yb = random_batch()
    logits = model(xb) # (B, L, C)

    B, L, C = logits.shape
    loss = F.cross_entropy(logits.view(B*L, C), yb.view(B*L))
    
    loss.backward()
    optim.step()
    
    if i % 5000 == 0:
        print(f"{loss.item():.4f}")

2.4246
2.4870
2.4671
2.4864
2.3995
2.3815
2.4440
2.4851
2.5340
2.3679


In [95]:
tr_loss = eval_split("train", model)
va_loss = eval_split("valid", model)

print(f"train: {tr_loss:.4f}")
print(f"valid: {va_loss:.4f}")
print(f"baseline: {-torch.tensor(1/vocab_size).log():.4f}")

train: 2.4716
valid: 2.4716
baseline: 4.4308


In [25]:
print(sample())









 s      layoukeconce s hyofor, IDEPe mem, ack'sour  mu  LERCO.
  yot  As? Fawincuiote beado o sticldiro SAy 
 eanorntemechorthininghe,
   BE  
 ilare  t ppr    QUE  MAy  h  m But,
  I    id
  ver.
 FRThesel oo cenouserecthane  t   henghes; emp  TORDALExelth S.  y.
  Moues petze  ofome bendoknd po  bofie  ande    n HI t. ad gafuchy HAn feach is stoull.
   CThe rtede f  mo tizo hiemisthe EDowith y-el  le wing IZESAnspit    rt hrilshee t t sp o   k  s thoulthast,  h ou INo  HARI Oflon'dirisor   d!



In [56]:
# Attension mechanism

head_size = 5
x = torch.rand(4, block_size, vocab_size) # (B, L, C)

key = nn.Linear(vocab_size, head_size)
query = nn.Linear(vocab_size, head_size)
value = nn.Linear(vocab_size, head_size)

k = key(x)   # (B, L, C)
q = query(x) # (B, L, C)
v = value(x) # (B, L, C)

q = q.permute(0, 2, 1) # (B, C, L)
w = k @ q # (B, L, L)

B, L, C = k.shape
mask = torch.tril(torch.ones(L, L))
mask = mask == 0
w = w.masked_fill(mask, -float('inf'))

prob = F.softmax(w, dim=2) # (B, L, L) TODO: not sure which dimension
a = prob @ v # (B, L, L) @ # (B, L, C)
a.shape # (B, L, C)

torch.Size([4, 8, 5])

In [65]:
class Attention(nn.Module):    
    
    def __init__(self, head_size):
        super().__init__()
        
        self.head_size = head_size
        self.key = nn.Linear(vocab_size, head_size)
        self.query = nn.Linear(vocab_size, head_size)
        self.value = nn.Linear(vocab_size, head_size)
        
    # (B, L, C)  ->  (B, L, C')  
    def forward(self, x):
        k = self.key(x)   # (B, L, C')
        q = self.query(x) # (B, L, C')
        v = self.value(x) # (B, L, C')
        
        # TODO: other way than permute?
        q = q.permute(0, 2, 1) # (B, C', L)
        w = k @ q # (B, L, L)
        w /= self.head_size**0.5
        
        B, L, C = k.shape
        mask = torch.tril(torch.ones(L, L))
        mask = mask == 0
        w = w.masked_fill(mask, -float('inf'))

        prob = F.softmax(w, dim=2) # (B, L, L)
        a = prob @ v # (B, L, C')

        return a

In [66]:
x = torch.rand(4, block_size, vocab_size)
att = Attention(6)
att(x).shape

torch.Size([4, 8, 6])

In [None]:
hidden_size = 100

class Transformer(nn.Module):    
    
    def __init__(self):
        super().__init__()
        self.embed = nn.Embedding(vocab_size, hidden_size)

    # (B, L) -> (B, L, C)
    def forward(self, x):
        y = self.embed(x)
        return y
            
model = BigramModel()
optim = torch.optim.Adam(model.parameters())
model(x).shape