In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from matplotlib import pyplot as plt

# EDA

In [2]:
text = open("Dataset/shakesphere.txt").read()

In [3]:
vocab = sorted(set(text))
print(f"Vocabulary : {vocab}")
print(f"Vocab size : {len(vocab)}")

Vocabulary : ['\n', ' ', '!', '$', '&', "'", ',', '-', '.', '3', ':', ';', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']
Vocab size : 65


# Dataset Preprocessing

In [4]:
stoi = {ch:i+1 for i, ch in enumerate(vocab[1:])}
itos = {i+1:ch for i, ch in enumerate(vocab[1:])}
stoi['\n'] = 0
itos[0] = '\n'
encode = lambda text: [stoi[ch] for ch in text]
decode = lambda ix: ''.join([itos[i] for i in ix])
text = torch.tensor(encode(text))
n1 = int(0.9 * len(text))
train_data = text[:n1]
val_data = text[n1:]

In [5]:
def get_batch(mode):
    data = train_data if mode == "train" else val_data
    batch_size = 4
    block_size = 8
    ix = torch.randint(len(data)-block_size, (batch_size,))
    x = torch.stack([data[i : i+block_size] for i in ix])
    y = torch.stack([data[i+1 : i+block_size+1] for i in ix])
    return x, y

# Bigram Model Building

In [211]:
class BigramLanguageModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, ix, targets = None):
        logits = self.token_embedding_table(ix)
        
        if targets == None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    def generate(self, ix, max_new_tokens):
        for _ in range(max_new_tokens):
            logits, loss = self(ix)
            logits = logits[:, -1, :]
            p_dis = F.softmax(logits, dim = -1)
            next_ix = torch.multinomial(p_dis, num_samples = 1)
            ix = torch.cat((ix, next_ix), dim = -1)
        return ix

In [212]:
torch.manual_seed(1337)
vocab_size = 65
model = BigramLanguageModel()
print(f"Parameter count : {sum([p.nelement() for p in model.parameters()])}")
print(decode(model.generate(torch.zeros((1, 1), dtype = torch.long), 100).tolist()[0]))

Parameter count : 4225

Sr?qP-QWktXoL&jLDJgOLVz'RIoDqHdhsV&vLLxatjscMpwLERSPyao.qfzs$Ys$zF-w,;eEkzxjgCKFChs!iWW.ObzDnxA Ms$3


In [213]:
#Train
optimizer = torch.optim.AdamW(model.parameters(), lr = 0.03)
for i in range(1000):
    x, y = get_batch("train")
    logits, loss = model(x, y)
    optimizer.zero_grad(set_to_none = True)
    loss.backward()
    optimizer.step()
    if i%100 == 0:
        print(loss)

tensor(4.5648, grad_fn=<NllLossBackward0>)
tensor(3.4341, grad_fn=<NllLossBackward0>)
tensor(2.7348, grad_fn=<NllLossBackward0>)
tensor(2.8343, grad_fn=<NllLossBackward0>)
tensor(2.6273, grad_fn=<NllLossBackward0>)
tensor(2.6538, grad_fn=<NllLossBackward0>)
tensor(2.6867, grad_fn=<NllLossBackward0>)
tensor(2.4891, grad_fn=<NllLossBackward0>)
tensor(2.5951, grad_fn=<NllLossBackward0>)
tensor(2.7333, grad_fn=<NllLossBackward0>)


In [214]:
torch.manual_seed(1337)
print(decode(model.generate(torch.zeros((1, 1), dtype = torch.long), 100).tolist()[0]))


JOHul avere co.
CouCKI:
HABe y, crd wh tarrisoth
Y werariathitoth lll d wnf thye m f fQ's ve PRine f


In [215]:
#Evaluation
with torch.no_grad():
    for i in range(1000):
        x, y = get_batch("eval")
        logits, loss = model(x, y)
        if i%100 == 0:
            print(loss)

tensor(2.4673)
tensor(2.6564)
tensor(2.7202)
tensor(2.3033)
tensor(2.2906)
tensor(2.2546)
tensor(2.3078)
tensor(2.4898)
tensor(2.3975)
tensor(2.5163)


# Transformer Model Building

In [6]:
class Head(nn.Module):
    def __init__(self, head_size):
        super().__init__()
        self.query = nn.Linear(n_embd, head_size, bias = False)
        self.key = nn.Linear(n_embd, head_size, bias = False)
        self.value = nn.Linear(n_embd, head_size, bias = False)
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))

    def forward(self, x):
        B, T, C = x.shape
        q = self.query(x)
        k = self.key(x)
        v = self.value(x)
        att_sc = q @ k.transpose(-2, -1)
        att_sc = att_sc.masked_fill(self.tril[:T, :T] == 0, float("-inf"))
        att_sc = F.softmax(att_sc, dim = -1)
        out = att_sc @ v
        return out


class MultiHeadAttention(nn.Module):
    def __init__(self, num_heads, head_size):
        super().__init__()
        self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
        
    def forward(self, x):
        return torch.cat([h(x) for h in self.heads], dim = -1)


class FeedForward(nn.Module):
    def __init__(self):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(n_embd, n_embd),
            nn.ReLU()
        )

    def forward(self, x):
        return self.net(x)
        

class Block(nn.Module):
    def __init__(self, n_heads):
        super().__init__()
        head_size = n_embd//n_heads
        self.sa_head = MultiHeadAttention(n_heads, head_size)
        self.ffwd = FeedForward()

    def forward(self, x):
        x = self.sa_head(x)
        x = self.ffwd(x)
        return x


class Transformer(nn.Module):
    def __init__(self):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        self.position_embedding_table = nn.Embedding(block_size, n_embd)
        self.blocks = nn.Sequential(
            Block(4),
            Block(4),
            Block(4),
            Block(4)
        )
        self.lm_head = nn.Linear(n_embd, vocab_size, bias = False)

    def forward(self, x, targets = None):
        B, T = x.shape
        tok_emb = self.token_embedding_table(x)
        pos_emb = self.position_embedding_table(torch.arange(T))
        x = tok_emb + pos_emb
        x = self.blocks(x)
        logits = self.lm_head(x)

        if targets == None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, vocab_size)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)
        return logits, loss

    def generate(self, ix, max_new_tokens):
        out = ix
        for _ in range(max_new_tokens):
            logits, loss = self(ix)
            logits = logits[:, -1, :]
            p_dis = F.softmax(logits, dim = -1)
            next_ix = torch.multinomial(p_dis, num_samples = 1)
            if ix.shape[1] < block_size:
                ix = torch.cat((ix, next_ix), dim = -1)
            else:
                ix = torch.cat((ix[:, 1:], next_ix), dim = -1)
            out = torch.cat((out, next_ix), dim = -1)
        return out

In [7]:
@torch.no_grad()
def estimate_loss():
    model.eval()
    out = {}
    for split in ["train", 'val']:
        losses = torch.zeros(eval_iters)
        for i in range(eval_iters):
            x, y = get_batch(split)
            logits, loss = model(x, y)
            losses[i] = loss.item()
        out[split] = losses.mean().item()
    model.train()
    return out

In [8]:
torch.manual_seed(1337)
block_size = 8
vocab_size = 65
n_embd = 32

model = Transformer()
print(f"Parameter count : {sum([p.nelement() for p in model.parameters()])}")
print(decode(model.generate(torch.zeros(1,1, dtype = torch.long), 100)[0].tolist()))

Parameter count : 20928

iZWcKRDegcgzQamM.FXwHUL$..sLv'vuwajJf Pj$YG;uMWK;$T,H-!VBEK!BshNTIsbhGpkxZ?,BGml!x?oUM-L&r:Q'rMOOHZS


In [9]:
#Train
max_iters = 5000
eval_iters = 200
eval_interval = 500
optimizer = torch.optim.AdamW(model.parameters(), lr = 1e-3)
for i in range(max_iters):
    if i%eval_interval == 0:
        losses = estimate_loss()
        print(f"step {i}   : train_loss : {losses["train"]:2f}    val_loss : {losses["val"]:2f}")
    x, y = get_batch("train")
    logits, loss = model(x, y)
    optimizer.zero_grad(set_to_none = True)
    loss.backward()
    optimizer.step()

step 0   : train_loss : 4.168072    val_loss : 4.169550
step 500   : train_loss : 3.369318    val_loss : 3.369343
step 1000   : train_loss : 3.448972    val_loss : 3.504546
step 1500   : train_loss : 3.345427    val_loss : 3.357195
step 2000   : train_loss : 3.329371    val_loss : 3.383845
step 2500   : train_loss : 3.321426    val_loss : 3.392511
step 3000   : train_loss : 3.292060    val_loss : 3.382539
step 3500   : train_loss : 3.326870    val_loss : 3.355074
step 4000   : train_loss : 3.339852    val_loss : 3.324258
step 4500   : train_loss : 3.321361    val_loss : 3.367865


In [282]:
torch.manual_seed(1337)
out = decode(model.generate(torch.zeros(1,1, dtype = torch.long), 100)[0].tolist())
print(out)


Juoslc hheh co.
CouChn: LAor yi crh wo ca wiroe 
Ynodraoia,g;t te lll dlen
It,ne hon ftoihv E niae f


In [278]:
with torch.no_grad():
    loss_ls = []
    for i in range(1000):
        x, y = get_batch("eval")
        logits, loss = model(x, y)
        loss_ls.append(loss.item())
        if i%100 == 0:
            print(loss)

tensor(2.4404)
tensor(2.5505)
tensor(2.6389)
tensor(2.0694)
tensor(2.1598)
tensor(2.0800)
tensor(2.3222)
tensor(2.3897)
tensor(2.4044)
tensor(2.5176)
