In [None]:
# for text wrap in colab window

from IPython.display import HTML, display

def set_css():
  """Emit a <style> element that makes <pre> output wrap long lines."""
  wrap_style = '''
  <style>
    pre {
        white-space: pre-wrap;
    }
  </style>
  '''
  display(HTML(wrap_style))
# run set_css before every cell execution so the rule is present for each cell's output
get_ipython().events.register('pre_run_cell', set_css)

In [None]:
# required to use torchtext
pip install portalocker

Collecting portalocker
  Downloading portalocker-2.8.2-py3-none-any.whl (17 kB)
Installing collected packages: portalocker
Successfully installed portalocker-2.8.2


In [None]:
import re
from torchtext.datasets import WikiText2
import portalocker.utils as portalocker

# Get data

In [None]:
# Load the train / validation / test splits of WikiText2, using '.data' as the dataset root directory
training, validation, testing = WikiText2(
    root='.data',
    split=('train', 'valid', 'test'),
)

In [None]:
# Text cleaning:
# Wikitext comes with some defined symbol notations, e.g. @-@ denotes a hyphen, @.@ denotes a decimal point
# I have normalized the above into regular symbols.
# <unk> words have been retained as removal caused loss of contextual information

# Placeholders for the cleaned train / validation / test text.
# They are overwritten further down once text_getter() has processed each split.
text1 = ""
text2 = ""
text3 = ""

def text_getter(dpipe):
    """Clean every line yielded by ``dpipe`` and return them as one string.

    Args:
        dpipe: an iterable of strings (e.g. one WikiText2 split).

    Returns:
        str: the cleaned lines concatenated in iteration order; ``""`` when
        ``dpipe`` yields nothing.

    Cleaning steps applied to each line:
      1. WikiText escape sequences ``@.@``, ``@-@`` and ``@,@`` become the
         plain symbol surrounded by spaces.
      2. Typographic dashes, bullets and curly quotes are folded into one
         canonical character each.
      3. Every character outside the whitelist below is removed. Angle
         brackets are not whitelisted, so ``<unk>`` is kept as ``unk``.
      4. Runs of spaces are collapsed to a single space.
    """
    # Collect the pieces and join once at the end instead of growing a string
    # with += on every line of the corpus.
    cleaned_lines = []
    for line in dpipe:
        line = line.replace("@.@", " . ")
        line = line.replace("@-@", " - ")
        line = line.replace("@,@", " , ")
        line = re.sub(r'[–—-]', '-', line)
        line = re.sub(r'[•・･]', '•', line)
        line = re.sub(r'[“”]', '"', line)
        line = re.sub(r'[‘’]', "'", line)
        # whitelist of characters that survive cleaning; anything else is dropped
        pattern = r'[^\w\-•:},\[){(°₹£$¥₡€～′″\'",+＝*÷%|…!.∕;@\\?&_ \s]'
        line = re.sub(pattern, '', line)
        # collapse consecutive spaces (newlines are left untouched)
        line = re.sub(r'(?<!\n) +| +(?=\n)|(?<=\n) +', ' ', line)
        cleaned_lines.append(line)
    return "".join(cleaned_lines)


# Clean each split into a single string, then release the raw datapipes.
text1 = text_getter(training)
text2 = text_getter(validation)
text3 = text_getter(testing)
del training, validation, testing


# Using train + validation data as training data and test as validation data.
# There is no "test" data as the aim is to create an auto regressive text generator.
train = text1 + text2
valid = text3

# The whole corpus (training text followed by held-out text) as one string.
full_text = train + valid

# The per-split strings are no longer needed once train/valid/full_text exist.
del text1, text2, text3

# Tokenize and encode text

In [None]:
import torch
import torch.nn as nn
from torch.nn import functional as F
from tokenizers import ByteLevelBPETokenizer

In [None]:
# Create a byte-level BPE tokenizer and fit it with a maximum vocabulary size of 500 tokens.
# Larger vocabulary sizes (>=1000) can learn more nuanced relationships between words but are prone to overfitting,
# whereas smaller vocabulary sizes (<100) cannot learn contextual information properly (underfitting).
# The tokenizer is fitted on the training text only: fitting on the held-out text as well would leak
# information about the validation data into the vocabulary used to evaluate the model.
tokenizer = ByteLevelBPETokenizer()
vocab_size = 500
tokenizer.train_from_iterator([train], vocab_size=vocab_size, min_frequency=2)

In [None]:
# Encode each split with the tokenizer fitted above and store the token ids
# as 1-D int64 (torch.long) tensors, one id per token.
train_data = torch.tensor(tokenizer.encode(train).ids, dtype=torch.long)
valid_data = torch.tensor(tokenizer.encode(valid).ids, dtype=torch.long)

# Data modelling

In [None]:
block_size = 128    # context length: number of tokens in each training sequence
batch_size = 128    # number of sequences sampled per batch
n_embd = 512        # embedding dimension of each token
ma_head = 8         # number of attention heads in each block
n_blocks = 4        # number of stacked (attention + feed-forward) blocks
eval_iters = 100    # number of random batches averaged for each loss estimate
learning_rate = 3e-3    # initial learning rate passed to the optimizer
max_iters = 6000        # total number of training iterations
eval_interval = 500     # estimate train/validation loss every this many iterations
device = "cuda" if  torch.cuda.is_available() else "cpu"    # use the GPU when one is available
dropout = 0.2           # dropout probability used by every nn.Dropout layer in the model

In [None]:
# Seed PyTorch's global random number generator so that runs are repeatable.
torch.manual_seed(9)


# Each sampled sequence of block_size tokens holds many training examples at once:
# x[:1] is the context for target y[0], x[:2] is the context for y[1], and so on,
# giving block_size (context, target) pairs per sequence.
# get_batch() draws batch_size such sequences from random locations in the chosen split.
def get_batch(split):
    """Sample a random batch of input/target sequences.

    Args:
        split: "train" selects train_data; any other value selects valid_data.

    Returns:
        (x, y): two (batch_size, block_size) tensors moved to ``device``,
        where y is x shifted one token to the right in the source data.
    """
    source = train_data if split == "train" else valid_data
    # highest valid start index still leaves room for block_size inputs plus one extra target token
    starts = torch.randint(len(source) - block_size, (batch_size,))
    inputs = torch.stack([source[s: s + block_size] for s in starts])
    targets = torch.stack([source[s + 1: s + block_size + 1] for s in starts])
    return inputs.to(device), targets.to(device)


# loss estimation averaged over several randomly sampled batches
@torch.no_grad()
def estimate_loss():
    """Return the mean loss over eval_iters random batches for each split.

    The model is put in eval mode for the measurement and switched back to
    train mode before returning.

    Returns:
        dict: {"train": mean_loss, "valid": mean_loss}, each a 0-dim tensor.
    """
    model.eval()
    averages = {}
    for split_name in ("train", "valid"):
        batch_losses = torch.zeros(eval_iters)
        for step in range(eval_iters):
            inputs, targets = get_batch(split_name)
            _, batch_loss = model(inputs, targets)
            batch_losses[step] = batch_loss.item()
        averages[split_name] = batch_losses.mean()
    model.train()
    return averages



# class Head() performs scaled, masked self-attention over a single head
# masked - each position only receives information from itself and earlier positions
# scaled - scores are multiplied by head_size**-0.5 so the softmax input stays diffuse
#          and does not peak at a single point (keeping the system probabilistic)
class Head(nn.Module):
    """One head of causal self-attention.

    Args:
        head_size: output dimension of the key, query and value projections.
    """

    def __init__(self, head_size):
        super().__init__()
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)
        # lower-triangular mask; a buffer so it follows the module across devices without being trained
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Attend over x of shape (B, T, n_embd) and return (B, T, head_size)."""
        T = x.shape[1]
        # generating keys & queries of head_size
        k = self.key(x)
        q = self.query(x)

        # wei determines the importance to be given to each token based on key & query.
        # The scale uses the key dimension (head_size), not the embedding width of x:
        # q and k live in head_size dimensions, so that is what sets the variance of q @ k^T.
        wei = (q @ k.transpose(-2, -1)) * k.shape[-1] ** -0.5
        # positions in the future get -inf so softmax assigns them zero weight
        wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf'))
        wei = F.softmax(wei, dim=-1)
        wei = self.dropout(wei)

        # weighted sum of the value projections
        v = self.value(x)
        out = wei @ v

        return out


# runs several attention heads in parallel and merges their outputs
class MultiHeadAttention(nn.Module):
    """Several Head modules applied to the same input, concatenated and projected.

    Args:
        num_heads: number of parallel attention heads.
        head_size: output dimension of each head.
    """

    def __init__(self, num_heads, head_size):
        super().__init__()
        self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
        # projection into residual path; its input width is the concatenated head output,
        # which equals n_embd only when n_embd is divisible by num_heads
        self.proj = nn.Linear(num_heads * head_size, n_embd)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Return the projected concatenation of all head outputs for x."""
        out = torch.cat([h(x) for h in self.heads], dim=-1)
        out = self.dropout(self.proj(out))
        return out


# position-wise feed-forward network applied after the attention sub-layer
class FeedForward(nn.Module):
    """Linear -> ReLU -> Linear -> Dropout, with a hidden width of 4 * n_embd.

    Args:
        n_embd: input and output feature dimension.
    """

    def __init__(self, n_embd):
        super().__init__()
        hidden_dim = 4 * n_embd
        layers = [
            nn.Linear(n_embd, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, n_embd),    # projection into residual path
            nn.Dropout(dropout),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Apply the network to the last dimension of x."""
        return self.net(x)


# one repeatable unit of the network: self-attention followed by feed-forward
class Block(nn.Module):
    """Pre-norm block: each sub-layer sees a layer-normalised input and its
    output is added back onto the residual stream.

    Args:
        n_embd: feature dimension of the residual stream.
        n_head: number of attention heads; each gets n_embd // n_head dimensions.
    """

    def __init__(self, n_embd, n_head):
        super().__init__()
        self.sa = MultiHeadAttention(n_head, n_embd // n_head)
        self.ffwd = FeedForward(n_embd)
        self.ln1 = nn.LayerNorm(n_embd)
        self.ln2 = nn.LayerNorm(n_embd)

    def forward(self, x):
        """Return x after the attention and feed-forward residual updates."""
        attended = x + self.sa(self.ln1(x))
        return attended + self.ffwd(self.ln2(attended))



class BigramLanguageModel(nn.Module):
    """Autoregressive language model: token + position embeddings, a stack of
    attention/feed-forward blocks, a final layer norm and a linear output head.

    Args:
        vocab_size: number of distinct token ids (embedding rows and output logits).
    """

    def __init__(self, vocab_size):
        super().__init__()
        # embedding each token to n_embd dimensions
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        # embedding each position in context length to n_embd dimensions
        self.position_embedding_table = nn.Embedding(block_size, n_embd)
        self.blocks = nn.Sequential(*[Block(n_embd, n_head=ma_head) for _ in range(n_blocks)])
        self.ln_f = nn.LayerNorm(n_embd)
        # linear layer mapping each n_embd-dimensional hidden state to vocab_size logits
        self.lm_head = nn.Linear(n_embd, vocab_size)


    def forward(self, idx, targets=None):
        """Compute next-token logits and, when targets are given, the loss.

        Args:
            idx: (B, T) tensor of token ids, with T no larger than block_size.
            targets: optional (B, T) tensor of the token ids to predict.

        Returns:
            (logits, loss): without targets, logits is (B, T, vocab_size) and
            loss is None; with targets, logits is flattened to
            (B*T, vocab_size) and loss is the cross-entropy over all positions.
        """
        B, T = idx.shape
        tok_embd = self.token_embedding_table(idx)
        # build the positions on the same device as the input rather than on a module-level global
        pos_embd = self.position_embedding_table(torch.arange(T, device=idx.device))
        x = tok_embd + pos_embd
        x = self.blocks(x)
        x = self.ln_f(x)
        logits = self.lm_head(x)

        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            # cross_entropy expects (N, C) logits against (N,) class indices
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    # generate function
    @torch.no_grad()
    def generate(self, idx, max_new_tokens):
        """Sample max_new_tokens further tokens after the prompt idx.

        Sampling runs without gradient tracking and with the module in eval
        mode, so dropout does not perturb the predicted distribution; the
        previous train/eval mode is restored afterwards.

        Args:
            idx: (B, T) tensor of prompt token ids.
            max_new_tokens: number of tokens to append to each sequence.

        Returns:
            (B, T + max_new_tokens) tensor containing the prompt and the samples.
        """
        was_training = self.training
        self.eval()
        try:
            for _ in range(max_new_tokens):
                # the position table only covers block_size positions, so crop the context
                idx_cond = idx[:, -block_size:]
                logits, _ = self(idx_cond)
                # only the prediction at the last position is needed
                logits = logits[:, -1, :]
                probs = F.softmax(logits, dim=-1)
                idx_next = torch.multinomial(probs, num_samples=1)
                idx = torch.cat((idx, idx_next), dim=1)
        finally:
            self.train(was_training)

        return idx


# build the network and move its parameters to the selected device
model = BigramLanguageModel(vocab_size).to(device)

# AdamW with a small weight decay as regularisation
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=learning_rate,
    weight_decay=1e-4,
)
# StepLR multiplies the learning rate by gamma once every step_size scheduler steps
scheduler = torch.optim.lr_scheduler.StepLR(
    optimizer,
    step_size=300,
    gamma=0.8,
)

# Training loop: one optimizer update per iteration on a freshly sampled batch.
# `step` is used as the loop variable so the builtin iter() is not shadowed.
for step in range(1, max_iters + 1):

    # report estimated losses at the first step and every eval_interval steps,
    # and a lightweight progress line every 100 steps in between
    if step == 1 or step % eval_interval == 0:
        losses = estimate_loss()
        print(f"step {step}: train loss {losses['train']:.4f}, val loss {losses['valid']:.4f}")

    elif step % 100 == 0:
        print(f"step {step}")

    xb, yb = get_batch('train')

    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()
    # advance the schedule after the optimizer update; without this call the
    # learning rate never leaves its initial value
    scheduler.step()


step 1: train loss 6.3845, val loss 6.3910
step 100
step 200
step 300
step 400
step 500: train loss 2.6930, val loss 2.6746
step 600
step 700
step 800
step 900
step 1000: train loss 2.4551, val loss 2.4772
step 1100
step 1200
step 1300
step 1400
step 1500: train loss 2.3309, val loss 2.3839
step 1600
step 1700
step 1800
step 1900
step 2000: train loss 2.2447, val loss 2.3187
step 2100
step 2200
step 2300
step 2400
step 2500: train loss 2.1866, val loss 2.2850
step 2600
step 2700
step 2800
step 2900
step 3000: train loss 2.1391, val loss 2.2547
step 3100
step 3200
step 3300
step 3400
step 3500: train loss 2.1051, val loss 2.2368
step 3600
step 3700
step 3800
step 3900
step 4000: train loss 2.0738, val loss 2.2255
step 4100
step 4200
step 4300
step 4400
step 4500: train loss 2.0473, val loss 2.1972
step 4600
step 4700
step 4800
step 4900
step 5000: train loss 2.0209, val loss 2.1912
step 5100
step 5200
step 5300
step 5400
step 5500: train loss 2.0079, val loss 2.1811
step 5600
step 5700


In [None]:
# generating new tokens starting with 0 index token as input
# context has shape (1, 1): a batch of one sequence that holds only token id 0
context = torch.zeros((1, 1), dtype=torch.long, device=device)
# sample 200 further tokens, take the single sequence in the batch and decode its ids back to text
print(tokenizer.decode(model.generate(context, max_new_tokens=200)[0].tolist()))

! a superiority of human City , court - the court is the best of shoping in fourth century and installment tourists in the ethnic public . This event was was turned to complete reporter , focusing its character in vein , better and insane and not onward . The creation of the leldest of Croatian depict Morning program . A widow of the castrop , he substances he banned the past . I you know , but you tokin the conspiracy as a human shall like , Italy


In [None]:
# generating new tokens starting with a randomly chosen token id as input
# the upper bound is vocab_size (exclusive) so the sampled id always matches the model's
# embedding table, even if the vocabulary size is changed in the tokenizer cell
context = torch.randint(0, vocab_size, (1, 1), dtype=torch.long, device=device)
print(tokenizer.decode(model.generate(context, max_new_tokens=200)[0].tolist()))

lection of the Winhes . These pure companies added tropical charges by a rain at grasses combined for making markets , and the tracker 's Day Sisters ' Tide television cents . In the seventh street group of the foot , Miss Soky Railway , follows the merge in trap into it in the wind . About 110 people of Meyer - Germany , helps unk the unk Missouri push hospital the Yugoslav Bridge . The Flower Bridgeon is


In [None]:
# Persist the model's parameters and buffers (its state_dict) rather than the module object itself.
# NOTE(review): "/content" is an absolute path, presumably the Colab working directory; confirm before running elsewhere.
torch.save(model.state_dict(), "/content/llmodel.pt")

In [None]:
# Save the fitted tokenizer next to the model weights so generated ids can be decoded later.
# NOTE(review): "/content" is an absolute path, presumably the Colab working directory; confirm before running elsewhere.
tokenizer.save("/content/bpe.tokenizer.json")