# **Building a word-level language model**

In [4]:
import torch
import torch.nn as nn
from torch.nn import functional as F
import matplotlib.pyplot as plt
from tqdm import tqdm
import re
import os 
torch.manual_seed(1337)

ModuleNotFoundError: No module named 'torch'

In [2]:
# HYPERPARAMETERS
# Module-level settings read by the dataset, model and training cells below.

batch_size = 128 # number of (input, target) windows per batch
block_size = 256 # window length (context size, in tokens)
n_embd = 384 # dimension of embedding vector
n_head = 6 # number of heads in multi_head attention
n_layers = 6 # number of decoder layers Nx
dropout_rate = 0.3 # dropout probability used inside the attention and feed-forward layers
eval_iters = 200 # take average of eval_iters output while evaluating
eval_interval = 500 # evaluate every eval_interval (NOTE(review): not referenced in the visible cells)
learning_rate = 3e-4 # passed to the AdamW optimizer in main()
train_ratio = 0.9 # train-test-split

# Set prev_model to True to resume training from the checkpoint at checkpoint_path;
# checkpoint_path is only defined in that case.
prev_model = False
if prev_model:
    checkpoint_path = "/kaggle/working/checkpoint-2.pth.tar"

# Use the GPU when one is available, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [4]:
# books = os.listdir("/kaggle/input/bookcropus/books1/epubtxt")[:6]
# print(books)

['jataka-tales-volume-5.epub.txt', 'diligent-ambassadors.epub.txt', 'holiday-romance-short-stories-book-two.epub.txt', 'reclaiming-your-soul-working-with-the-mystery.epub.txt', 'high-council-files-nickys-tale.epub.txt', 'hearts-in-darkness.epub.txt']


In [6]:
# text = ""
# for book in books:
#     with open("/kaggle/input/bookcropus/books1/epubtxt/"+book, 'r', encoding='utf-8') as f:
#         a_book = f.read()
#         a_book = a_book.lower()
#     text += a_book

# Read the whole corpus into memory and lower-case it, so the vocabulary
# does not distinguish between capitalised and plain forms of a word.
with open("datasets/tiny-shakespeare.txt", 'r', encoding='utf-8') as f:
    a_book = f.read().lower()
text = a_book

In [8]:
# Corpus size: raw character count and a whitespace-split word count.
# The word count is only a rough figure; the tokenizer further down also emits
# whitespace runs and punctuation marks as separate tokens.
print(f"Number of characters: {len(text)}")
print(f"Number of words: {len(text.split())}") #number of words

Number of characters: 1115394
Number of words: 202651


In [9]:
# Tokenizing: every token is either a run of word characters (\w+), a run of
# whitespace ([^\w\S]+), or one remaining non-word character ([^\w]).
# The vocabulary is the sorted collection of distinct tokens.
vocab_list = sorted(set(re.findall(r'\w+|[^\w\S]+|[^\w]', text)))
vocab_size = len(vocab_list)
print(f"Vocab size: {vocab_size}")

Vocab size: 11473


In [10]:
# Lookup tables between tokens and integer ids; ids follow the order of vocab_list.
stoi = {token: token_id for token_id, token in enumerate(vocab_list)}
itos = {token_id: token for token, token_id in stoi.items()}


def encode(x):
    """Return the list of integer ids for an iterable of tokens."""
    return [stoi[token] for token in x]


def decode(x):
    """Return the string formed by concatenating the tokens of the given ids."""
    return ''.join(itos[token_id] for token_id in x)

In [11]:
# Tokenize the full corpus with the same regex that built the vocabulary and
# store the resulting token ids as a tensor; print its shape and the first
# 100 ids as a sanity check.
data = torch.tensor(encode(re.findall(r'\w+|[^\w\S]+|[^\w]', text)))
print(data.shape)
print(data[:100])

NameError: name 'torch' is not defined

## **Train-Test split**

In [None]:
# Contiguous split: the first `train_ratio` fraction of the token stream is
# used for training and the remainder is held out for testing.
n = int(len(data) * train_ratio)
x_train, x_test = data[:n], data[n:]

In [None]:
class CustomDataset(torch.utils.data.Dataset):
    """Sliding-window dataset over a tensor of token ids.

    Item ``idx`` is a pair ``(inputs, targets)`` of ``block_size``-long slices
    taken along the first dimension of ``x``; ``targets`` is the same window
    shifted one position to the right (next-token prediction).
    """

    def __init__(self, x):
        super().__init__()
        self.x = x

    def __len__(self):
        # One window per start position that still leaves room for the
        # one-step-shifted target slice.
        total_tokens = self.x.shape[0]
        return total_tokens - block_size

    def __getitem__(self, idx):
        start, stop = idx, idx + block_size
        inputs = self.x[start:stop]
        targets = self.x[start + 1:stop + 1]
        return inputs, targets

In [None]:
def prepare_dataloader(dataset, batch_size):
    """Wrap a token tensor in a shuffled DataLoader of sliding windows.

    Args:
        dataset: token-id tensor handed to ``CustomDataset``.
        batch_size: number of windows per batch.

    Returns:
        A ``torch.utils.data.DataLoader`` over ``CustomDataset(dataset)`` with
        shuffling enabled and pinned memory disabled.
    """
    windows = CustomDataset(dataset)
    loader_options = {
        'batch_size': batch_size,
        'shuffle': True,
        'pin_memory': False,
    }
    return torch.utils.data.DataLoader(dataset=windows, **loader_options)

# **Self-Attention**
* ### Many ways to implement self-attention.
* ### Masking needs to be implemented, which means we don't look at future tokens for prediction (we only look at past tokens).
* ### The simplest idea to determine the affinity/attention is to take an average of all the past tokens in each channel.
* ### The best way is to use the softmax as mentioned in the paper

In [None]:
class Head(nn.Module):
    """One head of causal (masked) scaled dot-product self-attention.

    Args:
        head_size: output dimension of the key/query/value projections
            (d_k in the paper). Inside multi-head attention this is
            ``n_embd // n_head``, not the full embedding width.

    Input to ``forward`` is ``(B, T, n_embd)``; the output is
    ``(B, T, head_size)``. ``T`` must not exceed ``block_size``.
    """

    def __init__(self, head_size):
        super().__init__()
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)
        # The lower-triangular causal mask is a constant, not something to
        # learn, so it is registered as a buffer: it moves with the module on
        # .to(device) and is kept in the state dict.
        self.register_buffer('tril', torch.tril(torch.ones((block_size, block_size))))
        self.dropout = nn.Dropout(dropout_rate)

    def forward(self, x):
        """Return the attention output for ``x`` of shape (B, T, n_embd)."""
        T = x.shape[1]
        k = self.key(x)    # (B, T, head_size)
        q = self.query(x)  # (B, T, head_size)
        v = self.value(x)  # (B, T, head_size)

        # (B, T, hs) @ (B, hs, T) --> (B, T, T): affinity of every token with
        # every other token inside the window.
        wei = q @ k.transpose(-2, -1)
        # Scale by sqrt(d_k), i.e. the head size (the last dim of k), to keep
        # the variance of the scores under control before the softmax. The
        # input width n_embd is NOT the right scale.
        wei = wei / k.shape[-1] ** 0.5
        # Reuse the registered mask (already on the module's device) so a
        # token can only attend to itself and to earlier positions.
        wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf'))
        wei = F.softmax(wei, dim=-1)
        wei = self.dropout(wei)

        return wei @ v  # (B, T, T) @ (B, T, hs) --> (B, T, hs)

# **Multi-Head Attention**

* ### The idea is to simply concatenate a number of self-attention heads.
* ### The output dimension after concatenation is equal to n_embd (i.e. the input dimension of the multi-head attention layer).
* ### There is also a projection layer (a linear layer) after the multi-head attention layer. The projection layer is required because we implement skip-connections: the output of the multi-head attention is added to the residual pathway, so we first perform the projection and then add the result to the residual pathway (as mentioned in the paper).

In [None]:
class MultiHeadAttention(nn.Module):
    """Several self-attention heads run in parallel, concatenated and projected.

    Args:
        num_heads: number of ``Head`` modules to run side by side.
        head_size: output dimension of each head.

    ``forward`` maps ``(B, T, n_embd)`` to ``(B, T, n_embd)``.
    """

    def __init__(self, num_heads, head_size):
        super().__init__()
        self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
        # The projection consumes the concatenated head outputs, whose width
        # is num_heads * head_size. That equals n_embd only when n_embd is
        # divisible by num_heads, so size the layer from the actual width.
        self.projection = nn.Linear(num_heads * head_size, n_embd)
        self.dropout = nn.Dropout(dropout_rate)

    def forward(self, x):
        """Concatenate all head outputs on the channel dim, then project."""
        out = torch.cat([h(x) for h in self.heads], dim=-1)
        # Project back to n_embd and apply dropout before the caller adds the
        # result to the residual pathway.
        out = self.dropout(self.projection(out))
        return out

# **Feed Forward Network**

* ### The feed-forward network in the transformer is applied on a per-token basis (i.e. all the tokens do this independently).
* ### Self-attention is the communication between the tokens; once they have gathered all the data, they have to "think" on that data independently, which is done by the FFN.
* ### FFN, too, contains the projection layer before adding up to the residual pathway.
* ### " This consists of two linear transformations with a ReLU activation in between." (as mentioned in the paper)
* ### Note that the dimension of the inner layer is multiplied by 4 as mentioned in the paper. "The dimensionality of input and output is dmodel = 512, and the inner-layer has dimensionality dff = 2048." (2048/512=4)

In [None]:
class FeedForwardNetwork(nn.Module):
    """Per-token feed-forward network: Linear -> ReLU -> Linear -> Dropout.

    Args:
        n_embd: input and output width; the hidden layer is four times wider.
    """

    def __init__(self, n_embd):
        super().__init__()
        hidden_dim = 4 * n_embd
        layers = [
            nn.Linear(n_embd, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, n_embd),  # the projection layer
            nn.Dropout(dropout_rate),
        ]
        self.network = nn.Sequential(*layers)

    def forward(self, x):
        """Apply the network to ``x``; the last dimension keeps its size."""
        out = self.network(x)
        return out

# **Implementation of block**
* ### We are implementing decoder block in this project to be more specific.
* ### There are *Nx* number of blocks as mentioned in the paper.
* ### Intersperse communication with computation (meaning a block contains attention (communication) and an FFN (computation)) and replicate the same structure *Nx* times.
* ### Skip-connection implemented for both attention sub-layer and FFN sub-layer.
* ### Layer norm is applied before feeding the inputs into the sub-layer blocks (pre-norm). The original paper instead applies layer norm to the output of the sub-layers.
* ### Also added dropout for both sub-layers before adding to the residual pathway.

In [None]:
class Block(nn.Module):
    """One decoder block: pre-norm multi-head attention, then a pre-norm FFN.

    Both sub-layers are wrapped in a skip connection.

    Args:
        n_embd: embedding width flowing through the block.
        n_head: number of heads for the multi-head attention.
    """

    def __init__(self, n_embd, n_head):
        super().__init__()
        # The heads are concatenated at the end of multi-head attention, so
        # each individual head works with n_embd // n_head channels.
        per_head_dim = n_embd // n_head
        self.sa_head = MultiHeadAttention(num_heads=n_head, head_size=per_head_dim)
        self.FFN = FeedForwardNetwork(n_embd=n_embd)
        self.layer_norm1 = nn.LayerNorm(n_embd)
        self.layer_norm2 = nn.LayerNorm(n_embd)

    def forward(self, x):
        """Apply attention and feed-forward, each added back onto ``x``."""
        attended = self.sa_head(self.layer_norm1(x))
        x = x + attended  # skip connection around the attention sub-layer
        transformed = self.FFN(self.layer_norm2(x))
        return x + transformed  # skip connection around the FFN sub-layer

In [None]:
class GPTLanguageModel(nn.Module):
    """Decoder-only transformer language model over token ids.

    Token and learned position embeddings are summed, passed through
    ``n_layers`` decoder blocks and a final layer norm, and mapped to
    vocabulary logits by a linear head.

    Shape legend used in the comments:
        B - batch dimension
        T - time dimension (one token = one timestep)
        C - channel dimension (embedding width, or vocab size for logits)
    """

    def __init__(self):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        self.position_embedding_table = nn.Embedding(block_size, n_embd)
        self.blocks = nn.Sequential(*[Block(n_embd, n_head) for _ in range(n_layers)])

        self.layer_norm = nn.LayerNorm(n_embd) # the final layer norm
        self.lm_head = nn.Linear(n_embd, vocab_size)

        self.apply(self._init_weights)

    def _init_weights(self, module):
        """Initialise Linear/Embedding weights from N(0, 0.02) and zero the biases."""
        if isinstance(module, nn.Linear):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)

        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)

    def forward(self, idx, target=None):
        """Compute logits (and optionally the loss) for a batch of token ids.

        Args:
            idx: (B, T) integer tensor of token ids, with T <= block_size.
            target: optional (B, T) tensor of next-token ids.

        Returns:
            ``(logits, loss)``. Without ``target`` the logits have shape
            (B, T, vocab_size) and ``loss`` is None. With ``target`` the
            logits are flattened to (B*T, vocab_size) and ``loss`` is the
            mean cross-entropy.
        """
        B, T = idx.shape

        tok_emb = self.token_embedding_table(idx) # (B, T, n_embd)
        # Build the position indices on the same device as the input, so the
        # model works wherever it has been moved, independent of any global.
        positions = torch.arange(T, device=idx.device)
        pos_emb = self.position_embedding_table(positions) # (T, n_embd)
        x = tok_emb + pos_emb # (B, T, C) + (T, C) --> (B, T, C)
        x = self.blocks(x)
        x = self.layer_norm(x)
        logits = self.lm_head(x) # (B, T, vocab_size)

        if target is None:
            return logits, None

        # cross_entropy expects (N, C) logits and (N,) targets.
        B, T, C = logits.shape
        logits = logits.view(B * T, C)
        loss = F.cross_entropy(logits, target.reshape(B * T))
        return logits, loss

    def generate(self, idx, max_new_tokens):
        """Autoregressively sample ``max_new_tokens`` tokens after ``idx``.

        Args:
            idx: (B, T) integer tensor holding the starting context.
            max_new_tokens: number of tokens to append.

        Returns:
            (B, T + max_new_tokens) tensor of token ids.

        Sampling runs in eval mode (dropout disabled) and without autograd;
        the module's previous train/eval mode is restored afterwards.
        """
        # The position embedding table defines the longest context the model
        # can take, so longer inputs are truncated to that many tokens.
        context_len = self.position_embedding_table.num_embeddings
        was_training = self.training
        self.eval()
        try:
            with torch.no_grad():
                for _ in range(max_new_tokens):
                    idx_trunc = idx[:, -context_len:]
                    logits, _ = self(idx_trunc)        # (B, T, vocab_size)
                    logits = logits[:, -1, :]          # last timestep: (B, vocab_size)
                    probs = F.softmax(logits, dim=-1)  # (B, vocab_size)
                    idx_next = torch.multinomial(probs, num_samples=1)  # (B, 1)
                    idx = torch.cat((idx, idx_next), dim=1)
        finally:
            self.train(was_training)
        return idx

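# we can also predict the initial loss (a uniform guess over the vocabulary): -ln(1/vocab_size)
# e.g. -ln(1/65) = 4.17438727 for a 65-symbol character vocab; for this word-level vocab of 11473 tokens it is -ln(1/11473) ≈ 9.35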

In [None]:
def _endless_batches(dataloader):
    """Yield batches from ``dataloader`` forever, restarting it when exhausted."""
    while True:
        produced = False
        for batch in dataloader:
            produced = True
            yield batch
        if not produced:
            raise ValueError("dataloader yielded no batches")


@torch.no_grad()
def estimate_loss(model, train_dl, test_dl, num_batches=None):
    """Estimate the mean train and test loss over a fixed number of batches.

    Args:
        model: callable as ``model(X, y)`` returning ``(logits, loss)``.
        train_dl: iterable of ``(X, y)`` training batches.
        test_dl: iterable of ``(X, y)`` test batches.
        num_batches: batches to average per split; defaults to the
            module-level ``eval_iters``.

    Returns:
        ``{'train': mean_loss, 'test': mean_loss}`` with 0-dim tensors.

    Raises:
        ValueError: if a dataloader yields no batches at all.

    The model is evaluated in eval mode and its previous train/eval mode is
    restored on exit.
    """
    if num_batches is None:
        num_batches = eval_iters
    # Send batches to wherever the model lives; only fall back to the global
    # ``device`` for a model without parameters.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else device

    was_training = model.training
    model.eval()
    out = {}
    try:
        for split, dataloader in (('train', train_dl), ('test', test_dl)):
            losses = torch.zeros(num_batches)
            # One iterator per split: building a fresh DataLoader iterator for
            # every batch would reshuffle the whole dataset each time.
            batches = _endless_batches(dataloader)
            for k in range(num_batches):
                X, y = next(batches)
                _, loss = model(X.to(target_device), y.to(target_device))
                losses[k] = loss.item()
            out[split] = losses.mean()
    finally:
        model.train(was_training)
    return out

In [None]:
def save_checkpoint(model, optimizer, epoch, train_loss, test_loss, filename):
    """Write a training checkpoint to ``filename`` with ``torch.save``.

    The saved dict holds the epoch number, the train and test losses, and the
    state dicts of ``model`` and ``optimizer``.
    """
    checkpoint = {
        'epoch': epoch,
        'train_loss': train_loss,
        'test_loss': test_loss,
    }
    checkpoint['model_state_dict'] = model.state_dict()
    checkpoint['optimizer_state_dict'] = optimizer.state_dict()
    torch.save(checkpoint, filename)

In [None]:
def train(model, optimizer, epochs, train_dl, test_dl, checkpoint_dir="/kaggle/working"):
    """Train ``model`` for ``epochs`` passes over ``train_dl``.

    Before each epoch the losses are estimated and a checkpoint
    ``checkpoint-<step>.pth.tar`` is written to ``checkpoint_dir``; the same
    is done once more after the last epoch so the final weights are evaluated
    and saved too. After every epoch a 3000-token sample is printed.

    When the module-level ``prev_model`` is true, model and optimizer state
    are restored from ``checkpoint_path`` first, and the first step reuses the
    losses stored in that checkpoint instead of re-evaluating.

    Args:
        model: the language model to train.
        optimizer: optimizer over ``model``'s parameters.
        epochs: number of passes over ``train_dl``.
        train_dl: iterable of ``(inputs, targets)`` training batches.
        test_dl: iterable of ``(inputs, targets)`` test batches.
        checkpoint_dir: directory for checkpoints; created if missing.

    Returns:
        ``{'train': [...], 'test': [...]}`` with one entry per evaluation.
    """
    loss_per_epoch = {'train': [], 'test': []}
    if prev_model:
        # map_location lets a checkpoint written on a GPU machine be resumed
        # on whatever device this run uses (including CPU-only machines).
        checkpoint = torch.load(checkpoint_path, map_location=device)
        model.load_state_dict(checkpoint['model_state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
        epoch_offset = checkpoint['epoch']
    else:
        epoch_offset = 0

    os.makedirs(checkpoint_dir, exist_ok=True)

    def _evaluate_and_save(step):
        # Estimate both losses, record them and checkpoint the current state.
        step_losses = estimate_loss(model, train_dl, test_dl)
        loss_per_epoch['train'].append(step_losses['train'])
        loss_per_epoch['test'].append(step_losses['test'])
        save_checkpoint(
            model=model,
            optimizer=optimizer,
            epoch=step,
            train_loss=step_losses['train'],
            test_loss=step_losses['test'],
            filename=os.path.join(checkpoint_dir, f"checkpoint-{step}.pth.tar"),
        )
        return step_losses

    for epoch in range(epochs):
        step = epoch_offset + epoch
        if prev_model and epoch == 0:
            # The loaded checkpoint already holds the losses for this step.
            losses = {
                'train': checkpoint['train_loss'],
                'test': checkpoint['test_loss']
            }
        else:
            losses = _evaluate_and_save(step)

        print(f"Step {step}: train loss {losses['train']:.4f}, Test loss {losses['test']:.4f}")

        for inputs, targets in tqdm(train_dl):
            _, loss = model(inputs.to(device), targets.to(device))
            optimizer.zero_grad(set_to_none=True)
            loss.backward()
            optimizer.step()

        # Sample with dropout disabled and without building an autograd
        # graph, then switch back to train mode for the next epoch.
        model.eval()
        with torch.no_grad():
            sample = model.generate(torch.zeros((1,1), dtype=torch.long, device=device), max_new_tokens=3000)
        model.train()
        print(decode(sample[0].tolist()))

    if epochs > 0:
        # Without this, the weights produced by the last epoch would never be
        # evaluated or written to disk.
        final_step = epoch_offset + epochs
        losses = _evaluate_and_save(final_step)
        print(f"Step {final_step}: train loss {losses['train']:.4f}, Test loss {losses['test']:.4f}")

    return loss_per_epoch

In [None]:
def main(epochs=5):
    """Build the data loaders, model and optimizer, train, plot and sample.

    Args:
        epochs: number of training epochs (defaults to 5).

    Returns:
        The trained ``GPTLanguageModel``, so callers can keep generating
        from it after training.
    """
    train_dl = prepare_dataloader(x_train, batch_size)
    test_dl = prepare_dataloader(x_test, batch_size)
    model = GPTLanguageModel().to(device)
    optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
    print(f"Total number of parameters: {sum(p.numel() for p in model.parameters())/1e6} M")

    losses = train(model, optimizer, epochs, train_dl, test_dl)
    plot_graph(losses)

    # Sample from the trained model with dropout disabled and autograd off.
    model.eval()
    with torch.no_grad():
        sample = model.generate(torch.zeros((1,1), dtype=torch.long, device=device), max_new_tokens=3000)
    print(decode(sample[0].tolist()))
    return model

In [None]:
def plot_graph(losses):
    """Plot the train and test loss curves against a 1-based epoch index.

    Args:
        losses: dict with ``'train'`` and ``'test'`` sequences of loss values.
    """
    train_curve, test_curve = losses['train'], losses['test']
    x_axis = range(1, len(train_curve) + 1)

    curves = (
        (train_curve, 'b', 'Train Loss'),
        (test_curve, 'r', 'Test Loss'),
    )
    for values, fmt, legend_label in curves:
        plt.plot(x_axis, values, fmt, label=legend_label)

    plt.title('Loss vs Epoch')
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.legend()
    plt.show()

In [None]:
# Run the full pipeline (data loaders, training, plotting, sampling) only when
# this code is executed directly rather than imported as a module.
if __name__ == "__main__":
    main()

In [None]:
# Sample 3000 tokens starting from token id 0 and print the decoded text.
# NOTE(review): in the visible code `model` is only bound inside main(), so at
# notebook scope this line raises NameError unless `model` is defined elsewhere.
print(decode(model.generate(torch.zeros((1,1), dtype=torch.long, device=device), max_new_tokens=3000)[0].tolist()))