# Creating a Bigram Language Model



## Loading dataset and parameters

In [None]:
import pickle
import torch
import numpy as np

# Tokenizer metadata stored in the dataset directory: the vocabulary size and
# the two lookup tables used to translate between characters and token ids.
with open("./dataset/meta.pkl", mode='rb') as f:
    meta = pickle.load(f)

vocab_size = meta["vocab_size"]
stoi = meta["stoi"]  # lookup table used by encode()
itos = meta["itos"]  # lookup table used by decode()

# Device the sampled batches are moved to: 'cpu', 'cuda' or 'mps:0'.
device = 'cpu'

# Settings for batch generation.
batch_size = 4  # how many independent sequences will we process in parallel?
block_size = 8  # what is the maximum context length for predictions?

# Seed PyTorch's RNG so the random sampling below is repeatable.
torch.manual_seed(1337)

def get_batch(split):
    """Sample a random batch of input/target windows from a dataset split.

    Reads the module-level ``batch_size``, ``block_size`` and ``device``.

    Args:
        split: ``'train'`` reads ``./dataset/train.bin``; any other value
            reads ``./dataset/val.bin``.

    Returns:
        A tuple ``(x, y)`` of ``int64`` tensors, each of shape
        ``(batch_size, block_size)`` and moved to ``device``. ``y`` contains
        the same windows as ``x`` shifted by one token, i.e. ``y[b, t]`` is
        the token that follows ``x[b, t]`` in the data.
    """
    # The memmap is re-created on every call rather than kept alive; see
    # https://stackoverflow.com/questions/45132940/numpy-memmap-memory-usage-want-to-iterate-once/61472122#61472122
    path = "./dataset/train.bin" if split == 'train' else "./dataset/val.bin"
    data = np.memmap(path, dtype=np.uint16, mode='r')

    # randint's upper bound is exclusive, so every start offset leaves room
    # for block_size tokens plus the one-token shift of the targets.
    starts = torch.randint(len(data) - block_size, (batch_size,)).tolist()
    x = torch.stack(
        [torch.from_numpy(data[i:i + block_size].astype(np.int64)) for i in starts]
    )
    y = torch.stack(
        [torch.from_numpy(data[i + 1:i + 1 + block_size].astype(np.int64)) for i in starts]
    )

    # Compare the device *type* so that 'cuda:0' or a torch.device object
    # also takes the pinned-memory path, not only the exact string 'cuda'.
    if torch.device(device).type == 'cuda':
        # pin arrays x,y, which allows us to move them to GPU asynchronously (non_blocking=True)
        x = x.pin_memory().to(device, non_blocking=True)
        y = y.pin_memory().to(device, non_blocking=True)
    else:
        x, y = x.to(device), y.to(device)
    return x, y

def encode(s):
    """Translate a string into a list of token ids, one per character."""
    return [stoi[c] for c in s]


def decode(l):
    """Look up the character for each token id and join them into a string."""
    return "".join(itos[i] for i in l)


# Draw one training batch to experiment with in the following cells.
xb, yb = get_batch('train')

In [None]:
# Inspect the input batch xb sampled above
# (a notebook cell displays the value of its last expression).
xb

## BigramLanguageModel
We introduce a few new variables here for the embeddings. PyTorch takes every input id and embeds it internally via a lookup table, which produces a tensor with the following dimensions:

`B` is the batch dimension, i.e. the `batch_size`, which is 4

`T` is time, i.e. the chunk length (`block_size`), which is 8

`C` is the Channel or the embedding dimension or vocab_size 65

Our embedding table is 65x65. See further information in the [PyTorch Documentation for Embedding](https://docs.pytorch.org/docs/stable/generated/torch.nn.Embedding.html)

Logits, however, are the raw values of the computation before we pass them to an activation function or normalize them.

In [None]:
import torch.nn as nn


class BigramLanguageModel(nn.Module):
    """Bigram model whose only parameter is a vocab_size x vocab_size table.

    Looking up a token id returns that token's row of the table, and the row
    is used directly as the logits for the next token.
    """

    def __init__(self, vocab_size):
        super().__init__()
        # Row i holds the next-token logits that are read off for token i.
        self.token_embedding_table = nn.Embedding(
            num_embeddings=vocab_size,
            embedding_dim=vocab_size,
        )

    def forward(self, idx, targets=None):
        """Return the next-token logits for every position of ``idx``.

        ``idx`` is a (B, T) tensor of integer token ids; the result has shape
        (Batch, Time, Channel). ``targets`` is accepted but not used in this
        first version of the model.
        """
        return self.token_embedding_table(idx)

# Put the model on the same device that get_batch() moves the batches to;
# otherwise the forward pass fails as soon as `device` is not 'cpu'.
m = BigramLanguageModel(vocab_size).to(device)
logits = m(xb, yb)
print(logits.shape)  # (Batch, Time, Channel)

### Adding a loss function

We need to add a `loss` computation to the `forward` method of our code. PyTorch's [cross-entropy function](https://docs.pytorch.org/docs/stable/generated/torch.nn.functional.cross_entropy.html) is a good match for this, but it expects the class dimension in the second position, so it cannot take our `(B, T, C)` logits directly. Therefore we need to reshape the tensors before the calculation.

<div style="text-align:center"><img align="center" src="https://sparrow.dev/wp-content/uploads/2021/03/numpy-reshape.jpg" width="300"/></div><br>

Every tensor in PyTorch has a [view method](https://docs.pytorch.org/docs/stable/generated/torch.Tensor.view.html) for reshaping it. We need to find a way to reshape the logits into a two-dimensional `(B*T, C)` tensor and the targets into a one-dimensional `(B*T)` tensor.

In [None]:
from torch.nn import functional as F


class BigramLanguageModel(nn.Module):
    """Bigram language model that also computes the training loss.

    The only parameter is a vocab_size x vocab_size embedding table; the row
    looked up for a token is used directly as the logits for the next token.
    """

    def __init__(self, vocab_size):
        super().__init__()
        # each token directly reads off the logits for the next token from a lookup table
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, idx, targets=None):
        """Compute next-token logits and, if targets are given, the loss.

        Args:
            idx: (B, T) tensor of integer token ids.
            targets: (B, T) tensor of the expected next token ids, or ``None``
                to skip the loss computation.

        Returns:
            A tuple ``(logits, loss)``. With targets, ``logits`` is flattened
            to (B*T, C) and ``loss`` is the scalar cross-entropy. Without
            targets, ``logits`` keeps its (B, T, C) shape and ``loss`` is
            ``None``.
        """
        logits = self.token_embedding_table(idx)  # (Batch, Time, Channel)

        # `targets` defaults to None, so there is nothing to compare against.
        if targets is None:
            return logits, None

        B, T, C = logits.shape
        # F.cross_entropy expects the class dimension in second position, so
        # merge batch and time into one dimension: every position becomes one
        # independent classification example.
        logits = logits.view(B * T, C)
        targets = targets.view(B * T)
        loss = F.cross_entropy(logits, targets)
        return logits, loss


# Put the model on the same device that get_batch() moves the batches to;
# otherwise the forward pass fails as soon as `device` is not 'cpu'.
m = BigramLanguageModel(vocab_size).to(device)
logits, loss = m(xb, yb)
print(logits.shape)  # flattened to (B*T, C) because targets were passed
print(loss)


If implemented correctly, `logits.shape` should be `32x65` and the loss should be somewhere between 4 and 5. For comparison, $-\ln(1/65) \approx 4.17$ is the loss we would get if the model assigned the same probability to each of the 65 characters in our dataset.

We can now evaluate the quality of our model, but we want to generate from the model, so we need to implement a generate function. 

In [None]:
class BigramLanguageModel(nn.Module):
    """Bigram language model with loss computation and sampling.

    The only parameter is a vocab_size x vocab_size embedding table; the row
    looked up for a token is used directly as the logits for the next token.
    """

    def __init__(self, vocab_size):
        super().__init__()
        # each token directly reads off the logits for the next token from a lookup table
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, idx, targets=None):
        """Compute next-token logits and, if targets are given, the loss.

        Args:
            idx: (B, T) tensor of integer token ids.
            targets: (B, T) tensor of the expected next token ids, or ``None``
                to skip the loss computation.

        Returns:
            A tuple ``(logits, loss)``. With targets, ``logits`` is flattened
            to (B*T, C) and ``loss`` is the scalar cross-entropy. Without
            targets, ``logits`` keeps its (B, T, C) shape and ``loss`` is
            ``None``.
        """
        logits = self.token_embedding_table(idx)  # (Batch, Time, Channel)

        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            # F.cross_entropy expects the class dimension in second position,
            # so merge batch and time into a single dimension.
            logits = logits.view(B * T, C)
            targets = targets.view(B * T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    def generate(self, idx, max_new_tokens):
        """Extend each sequence in ``idx`` by sampling ``max_new_tokens`` tokens.

        Args:
            idx: (B, T) tensor of token ids forming the current context.
            max_new_tokens: number of tokens to sample and append.

        Returns:
            A (B, T + max_new_tokens) tensor: the given context followed by
            the sampled tokens. With ``max_new_tokens == 0`` the input is
            returned unchanged.
        """
        for _ in range(max_new_tokens):
            # get the predictions; no targets, so logits stay (B, T, C)
            logits, _ = self(idx)
            # focus only on the last time step
            logits = logits[:, -1, :]  # becomes (B, C)
            # normalize over the last (vocabulary) dimension so that every
            # row becomes a probability distribution over the next token
            probs = F.softmax(logits, dim=-1)  # (B, C)
            # sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1)  # (B, 1)
            # append sampled index to the running sequence
            idx = torch.cat((idx, idx_next), dim=1)  # (B, T+1)
        return idx


# Put the model on the same device that get_batch() moves the batches to;
# otherwise the forward pass fails as soon as `device` is not 'cpu'.
m = BigramLanguageModel(vocab_size).to(device)
logits, loss = m(xb, yb)


# generate data from the (still untrained) model
# 1x1 start context holding token 0 aka "\n", created on the model's device
idx = torch.zeros((1, 1), dtype=torch.long, device=device)
result = m.generate(idx, max_new_tokens=100)[0].tolist()
print(decode(result))

## Training of BigramLanguageModel


In [None]:
# AdamW optimizer that updates every parameter of the model `m`
optimizer = torch.optim.AdamW(params=m.parameters(), lr=1e-3)

In [None]:
# Train with a larger batch; get_batch() reads this module-level value.
batch_size = 32
for steps in range(1000):  # increase number of steps for good results...
    # drop the gradients of the previous iteration
    optimizer.zero_grad(set_to_none=True)

    # sample a batch of data and evaluate the loss on it
    xb, yb = get_batch('train')
    logits, loss = m(xb, yb)

    # backpropagate and apply one parameter update
    loss.backward()
    optimizer.step()

    # report the loss of the current batch on every 100th step
    if not steps % 100:
        print(loss.item())


In [None]:
# generate data from the trained model
# 1x1 start context holding token 0 aka "\n" (long), created on whatever
# device the model's parameters live on
idx = torch.zeros((1, 1), dtype=torch.long, device=next(m.parameters()).device)
result = m.generate(idx, max_new_tokens=500)[0].tolist()
print(decode(result))