In [1]:
import torch
from torch import nn

In [22]:
# hyper parameters

batch_size = 32 # how many independent sequennces will be processed in parallel
n_emb = 32
inf = 1e9
block_size = 8 # max context length to make predictions
vocab_size = 34 # number unique characters in the training text

max_iters = 5
learning_rate = 1e-3
device = 'cuda' if torch.cuda.is_available() else 'cpu'
eval_items = 200


torch.manual_seed(1337)


class BiagramLanguageModel(nn.Module):
    def __init__():
        super().__init__()
        # each toekn will directly take logits for the next token from lookup table
        self.token_embedding_table = nn.Enbedding(vocab_size , n_emb)  
        self.pos_embedding_table = nn.Embedding(block_size , n_emb)
        self.sa_head = Head(n_emb)
        self.lm_head = nn.Linear(n_emb , vocab_size)

    def forward(self , idx , target = None):
        B , T = idx.shape
        """idx and targets are both tensors of (B,T) integers"""
        token_emb = self.token_embedding_table(idx)  # B , T , n_emb
        pos_emb = self.pos_embedding_table(torch.arange(T , device = device)) # T , n_emb
        x = token_emb + pos_emb # (B , T , n_emb)
        x = self.sa_head(x) # applies one head of self attention
        logits = lm_head(x) # (B , T , vocab_size)

        if target is None:
            loss = None
        else:
            B , T , C = logits.shape
            logits = logits.view(B*T , C)
            targets = targets.view(B*T)
            loss = torch.nn.functional.cross_entropy(logits , targets)

        return logits , loss

    def generate(self, idx , max_new_tokens):
        """idx in (B,T) array of indices in current context"""
        for _ in range(max_new_tokens):
            # crop to get last block_size tokens
            idx_cond = idx[:,-block_size:] # (B , T )
            # get the predictions
            logits , loss = self(idx_cond) # (B*T , vocab_size)
            # focus only on the last time step
            logits = logits[:,-1,:] # (B , vocab_size)
            # apply softmax to get probabilities
            probs = nn.functional.softmax(logits , dim=-1) # (B , vocab_size)
            # sample to get the next predicted token
            idx_next = torch.multinomial(probs , num_samples = 1) # (B , 1)
            # add the token to idx so that next token can also be generated
            idx = torch.cat((idx , idx_next) , dim = 1) # (B , T + 1)
        return idx
            

class Head(nn.Module):
    """one head of self attention"""

    def __init__(self , head_size):
        super.__init__()
        self.key = nn.Linear(n_emb , head_size , bias  = False)
        self.query = nn.Linear(n_emb , head_size , bias  = False)
        self.value = nn.Linear(n_emb , head_size , bias  = False)
        self.register_buffer('tril' , torch.tril(torch.ones((block_size , block_size))))

    def forward(self , x):
        B , T , C = x.shape
        k = self.key(x)      # (B , T , head_size)
        q = self.query(x)    # (B , T , head_size)
        """compute attention weights (affinities) """
        wei = q @ k.transpose(-2,-1) ** -0.5   # (B , T , T)
        wei = torch.masked_fill(self.tril[:T,:T]==0 , -float(inf))  # (B , T , T)
        wei = torch.nn.functional.softmax(wei , dim = -1)  # (B , T , T)
        v = self.value(x)    # (B , T , head_size)
        out = wei @ v # (B , T , T) @ (B , T , head_size) = (B , T , head_size)
        return out

Training

In [38]:
with open('text.txt', 'r', encoding='utf-8') as f:
    text = f.read()

# here are all the unique characters that occur in this text
chars = sorted(list(set(text)))
vocab_size = len(chars)
# create a mapping from characters to integers
stoi = { ch:i for i,ch in enumerate(chars) }
itos = { i:ch for i,ch in enumerate(chars) }
encode = lambda s: [stoi[c] for c in s] # encoder: take a string, output a list of integers
decode = lambda l: ''.join([itos[i] for i in l]) # decoder: take a list of integers, output a string


n = int(0.9*len(text))

data_train = text[:n]
data_val = text[n:]


def get_batch(split):
    data = data_train if split == "train" else data_val
    idxs = torch.randint(len(data)-block_size, (batch_size,))
    x = torch.stack([encode(data[i:i+block_size]) for i in idxs])
    y = torch.stack([encode(data[i+1:i+block_size+1]) for i in idxs])
    x , y = x.to(device) , y.to(device)
    return x , y
get_batch("train")

TypeError: expected Tensor as element 0 in argument 0, but got list