In [1]:
import math
import torch 
import torch.nn as nn
import torch.nn.functional as F

In [2]:
# Toy training corpus: five short sentences, each ending in " \n",
# glued together into one string.
text = "".join([
    "hello world. \n",
    "hello transformer. \n",
    "attention is all you need. \n",
    "hello attention. \n",
    "transformers generate text. \n",
])

In [None]:
# Character-level vocabulary: every distinct character of the corpus, sorted.
chars = sorted(set(text))
# char -> id lookup. Stands in for a real tokenizer for now; subword
# tokenization is planned for later.
enum_chars = dict(zip(chars, range(len(chars))))
# id -> char lookup (inverse of enum_chars).
item_chars = dict(enumerate(chars))
# Vocabulary size = number of distinct characters in the corpus.
vocab_size = len(chars)

In [16]:
def encode(s):
    """Convert a string into a 1-D ``torch.long`` tensor of character ids.

    Each character of ``s`` is looked up in the module-level ``enum_chars``
    mapping; a character missing from that mapping raises ``KeyError``.
    """
    token_ids = []
    for ch in s:
        token_ids.append(enum_chars[ch])
    return torch.tensor(token_ids, dtype=torch.long)

In [17]:
def decode(ids):
    """Convert a sequence of character ids back into a string.

    ``ids`` may be any iterable of ids: plain Python ints or a 1-D tensor
    (for example the output of ``encode``). Each id is looked up in the
    module-level ``item_chars`` mapping; an unknown id raises ``KeyError``.
    """
    # Iterating a tensor yields 0-d tensors, which hash by identity and so
    # never match the integer keys of item_chars. Casting with int() makes
    # tensor input work while leaving plain ints unchanged.
    return "".join(item_chars[int(i)] for i in ids)

In [18]:
# Encode the whole corpus into a tensor of character ids (one id per character).
input_data = encode(text)

In [None]:
class TinyGPT(nn.Module):
    """Small GPT-style (causally masked) transformer language model.

    Token ids are embedded, summed with learned positional embeddings, passed
    through a stack of ``nn.TransformerEncoderLayer`` blocks under a causal
    attention mask, and projected to per-position vocabulary logits.

    Args:
        vocab_size: Number of distinct token ids.
        d_model: Embedding / hidden width.
        nhead: Number of attention heads per layer.
        num_layers: Number of stacked transformer layers.
        dim_feedforward: Hidden width of each layer's feed-forward sub-block.
        block_size: Maximum context length (number of learned positions).
        dropout: Dropout probability used inside the transformer layers.
    """

    def __init__(self, vocab_size, d_model = 64, nhead = 8, num_layers = 2, dim_feedforward = 128, block_size = 128, dropout = 0.1):
        super().__init__()
        self.block_size = block_size
        self.token_embedding = nn.Embedding(vocab_size, d_model)
        self.positional_embedding = nn.Embedding(block_size, d_model)

        encoder_layer = nn.TransformerEncoderLayer(
            d_model=d_model,
            nhead=nhead,
            dim_feedforward=dim_feedforward,
            dropout=dropout,
            activation='gelu',
            batch_first=True,
        )
        self.tr = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)
        self.lm_head = nn.Linear(d_model, vocab_size)

    def forward(self, idx):
        """Compute next-token logits for a batch of token id sequences.

        Args:
            idx: Integer tensor of shape ``(B, T)`` holding token ids.

        Returns:
            Float tensor of shape ``(B, T', vocab_size)`` where
            ``T' = min(T, block_size)``. Inputs longer than ``block_size``
            are cropped to their last ``block_size`` tokens.
        """
        B, T = idx.shape
        if T > self.block_size:
            # Keep only the most recent tokens: there are just block_size
            # learned positions.
            idx = idx[:, -self.block_size:]
            T = self.block_size

        position_ids = torch.arange(T, device=idx.device).unsqueeze(0).expand(B, T)
        x = self.token_embedding(idx) + self.positional_embedding(position_ids)

        # Additive causal mask: -inf strictly above the diagonal so position t
        # can only attend to positions <= t. Without it the encoder stack
        # would see future tokens.
        causal_mask = torch.triu(
            torch.full((T, T), float("-inf"), dtype=x.dtype, device=x.device),
            diagonal=1,
        )
        x = self.tr(x, mask=causal_mask)
        return self.lm_head(x)
        


In [14]:
# Display the vocabulary size (number of distinct characters in the corpus).
vocab_size

21