In [1]:
pip install torchtext einops

Collecting einops
  Downloading einops-0.6.1-py3-none-any.whl (42 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.2/42.2 kB[0m [31m1.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: einops
Successfully installed einops-0.6.1


### GPT Paper replication
* Notes for myself:

    They used (msa->add->norm) -> (ffc -> add-> norm)

    Pos embeddings are learnable
    
    Didnt find information about teacher forcing but ok

In [2]:
import os
import random
import math

import torch
from torch import nn, optim
import torch.nn.functional as F

from einops import rearrange, reduce, repeat
from torch import einsum

import numpy as np


In [3]:
model_config = dict(
    num_layers=12,
    num_heads=12,
    hidden_dim=768,
    ffc_hidden_dim=3072,
)
ffc_activation = 'GELU'
max_lr = 2.5e-4

### Let's create the model from the scratch

In [4]:
def attn_function(q, k, v, mask=None, attn_dropout=None):

    #q, k, v shape is [b, s, h]
    b_size = q.shape[0]
    seq_len = q.shape[1]
    hidden_dim = q.shape[2]


    scaled_dot_product = einsum('bsh, bvh -> bsv', [q, k])/math.sqrt(hidden_dim)

    if mask:
        scaled_dot_product = scaled_dot_product.masked_fill(mask==False, 1e-9)

    if attn_dropout:
        scaled_dot_product = attn_dropout(scaled_dot_product)

    attn_probs = F.softmax(scaled_dot_product, dim=-1)
    attn_output = einsum('bsv, bvd -> bsd', [attn_probs, v])

    return attn_output, attn_probs

In [5]:
class MSALayer(nn.Module):
    def __init__(self,
                 num_heads,
                 hidden_dim,
                 attn_dropout_p=0.1
                 ):

        assert hidden_dim % num_heads == 0

        super().__init__()
        self.num_heads = num_heads
        self.hidden_dim = hidden_dim
        self.head_dim = self.hidden_dim // self.num_heads
        self.attn_dropout_p = attn_dropout_p

        self.toq = nn.Linear(self.hidden_dim, self.hidden_dim)
        self.tok = nn.Linear(self.hidden_dim, self.hidden_dim)
        self.tov = nn.Linear(self.hidden_dim, self.hidden_dim)
        self.ffc = nn.Linear(self.hidden_dim, self.hidden_dim)

        self.layer_norm = nn.LayerNorm(normalized_shape=self.hidden_dim)
        self.attn_dropout = nn.Dropout(p=self.attn_dropout_p if self.attn_dropout_p else 0)

    def forward(self,
                x,
                mask=None):
        # shape of input is [b_size, seq_len, hidden_dim]
        q = self.toq(x)
        k = self.tok(x)
        v = self.tov(x)

        q = rearrange(q, 'b s (num_heads h) -> (b num_heads) s h', num_heads=self.num_heads)
        k = rearrange(k, 'b s (num_heads h) -> (b num_heads) s h', num_heads=self.num_heads)
        v = rearrange(v, 'b s (num_heads h) -> (b num_heads) s h', num_heads=self.num_heads)

        output, probs = attn_function(q, k, v, mask=mask, attn_dropout=self.attn_dropout)

        output = rearrange(output, '(b num_heads) s h -> b s (num_heads h)', num_heads=self.num_heads)
        output = self.ffc(output)

        output = self.layer_norm(output + x)
        return output, probs


In [11]:
class DecoderLayer(nn.Module):
    def __init__(self,
                 num_heads,
                 hidden_dim,
                 ffc_hidden_dim,
                 attn_dropout_p=0.1,
                 ffc_dropout_p=0.1,
                 ):
        super().__init__()
        self.num_heads = num_heads
        self.hidden_dim = hidden_dim
        self.ffc_hidden_dim = ffc_hidden_dim
        self.attn_dropout_p = attn_dropout_p
        self.ffc_dropout_p = ffc_dropout_p

        self.ffc_layer = nn.Sequential(
            nn.Linear(self.hidden_dim, self.ffc_hidden_dim),
            nn.GELU(),
            nn.Dropout(p=self.ffc_dropout_p),
            nn.Linear(self.ffc_hidden_dim, self.hidden_dim)
        )
        self.ffc_layer_norm = nn.LayerNorm(normalized_shape=self.hidden_dim)

        self.msalayer = MSALayer(self.num_heads,
                                 self.hidden_dim,
                                 self.attn_dropout_p,)

    def forward(self,
                x,
                mask=None):
        res = x
        x, _ = self.msalayer(x, mask=mask)

        return self.ffc_layer_norm(self.ffc_layer(x) + res)

In [25]:
class GPT(nn.Module):
    def __init__(self,
                 vocab_size,
                 num_layers,
                 num_heads,
                 hidden_dim,
                 ffc_hidden_dim,
                 attn_dropout_p=0.1,
                 ffc_dropout_p=0.1,
                 max_seq_len=512,
                 ):
        super().__init__()
        self.vocab_size = vocab_size
        self.num_layers = num_layers
        self.num_heads = num_heads
        self.hidden_dim = hidden_dim
        self.ffc_hidden_dim = ffc_hidden_dim
        self.attn_dropout_p = attn_dropout_p
        self.ffc_dropout_p = ffc_dropout_p
        self.max_seq_len = max_seq_len

        self.decoder_block = nn.ModuleList([DecoderLayer(self.num_heads,
                                                         self.hidden_dim,
                                                         self.ffc_hidden_dim,
                                                         self.attn_dropout_p,
                                                         self.ffc_dropout_p) for _ in range(self.num_layers)])

        self.pos_embeddings = nn.Embedding(self.max_seq_len, self.hidden_dim)
        self.token_embeddings = nn.Embedding(self.vocab_size, self.hidden_dim)

        self.proj_layer = nn.Linear(self.hidden_dim, self.vocab_size)

        self.register_buffer('tril',
                             torch.tril(torch.ones(self.max_seq_len, self.max_seq_len)).bool())
        self.register_buffer('pos_ids',
                             torch.arange(self.max_seq_len))



    def forward(self,
                input_tokens,
                tokenizer_mask=None):
        seq_len = input_tokens.shape[-1]
        b_size = input_tokens.shape[0]

        mask = self.make_attn_mask(seq_len, b_size, tokenizer_mask)

        x = self.pos_embeddings(self.pos_ids[:seq_len]) + self.token_embeddings(input_tokens)

        for layer in self.decoder_block:
            x = layer(x)
        x = self.proj_layer(x)
        return x


    def make_attn_mask(self, seq_len, b_size, tokenizer_mask=None):
        mask = self.tril[:seq_len, :seq_len].unsqueeze(0).repeat(b_size, 1, 1)

        if tokenizer_mask is not None:
            mask = mask & tokenizer_mask.bool().unsqueeze(1)
        return mask

In [26]:
toy_model = GPT(vocab_size=10,
                num_layers=2,
                num_heads=4,
                hidden_dim=100,
                ffc_hidden_dim=400,
                attn_dropout_p=0.1,
                ffc_dropout_p=0.1,
                max_seq_len=512,)

!pip install torchinfo



### Notes on training
* During training we drop last logit, and drop first index of gt_labels

### Dataset prep

In [8]:
%%writefile get_data.sh
mkdir data
cd data
wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt

Overwriting get_data.sh


In [9]:
!sh get_data.sh

mkdir: cannot create directory ‘data’: File exists
--2023-07-20 14:06:41--  https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1115394 (1.1M) [text/plain]
Saving to: ‘input.txt.2’


2023-07-20 14:06:41 (81.1 MB/s) - ‘input.txt.2’ saved [1115394/1115394]



In [10]:
!pip install tokenizers



In [11]:
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.processors import TemplateProcessing

tokenizer = Tokenizer(BPE(unk_token="<unk>"))
trainer = BpeTrainer(special_tokens=["<unk>",
                                     "<s>",
                                     "<pad>",
                                     "<bos>",
                                     ], vocab_size=10000) #i took 10k just randomly
tokenizer.pre_tokenizer = Whitespace()

files = ["data/input.txt"]
tokenizer.train(files, trainer)

In [12]:
tokenizer.post_processor = TemplateProcessing(
    single="<bos> $A <s>",
    special_tokens=[
        ("<s>", tokenizer.token_to_id("<s>")),
        ("<bos>", tokenizer.token_to_id("<bos>")),
        ("<pad>", tokenizer.token_to_id("<pad>"))
    ],
)
tokenizer.enable_padding(pad_id=2, pad_token="<pad>")
tokenizer.save("data/tokenizer.json")

In [13]:
tokenizer.encode_batch(['Hello I am john Cena', 'Are you?'])[0].tokens

['<bos>', 'He', 'llo', 'I', 'am', 'jo', 'hn', 'C', 'en', 'a', '<s>']

### DatasetPreparation

In [14]:
from torch.utils.data import Dataset, DataLoader

with open('data/input.txt', 'r') as f:
    input_text = f.readlines()
input_text = [i for i in input_text if i!='\n']

['First Citizen:\n',
 'Before we proceed any further, hear me speak.\n',
 '\n',
 'All:\n',
 'Speak, speak.\n']

In [15]:
input_text = [i for i in input_text if i!='\n']
print(len(input_text))
input_text[:5]

32777


['First Citizen:\n',
 'Before we proceed any further, hear me speak.\n',
 'All:\n',
 'Speak, speak.\n',
 'First Citizen:\n']

In [16]:
from typing import List
class ConstantLenghtDataset(Dataset):
    def __init__(self,
                 texts: List[str],
                 tokenizer: Tokenizer,
                 length: int=512,):
        self.texts = texts
        self.length = length
        self.tokenizer = tokenizer
        self.tokenizer.no_padding()

        encoded_text = tokenizer.encode_batch(self.texts)
        tokens_num = [len(s.tokens) for s in encoded_text]
        constant_len_dataset_ids = []
        concat_sentences_ids = []
        sum=0

        for idx, num in enumerate(tokens_num):
            if sum > 512:
                constant_len_dataset_ids.append(concat_sentences_ids)
                concat_sentences_ids = []
                sum = 0

            concat_sentences_ids.append(idx)
            sum+=num

        np_text = np.array(self.texts)
        new_dataset = []
        for idxs in constant_len_dataset_ids:
            new_dataset.append(' '.join(np_text[idxs].tolist()))

        self.dataset = new_dataset

    def __len__(self,):
        return len(self.dataset)

    def __getitem__(self, idx):
        return self.dataset[idx]

In [34]:
class TokenizerWrapper():
    def __init__(self,
                 tokenizer,
                 pad_seq_len=512):
        self.tokenizer = tokenizer
        self.tokenizer.enable_padding(pad_id=2, pad_token="<pad>", length=pad_seq_len)
        self.vocab_size = self.tokenizer.get_vocab_size()

    def __call__(self, input_sentences: List[str], batch=True):
        output = {}
        if batch:
            encoded_input = self.tokenizer.encode_batch(input_sentences)
            ids = torch.tensor([input.ids for input in encoded_input], requires_grad=False)
            attn_masks = torch.tensor([input.attention_mask for input in encoded_input], requires_grad=False)
        else:
            encoded_input = self.tokenizer.encode(input_sentences)
            ids = torch.tensor(encoded_input.ids, requires_grad=False).unsqueeze(0)
            attn_masks = torch.tensor(encoded_input.attention_mask, requires_grad=False).unsqueeze(0)

        output['input_ids'] = ids
        output['attn_mask'] = attn_masks

        return output

In [18]:
train_size = 0.9
train_ids = int(len(input_text) * train_size)
train_data = input_text[: train_ids]
test_data = input_text[train_ids:]

In [19]:
train_dataset = ConstantLenghtDataset(train_data, tokenizer, length=512)
test_dataset = ConstantLenghtDataset(test_data, tokenizer, length=512)

### Notes:
* Write trainer, which will initialize model, setup tokenizer (pad_len and stuff)
* In training remember to take logits up to -1, and preds from 1

### Simple training, just to check how model trains/performs

In [49]:
tokenizerwrapped = TokenizerWrapper(tokenizer, 512)

In [47]:


train_dataloader = DataLoader(train_dataset, batch_size=8, shuffle=True, drop_last=False)
test_dataloader = DataLoader(test_dataset, batch_size=8, shuffle=False, drop_last=False)


model_config = dict(
    num_layers=12,
    num_heads=12,
    hidden_dim=768,
    ffc_hidden_dim=3072,
)
ffc_activation = 'GELU'
model = GPT(**model_config, vocab_size=tokenizerwrapped.vocab_size)
model = model.cuda()
print(torch.cuda.memory_cached()/1e9)

5.895094272




In [50]:
from tqdm.auto import tqdm
loss_fn = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=2e-4)

num_epochs = 10

for epoch in tqdm(range(num_epochs)):
    model.train()
    training_loss = 0
    for batch_num, batch in enumerate(train_dataloader):
        optimizer.zero_grad()

        inputs = tokenizerwrapped(batch)
        labels = inputs['input_ids'].cuda()
        logits = model(inputs['input_ids'].cuda(), inputs['attn_mask'].cuda())

        shift_logits = logits[..., :-1, :].contiguous()
        shift_labels = labels[..., 1:].contiguous()

        loss = loss_fn(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
        loss.backward()
        optimizer.step()

        training_loss += loss.item()

    training_loss /= batch_num
    print(f"Epoch: {epoch}, Training loss: {training_loss}")

  0%|          | 0/10 [00:00<?, ?it/s]

Epoch: 0, Training loss: 9.273114935992515
Epoch: 1, Training loss: 8.682613830043845
Epoch: 2, Training loss: 8.15801719116838
Epoch: 3, Training loss: 7.858718414829202
Epoch: 4, Training loss: 7.749296305930778
Epoch: 5, Training loss: 7.693889892264588
Epoch: 6, Training loss: 7.6517268141655075
Epoch: 7, Training loss: 7.613897055795748
Epoch: 8, Training loss: 7.57659795186291
Epoch: 9, Training loss: 7.538805537027855


In [51]:
model(batch['input_ids'].cuda(), attn_mask.cuda())

TypeError: ignored

In [68]:
prefix = "<bos>"  # same as above
tokenizerwrapped = TokenizerWrapper(tokenizer, 0)
batch = tokenizerwrapped(prefix, batch=False)
past_key_values = None

num_generations = 2000
with torch.cuda.amp.autocast():
  for i in range(num_generations):

    attn_mask = batch['attn_mask']
    outputs = model(batch['input_ids'].cuda(), attn_mask.cuda())
    probs = outputs[0, -1].div(0.8).softmax(-1)
    token = torch.multinomial(probs, 1).view([])

    print(tokenizerwrapped.tokenizer.decode([token]), end=' ', flush=True)
    batch = dict(input_ids=outputs[0, -1].argmax(-1).reshape(1, 1),
                 attn_mask=torch.ones(num_generations, requires_grad=False).cuda()[:i+1])

quarters                                                                                                                                     rain                                                                                                                          ival                                                                                                                                                                                                                                                                                                                                                                                               Mars                                                                                                                                                                                                                                                                                                                                                      

In [39]:
prefix = "<bos>"  # same as above
tokenizerwrapped = TokenizerWrapper(tokenizer, 0)
batch = tokenizerwrapped(prefix, batch=False)

In [40]:
batch

{'input_ids': tensor([[3, 3, 1]]), 'attn_mask': tensor([[1, 1, 1]])}