<!-- ![image info](./Full_GPT_architecture.png) -->
<img src="./Full_GPT_architecture.png" alt="GPT architecture" style="width:400px;height:400px;"/>

In [None]:
import math

import torch
import torch.nn as nn
from torch.nn import functional as F

import time
from collections import defaultdict
from torch.utils.data.dataloader import DataLoader
from torch.utils.data import Dataset
import json
import numpy as np

In [None]:
torch.cuda.is_available()

True

In [None]:
import gc
torch.cuda.empty_cache()
gc.collect()

0

In [None]:
config = {
    "embed_dim" : 16,
    "num_heads" : 4,
    "block_size" : 11,
    "attn_pdrop" : 0.1,
    "resid_pdrop" : 0.1,
    "vocab_size" : 7,
    "embd_pdrop" : 0.1,
    "n_layer" : 6,
    "device" : 'cuda',
    "weight_decay" : 0.1,
    "learning_rate" : 3e-4,
    "betas" : (0.9, 0.95),
    "grad_norm_clip" : 1.0,
    "batch_size" : 4
}

In [None]:
class MultiHeadSelfAttention(nn.Module):
    def __init__(self, config):
        super().__init__()
        # key, query, value projections for all heads, but in a batch
        self.q_attn = nn.Linear(config["embed_dim"], config["embed_dim"])
        self.k_attn = nn.Linear(config["embed_dim"], config["embed_dim"])
        self.v_attn = nn.Linear(config["embed_dim"], config["embed_dim"])

        # output projection
        self.c_proj = nn.Linear(config["embed_dim"], config["embed_dim"])
        # regularization
        self.resid_dropout = nn.Dropout(config["resid_pdrop"])

        #multi head attention
        self.multihead_attn = nn.MultiheadAttention(config["embed_dim"], config["num_heads"], batch_first=True, dropout=config["attn_pdrop"])

        # causal mask to ensure that attention is only applied to the left in the input sequence
        self.register_buffer("attn_mask", torch.zeros((config["block_size"], config["block_size"]), dtype=torch.bool) \
                                    .masked_fill(torch.tril(torch.ones(config["block_size"], config["block_size"])) \
                                    .view(config["block_size"], config["block_size"]) \
                                    [:config["block_size"],:config["block_size"]] == 0, True))

    def forward(self, x):
        # calculate query, key, values for batch
        q, k ,v  = self.q_attn(x), self.k_attn(x), self.v_attn(x)

        # calculate self attention
        attn_output = self.multihead_attn(query=q, key=k, value=v, attn_mask = self.attn_mask[:x.shape[1], :x.shape[1]])[0]

        # output projection
        y = self.resid_dropout(self.c_proj(attn_output))
        return y

In [None]:
class Block(nn.Module):
    """Transformer block """

    def __init__(self, config):
        super().__init__()
        self.ln_1 = nn.LayerNorm(config["embed_dim"])
        self.attn = MultiHeadSelfAttention(config)
        self.ln_2 = nn.LayerNorm(config["embed_dim"])
        self.mlp = nn.ModuleDict(dict(
            c_fc    = nn.Linear(config["embed_dim"], 4 * config["embed_dim"]),
            c_proj  = nn.Linear(4 * config["embed_dim"], config["embed_dim"]),
            act     = nn.GELU(),
            dropout = nn.Dropout(config["resid_pdrop"]),
        ))
        m = self.mlp
        self.mlpf = lambda x: m.dropout(m.c_proj(m.act(m.c_fc(x)))) # MLP forward

    def forward(self, x):
        x = x + self.attn(self.ln_1(x))
        x = x + self.mlpf(self.ln_2(x))
        return x

In [None]:
class GPT(nn.Module):
    """ GPT Language Model """

    def __init__(self, config):
        super().__init__()
        self.block_size = config["block_size"]

        self.transformer = nn.ModuleDict(dict(
            wte = nn.Embedding(config["vocab_size"], config["embed_dim"]),
            wpe = nn.Embedding(config["block_size"], config["embed_dim"]),
            drop = nn.Dropout(config["embd_pdrop"]),
            h = nn.ModuleList([Block(config) for _ in range(config["n_layer"])]),
            ln_f = nn.LayerNorm(config["embed_dim"]),
        ))
        self.lm_head = nn.Linear(config["embed_dim"], config["vocab_size"], bias=False)

        # init all weights, and apply a special scaled init to the residual projections, per GPT-2 paper
        self.apply(self._init_weights)
        for pn, p in self.named_parameters():
            if pn.endswith('c_proj.weight'):
                torch.nn.init.normal_(p, mean=0.0, std=0.02/math.sqrt(2 * config["n_layer"]))

        n_params = sum(p.numel() for p in self.transformer.parameters())

        param = self.lm_head.weight.shape
        print("number of parameters: %.2fM" % ((n_params+param[0]*param[1])/1e6))

    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
        elif isinstance(module, nn.LayerNorm):
            torch.nn.init.zeros_(module.bias)
            torch.nn.init.ones_(module.weight)

    def forward(self, idx, targets=None):
        device = idx.device
        b, t = idx.size()

        pos = torch.arange(0, t, dtype=torch.long, device=device).unsqueeze(0) # shape (1, t)

        # forward the GPT model itself
        tok_emb = self.transformer.wte(idx) # token embeddings of shape (b, t, n_embd)
        pos_emb = self.transformer.wpe(pos) # position embeddings of shape (1, t, n_embd)
        x = self.transformer.drop(tok_emb + pos_emb)

        for block in self.transformer.h:
            x = block(x)
        x = self.transformer.ln_f(x)
        logits = self.lm_head(x)

        # if we are given some desired targets also calculate the loss
        loss = None
        if targets is not None:
            loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1), ignore_index=-1)

        return logits, loss

    @torch.no_grad()
    def generate(self, idx, max_new_tokens, temperature=1.0, do_sample=False, top_k=None):
        for _ in range(max_new_tokens):
            # if the sequence context is growing too long we must crop it at block_size
            idx_cond = idx if idx.size(1) <= self.block_size else idx[:, -self.block_size:]
            # forward the model to get the logits for the index in the sequence
            logits, _ = self(idx_cond)
            # pluck the logits at the final step and scale by desired temperature
            logits = logits[:, -1, :] / temperature
            # optionally crop the logits to only the top k options
            if top_k is not None:
                v, _ = torch.topk(logits, top_k)
                logits[logits < v[:, [-1]]] = -float('Inf')
            # apply softmax to convert logits to (normalized) probabilities
            probs = F.softmax(logits, dim=-1)
            # either sample from the distribution or take the most likely element
            if do_sample:
                idx_next = torch.multinomial(probs, num_samples=1)
            else:
                _, idx_next = torch.topk(probs, k=1, dim=-1)
            # append sampled index to the running sequence and continue
            idx = torch.cat((idx, idx_next), dim=1)

        return idx

    def configure_optimizers(self, config):
        # separate out all parameters to those that will and won't experience regularizing weight decay
        decay = set()
        no_decay = set()
        whitelist_weight_modules = (torch.nn.Linear, )
        blacklist_weight_modules = (torch.nn.LayerNorm, torch.nn.Embedding)
        for mn, m in self.named_modules():
            for pn, p in m.named_parameters():
                fpn = '%s.%s' % (mn, pn) if mn else pn # full param name
                # random note: because named_modules and named_parameters are recursive
                # we will see the same tensors p many many times. but doing it this way
                # allows us to know which parent module any tensor p belongs to...
                if pn.endswith('bias'):
                    # all biases will not be decayed
                    no_decay.add(fpn)
                # elif pn.endswith('multihead_attn.in_proj_weight'):
                #     decay.add(fpn)
                elif pn.endswith('weight') and isinstance(m, whitelist_weight_modules):
                    # weights of whitelist modules will be weight decayed
                    decay.add(fpn)
                elif pn.endswith('weight') and isinstance(m, blacklist_weight_modules):
                    # weights of blacklist modules will NOT be weight decayed
                    no_decay.add(fpn)

        # validate that we considered every parameter
        param_dict = {pn: p for pn, p in self.named_parameters()}
        inter_params = decay & no_decay
        union_params = decay | no_decay

        assert len(inter_params) == 0, "parameters %s made it into both decay/no_decay sets!" % (str(inter_params), )
        assert len(param_dict.keys() - union_params) == config["n_layer"], "parameters %s were not separated into either decay/no_decay set!" \
                                                    # % (str(param_dict.keys() - union_params), )

        # create the pytorch optimizer object
        optim_groups = [
            {"params": [param_dict[pn] for pn in sorted(list(decay))], "weight_decay": config["weight_decay"]},
            {"params": [param_dict[pn] for pn in sorted(list(no_decay))], "weight_decay": 0.0},
        ]
        optimizer = torch.optim.AdamW(optim_groups, lr=config["learning_rate"], betas=config["betas"])
        return optimizer

In [None]:
dct = {}
for i in range(100):
    lst = [6 if x>=6 else x for x in [int(np.random.rand()*6) for x in range(6)]]
    lst_sort = sorted(lst)
    dct[i] = {"input": lst + lst_sort[:-1],
              "output": [-1] * 5 + lst_sort}

with open("data.json", "w") as f:
    f.write(json.dumps(dct))

In [None]:
with open("data.json", "r") as f:
    data = json.loads(f.read(), object_hook=lambda d: {int(k) if k.isdigit() else k: v for k, v in d.items()})
data[0]

{'input': [2, 0, 3, 0, 2, 4, 0, 0, 2, 2, 3],
 'output': [-1, -1, -1, -1, -1, 0, 0, 2, 2, 3, 4]}

In [None]:
len(data.keys())

100

In [None]:
lst=[]
for x in sorted(data.keys()):
    lst.append([data[x]['input'], data[x]['output']])
lst_train = lst[:70]
lst_test = lst[70:]

In [None]:
class StockDataset(Dataset):
    def __init__(self, train):
        if train=='train':
            self.data = lst_train
            self.datatype = 'train'
        else:
            self.data = lst_test
            self.datatype = 'test'

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        x,y = torch.tensor(self.data[idx][0]), torch.tensor(self.data[idx][1])
        return x, y

In [None]:
train_dataset = StockDataset('train')
test_dataset = StockDataset('test')
train_dataloader = DataLoader(train_dataset,
                            sampler=torch.utils.data.RandomSampler(train_dataset, replacement=True, num_samples=1000),
                            shuffle=False,
                            batch_size=config["batch_size"]
                        )
test_dataloader = DataLoader(test_dataset,
                            sampler=torch.utils.data.RandomSampler(test_dataset, replacement=True, num_samples=200),
                            shuffle=False,
                            batch_size=config["batch_size"]
                        )

In [None]:
def train_epocs(model, optimizer, train_dataloader, epochs):
    for i in range(epochs):
        start_time = time.time()
        model.train()
        idx=0
        sum_loss = 0
        for x, y in train_dataloader:
            x = x.cuda()
            y = y.cuda()
            logits, loss = model(x, y)
            model.zero_grad(set_to_none=True)
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), config["grad_norm_clip"])
            optimizer.step()
            idx+=len(x)
            sum_loss+= loss
        print("Total time taken for Epoch {} is {} secs and train loss = {}".format(i, int(time.time()-start_time), sum_loss/idx))
    return

In [None]:
model=GPT(config)

number of parameters: 0.03M


In [None]:
model = model.cuda()
optimizer = model.configure_optimizers(config)

In [None]:
train_epocs(model, optimizer, train_dataloader, epochs=20)

Total time taken for Epoch 0 is 8 secs and train loss = 0.3883194923400879
Total time taken for Epoch 1 is 4 secs and train loss = 0.2893466651439667
Total time taken for Epoch 2 is 5 secs and train loss = 0.23119935393333435
Total time taken for Epoch 3 is 4 secs and train loss = 0.19246211647987366
Total time taken for Epoch 4 is 5 secs and train loss = 0.16221240162849426
Total time taken for Epoch 5 is 4 secs and train loss = 0.13805975019931793
Total time taken for Epoch 6 is 4 secs and train loss = 0.11884656548500061
Total time taken for Epoch 7 is 5 secs and train loss = 0.10675780475139618
Total time taken for Epoch 8 is 4 secs and train loss = 0.09984679520130157
Total time taken for Epoch 9 is 5 secs and train loss = 0.08814658969640732
Total time taken for Epoch 10 is 5 secs and train loss = 0.08650403469800949
Total time taken for Epoch 11 is 5 secs and train loss = 0.07953567057847977
Total time taken for Epoch 12 is 5 secs and train loss = 0.07388158142566681
Total time 

In [None]:
PATH = "./sort_GPT.pt"
torch.save(model.state_dict(), PATH)

In [None]:
model=GPT(config)
model = model.cuda()
PATH = "./sort_GPT.pt"
model.load_state_dict(torch.load(PATH))
model.eval()

number of parameters: 0.03M


GPT(
  (transformer): ModuleDict(
    (wte): Embedding(7, 16)
    (wpe): Embedding(11, 16)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-5): 6 x Block(
        (ln_1): LayerNorm((16,), eps=1e-05, elementwise_affine=True)
        (attn): MultiHeadSelfAttention(
          (q_attn): Linear(in_features=16, out_features=16, bias=True)
          (k_attn): Linear(in_features=16, out_features=16, bias=True)
          (v_attn): Linear(in_features=16, out_features=16, bias=True)
          (c_proj): Linear(in_features=16, out_features=16, bias=True)
          (resid_dropout): Dropout(p=0.1, inplace=False)
          (multihead_attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=16, out_features=16, bias=True)
          )
        )
        (ln_2): LayerNorm((16,), eps=1e-05, elementwise_affine=True)
        (mlp): ModuleDict(
          (c_fc): Linear(in_features=16, out_features=64, bias=True)
          (c_proj): Linear(in_feature

In [22]:
inp = torch.tensor([[4, 0, 2, 1, 5, 2]], dtype=torch.long).to('cuda')
with torch.no_grad():
    cat = model.generate(inp, 6, do_sample=False, temperature=.1)
print(inp)
print(cat[:,6:])

tensor([[4, 0, 2, 1, 5, 2]], device='cuda:0')
tensor([[0, 1, 2, 2, 4, 5]], device='cuda:0')
