In [80]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim
import math
import random
from pathlib import Path
import pickle

# Vocabulary Definition

In [81]:
itos = {
    0: "0",
    1: "1",
    2: "2",
    3: "3",
    4: "4",
    5: "5",
    6: "6",
    7: "7",
    8: "8",
    9: "9",
    10: "+",
    11: "=",
    12: "$",
    13: "."
}

stoi = {v: k for k,v in itos.items()}

In [82]:
class AdditionDataset(Dataset):
    def __init__(self, random_seed = 42, max_num_units=6, num_elements = 20000):
        self.random_seed = 42
        self.num_elements = num_elements
        self.max_num_units = max_num_units
        self.max_len = max_num_units + 1 + max_num_units + 1 + (max_num_units+1) + 1
        
        
    def select_random_max_val(self):
        max_val = 1 * 10**self.max_num_units - 1
        
        prob = random.random()
        
        if prob < 0.02:
            max_val = 10
        
        elif prob < 0.06:
            max_val = 1e2
        
        elif prob < 0.11:
            max_val = 1e3
        
        elif prob < 0.16:
            max_val = 1e4
        
        elif prob < 0.20:
            max_val = 1e5
            
        return max_val
            
    
    def __getitem__(self, idx):
        # randomly select two integers, generate the string and return
        a_max_val = self.select_random_max_val()
        b_max_val = self.select_random_max_val()
        
        a = random.randint(0, a_max_val)
        b = random.randint(0, b_max_val)
        
        answer = a + b
        
        equation_string = str(a) + "+" + str(b) + "=" + str(answer)[::-1] + "$"
        while len(equation_string) < self.max_len:
            equation_string += "."
        
        # print(equation_string)
        
        x_list = [stoi[c] for c in equation_string[0:-1]]
        y_list = [stoi[c] for c in equation_string[1:]]
        
        mask_len = len(str(a) + "+" + str(b) + "=") - 1
        
        x,y = torch.tensor(x_list), torch.tensor(y_list)
        y[:mask_len] = -1
        y = torch.where(y == 13,-1, y)
        return x, y
        
    
    def __len__(self):
        return self.num_elements

In [83]:
dataset = AdditionDataset()
dataset.__getitem__(0)

(tensor([ 8,  4,  1,  8,  9, 10,  5,  2, 11,  1,  4,  2,  4,  8, 12, 13, 13, 13,
         13, 13, 13]),
 tensor([-1, -1, -1, -1, -1, -1, -1, -1,  1,  4,  2,  4,  8, 12, -1, -1, -1, -1,
         -1, -1, -1]))

# GPT

In [84]:
class LayerNorm(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.my_weight = nn.Parameter(torch.ones(config.n_embd))
        self.my_bias = nn.Parameter(torch.zeros(config.n_embd)) if config.use_bias else None
        
    
    def forward(self, x):
        return F.layer_norm(x, 
                     normalized_shape=self.my_weight.shape, 
                     weight=self.my_weight, 
                     bias=self.my_bias, 
                     eps=1e-5)


class SelfAttention(nn.Module):
    def __init__(self, config):
        super().__init__()
        
        self.c_attn = nn.Linear(config.n_embd, 3 * config.n_embd, bias=config.use_bias)
        self.c_proj = nn.Linear(config.n_embd, config.n_embd, bias=config.use_bias)
        
        self.attn_dropout = nn.Dropout(config.dropout)
        self.resid_dropout = nn.Dropout(config.dropout)
        
        self.head_size = int(config.n_embd / config.n_head) # h_dim
        self.num_head = config.n_head # number of heads
        self.block_size = config.block_size
        
        self.register_buffer("att_mask", torch.triu(float('-inf') * torch.ones(config.block_size, config.block_size), diagonal=1).view(1, 1, self.block_size, self.block_size))
        
        
    def forward(self, x):
        b, t, c = x.size()
        
        q, k, v = self.c_attn(x).split(c, dim=-1) # (b,t,c)
        
        q = q.view(b, t, self.num_head, self.head_size).transpose(2,1) # (b, n_h, t, h_dim)
        k = k.view(b, t, self.num_head, self.head_size).transpose(2,1) # (b, n_h, t, h_dim)
        v = v.view(b, t, self.num_head, self.head_size).transpose(2,1) # (b, n_h, t, h_dim)
        
        att_scores = q @ k.transpose(-1, -2) / math.sqrt(self.head_size) # (b, n_h, t, h_dim) @ (b, n_h, h_dim, t) --> (b, n_h, t, t)
        
        # mask the scores
        att_scores += self.att_mask[:, :, :t, :t]
        
        att_scores = torch.softmax(att_scores, dim=-1) # perform softmax for each element
        
        # perform dropout
        att_scores = self.attn_dropout(att_scores) #(b, n_h, t, t)
        
        out = att_scores @ v # (b, n_h, t, t) @ (b, b_h, t, h_dim) --> (b, n_h, t, h_dim)

        out = out.transpose(1, 2).contiguous() # (b, t, n_h, h_dim)
        out = out.view(b, t, c)
        out = self.resid_dropout(self.c_proj(out))
        
        return out

    
class MLP(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.up_proj = nn.Linear(config.n_embd, 4*config.n_embd, bias=config.use_bias)
        self.dropout = nn.Dropout(config.dropout)
        self.gelu = nn.GELU()
        self.down_proj = nn.Linear(config.n_embd * 4, config.n_embd, bias=config.use_bias)
        
        
    def forward(self, x):
        out = self.gelu(self.up_proj(x))
        out = self.down_proj(self.dropout(out))
        return out


class Block(nn.Module):
    def __init__(self, config):
        super().__init__()
        
        self.ln_1 = LayerNorm(config)
        self.sa = SelfAttention(config)
        
        self.ln_2 = LayerNorm(config)
        self.mlp = MLP(config)
    
    
    def forward(self, x):
        x = x +  self.sa(self.ln_1(x))
        x = x + self.mlp(self.ln_2(x))
        return x
        
        
class GPT(nn.Module):
    def __init__(self, config):
        super().__init__()
        
        self.wte = nn.Embedding(config.vocab_size, config.n_embd)
        self.wpe = nn.Embedding(config.block_size, config.n_embd)
        self.drop = nn.Dropout(config.dropout)
        
        self.layers = nn.ModuleList([Block(config) for _ in range(config.n_layer)])
        
        self.ln_f = LayerNorm(config)
        self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=config.use_bias)
        # self.lm_head.weight = self.wte.weight
    
    
    def forward(self, idx, targets=None):
        # make sure x is on same device
        device = next(self.parameters()).device
        
        idx = idx.to(device)
        
        b, t = idx.size()
        
        pos = torch.arange(0, t, device=device, dtype=torch.long)
        
        tok_emb = self.wte(idx) # (b, t, c)
        pos_emb = self.wpe(pos) # (t,) --> (t, c)
        
        x = tok_emb + pos_emb # (b, t, c)
        
        x = self.drop(x)
                
        for layer in self.layers:
            x = layer(x)
            
        
        logits = self.lm_head(x) # (b,t, c) --> (b, t, vocab_size)
        
        loss = None
        # compute cross entropy loss if targets are provided
        if targets is not None: # targets will be provided in shape b, t
            unrolled_targets = targets.view(b*t)
            unrolled_logits = logits.view(b*t, -1)
            
            loss = F.cross_entropy(unrolled_logits, unrolled_targets, ignore_index=-1)        
                
        return logits, loss

# Training Loop

In [85]:
def training_loop(model, train_loader, eval_loader, optimizer, scheduler, num_epochs=10, eval_every_iter=1000, print_every=200):
    num_iter = 0
    
    training_log = {
        'num_iter': [],
        'training_loss': [],
        'training_acc': [],
        'val_loss': [],
        'val_acc': []
    }
    
    
    device = next(model.parameters()).device
    
    
    for epoch_num in range(num_epochs):
        for i, (x,y) in enumerate(train_loader):
            
            x = x.to(device)
            y = y.to(device)
            
            outputs, loss = model(x, targets=y)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            
            if i % print_every == 0:
                print(f"Epoch {epoch_num}, iteration {i}: Training Loss: {loss.item()}")
                
            if num_iter % eval_every_iter == 0:
                
                training_log['num_iter'].append(num_iter)
                training_log['training_loss'].append(loss.item())
                
                val_loss, val_acc = run_eval(model, eval_loader)
                print(f"Epoch {epoch_num}, iteration {i}: Val loss: {val_loss}, val acc: {val_acc}")
                train_acc = calculate_accuracy(outputs, y)
                
                training_log['training_acc'].append(train_acc)
                training_log['val_loss'].append(val_loss)
                training_log['val_acc'].append(val_acc)
                
                save_embeddings(model, num_iter)

            num_iter += 1
        
        scheduler.step()
    
    return training_log

            
def save_embeddings(model, num_iter):
    device = next(model.parameters()).device
    idx = torch.arange(0, 10, device=device)
    embeddings = model.wte(idx)
    
    path = Path("embeddings/")
    if not path.exists():
        path.mkdir()
        
    torch.save(embeddings, f"embeddings/{num_iter}.pt")
    return embeddings
    

def calculate_accuracy(pred, yb):
    # pred shape: (b, t, v)
    _, pred_idx, = pred.max(dim=-1) # (b, t, v) --> (b, t)
    
    correct = 0
    total = pred_idx.size(0)
    
    # yb shape: (b, t)
    for i in range(pred.size(0)):
        pred_row = pred_idx[i].tolist()
        label_row = yb[i].tolist()
        
        
        # truncate all positions in front with -1
        first_index = 0
        for i in range(len(label_row)):
            if label_row[i] != -1:
                first_index = i
                break
        
        stop_index = label_row.index(12)
        
        pred_row = pred_row[first_index: stop_index]
        label_row = label_row[first_index: stop_index]
        
        if pred_row == label_row:
            correct += 1

    return correct / total
    # truncate all positions in end after (and including) $


@torch.no_grad()
def run_eval(model, loader):
    # calculate loss and also the percentage correct!
    total_loss = 0
    total_acc = 0
    num_iter = 0
    
    model.eval()
    
    for i, (x,y) in enumerate(loader):
        device = next(model.parameters()).device
        x = x.to(device)
        y = y.to(device)
        
        out, loss = model(x, targets=y)
        acc = calculate_accuracy(out, y)
        
        total_loss += loss.item()
        total_acc += acc
        num_iter += 1
    
    total_loss /= num_iter
    total_acc /= num_iter
    model.train()
    
    return total_loss, total_acc

In [90]:
class GPTConfig:
    vocab_size = 14
    block_size = 22
    n_embd = 64
    n_head = 8
    n_layer = 6
    use_bias = True
    dropout = 0.1


config = GPTConfig()
model = GPT(config)
model = model.to('cuda')

learning_rate = 1e-3


train_loader = DataLoader(AdditionDataset(), batch_size=32, shuffle=True)
val_loader = DataLoader(AdditionDataset(num_elements=1000), batch_size=32)
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
scheduler = optim.lr_scheduler.StepLR(optimizer, 10, gamma=0.4)

training_log = training_loop(model, train_loader, val_loader, optimizer, scheduler, eval_every_iter=500, num_epochs=23)

Epoch 0, iteration 0: Training Loss: 3.1260454654693604
Epoch 0, iteration 0: Val loss: 2.7471307441592216, val acc: 0.0
Epoch 0, iteration 200: Training Loss: 1.9223836660385132
Epoch 0, iteration 400: Training Loss: 1.8935703039169312
Epoch 0, iteration 500: Val loss: 1.8774835914373398, val acc: 0.0
Epoch 0, iteration 600: Training Loss: 1.9185903072357178
Epoch 1, iteration 0: Training Loss: 1.8560949563980103
Epoch 1, iteration 200: Training Loss: 1.8319061994552612
Epoch 1, iteration 375: Val loss: 1.6616678461432457, val acc: 0.0
Epoch 1, iteration 400: Training Loss: 1.6441210508346558
Epoch 1, iteration 600: Training Loss: 1.7708826065063477
Epoch 2, iteration 0: Training Loss: 1.605664610862732
Epoch 2, iteration 200: Training Loss: 1.669075608253479
Epoch 2, iteration 250: Val loss: 1.6125296503305435, val acc: 0.0029296875
Epoch 2, iteration 400: Training Loss: 1.6881994009017944
Epoch 2, iteration 600: Training Loss: 1.608519196510315
Epoch 3, iteration 0: Training Loss: 1

In [91]:
torch.save(model.state_dict(), "adder_gpt_state_dict.pth")

In [92]:
xb , yb = next(iter(train_loader))
xb = xb.to('cuda')
yb = yb.to('cuda')


print(xb.shape)
print(yb.shape)

logits, loss = model(xb, targets=yb)
# print(logits.size())
# print(loss)


calculate_accuracy(logits, yb)

torch.Size([32, 21])
torch.Size([32, 21])


1.0

In [93]:
with open('training_log.pkl', 'wb') as f:
    pickle.dump(f)

TypeError: dump() missing required argument 'file' (pos 2)

# Messing around with LayerNorm

In [24]:
a = torch.randn(1, 768)
weight = torch.ones(1, 768)
bias = torch.zeros(1, 768)

answer = F.layer_norm(a, a.size(), weight, bias)

print(f"a.size(): {a.size()}")
print(f"answer.size(): {answer.size()}")

a_avg = a.mean(dim=-1)
a_std = a.std(dim=-1)

print(a_avg)
print(a_std)

b = (a - a_avg) / a_std

print(a[0, 0:10])
print(b[0, 0:10])

a.size(): torch.Size([1, 768])
answer.size(): torch.Size([1, 768])
tensor([-0.0444])
tensor([1.0134])
tensor([-0.6823,  0.9493, -0.2643, -0.6182, -0.0727,  1.3347, -1.3283,  1.2171,
        -1.8012, -0.3760])
tensor([-0.6296,  0.9806, -0.2170, -0.5662, -0.0279,  1.3609, -1.2670,  1.2448,
        -1.7336, -0.3273])


In [49]:
a = torch.randn(2, 768)
weight = torch.ones(768)
bias = torch.zeros(768)

answer = F.layer_norm(a, (a.size(-1),), weight, bias)

print(f"a.size(): {a.size()}")
print(f"answer.size(): {answer.size()}")

a_avg = a.mean(dim=-1, keepdim=True)
a_std = a.std(dim=-1, keepdim=True)


# print(f"a.size(): {a.size()}")
# print(f"a_avg.size(): {a_avg.size()}")

# a_avg = a_avg.view(a_avg.size(0), -1)
# a_std = a_
# print(a_avg.size())

# print(a_std)

b = (a - a_avg.view(a_avg.size(), -1)) / (a_std.view(a_std.size(), -1)**2 + 1e-5)**0.5

print(answer[0, :10], b[0, :10])

# print(a[0, 0:10])
# print(b[0, 0:10])

a.size(): torch.Size([2, 768])
answer.size(): torch.Size([2, 768])
tensor([-0.7745, -0.6437,  1.1870,  0.5638, -2.0744,  0.2115,  0.8607,  1.6832,
         1.0694,  1.6970]) tensor([-0.7740, -0.6433,  1.1862,  0.5634, -2.0730,  0.2114,  0.8601,  1.6821,
         1.0687,  1.6959])
