In [1]:

### Attention is all you need
### using multi head attention with the feed forward layer

import torch
import torch.nn as nn
from torch.nn import functional as F
import matplotlib.pyplot as plt



In [2]:

### Hyper parameters

block_size = 128
batch_size = 200
max_iters   = 5000
eval_interval = 100
device = 'cuda' if torch.cuda.is_available() else 'cpu'
eval_iters = 200
n_emb = 256*4
learning_rate = 0.0001
dropout = 0.2
n_layer = 4
n_head = 4
#+-------------------------+
#+-------------------------+

In [3]:

torch.manual_seed(1337)

data_loc = '/1.Abstract.txt'
with open(data_loc, 'r', encoding = 'utf-8') as f:
    text = f.read()

chars = sorted(list(set(text)))
vocab_size = len(chars)

### Encoder (tokenizer) and Decoder
stoi = {j:i for i,j in enumerate(chars)}
itos = {i:j for i,j in enumerate(chars)}
encode = lambda s: [stoi[c] for c in s] ## takes the string outputs the list of integers
decode = lambda l: "".join([itos[i] for i in l])

data = torch.tensor(encode(text), dtype = torch.long)

## split the data in train and val
n = int(len(data)*0.9)
train_data = data[:n]
val_data   = data[:n]

#------------------------


In [4]:

### split data in batches
torch.manual_seed(1337)

def get_batch(split):
    data = train_data if split == 'train' else val_data
    ix   = torch.randint(len(data) - block_size, (batch_size,)) ## returns random indices till len(data) - block_size -1
    x    = torch.stack([data[i:i+block_size] for i in ix])
    y    = torch.stack([data[i+1:i+block_size+1] for i in ix])
    x,y  = x.to(device), y.to(device)
    return x, y



## esimate loss (iterating over multiple batches)
@torch.no_grad() ## context manager --> not to maintain the map
def estimate_loss():
    out = {}
    model.eval()
    for split in ['train', 'val']:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            X,Y = get_batch(split)
            logits, loss = model(X,Y)
            losses[k] = loss.item()
        out[split] = losses.mean()
    model.train()
    return out

class Head(nn.Module):
    """ One head of self attention """
    def __init__(self, head_size):
        super().__init__()
        self.key   = nn.Linear(n_emb, head_size, bias = False)
        self.query = nn.Linear(n_emb, head_size, bias = False)
        self.value = nn.Linear(n_emb, head_size, bias = False)
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size))) ## according to convention as it is not a parameter

        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        B,T,C = x.shape
        k = self.key(x)    # B,T,C
        q = self.query(x)  # B,T,C
        ## compute attention scores (affinities)
        wei = q @ k.transpose(-2,-1) * C ** -0.5
        wei = wei.masked_fill(self.tril[:T,:T] == 0, float('-inf')) # (B,T,T) ## make this a decoder block
        wei = F.softmax(wei, dim = -1) # (B, T, T)
        wei = self.dropout(wei)

        v = self.value(x) # (B,T,C)
        out = wei @ v # (B, T,C)

        return out


## multi head attention

class MultiHeadAttention(nn.Module):
    "Multiple Heads of self attention in parallel"

    def __init__(self, num_heads, head_size):
        super().__init__()
        self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
        self.proj = nn.Linear(n_emb, n_emb)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        out =  torch.cat([h(x) for h in self.heads], dim = -1)
        out = self.dropout(self.proj(out))
        return out


## feed forward layer -- let the model figure out connections identified by the attention mechanism
class FeedForward(nn.Module):
    "a simple linear layer followed by a non linearity"

    def __init__(self, n_emb):
        super().__init__()
        self.net = nn.Sequential(nn.Linear(n_emb,4* n_emb),
        nn.ReLU(),
        nn.Linear(4*n_emb, n_emb),
        nn.Dropout(dropout),)

    def forward(self, x):
        return self.net(x)

class Block(nn.Module):
    "Transformer Block: Communication followed by computation"

    def __init__(self, n_emb, n_head):
        # n_emb - number of embedding dimentions, n_head - num of self attention heads
        super().__init__()
        head_size = n_emb//n_head
        self.sa = MultiHeadAttention(n_head, head_size)
        self.ffwf = FeedForward(n_emb)
        self.ln1 = nn.LayerNorm(n_emb)
        self.ln2 = nn.LayerNorm(n_emb)

    def forward(self, x):
        x = x + self.sa(self.ln1(x))
        x = x + self.ffwf(self.ln2(x))
        return x

## simple Bigram model

class BigramLanguageModel(nn.Module):

    def __init__(self, vocab_size):
        super().__init__()
        # each token directly reads off the logits for the next token from the lookup (emb) table
        self.token_embedding_table = nn.Embedding(vocab_size, n_emb)
        self.position_embedding_table = nn.Embedding(block_size, n_emb)
        self.blocks = nn.Sequential(*[Block(n_emb, n_head = n_head) for _ in range(n_layer)])
        self.ln_f = nn.LayerNorm(n_emb)
        self.lm_head1 = nn.Linear(n_emb, n_emb*4)
        self.lm_head2 = nn.Linear(n_emb*4, vocab_size)

    def forward(self, idx, targets = None):
        B, T = idx.shape
        # idx and targets are both (B,T) tensors
        tok_emb = self.token_embedding_table(idx) # logits --> log(counts) ## (B,T,C) this c  == nemb
        pos_emb = self.position_embedding_table(torch.arange(T, device = device)) # (T,C) c == nemb
        x = tok_emb + pos_emb
        x = self.blocks(x) ## apply one head self attention
        x = self.ln_f(x)
        x  = self.lm_head1(x)
        logits  = self.lm_head2(x)  ## (B,T,C) c == vocab size

        if targets == None:
            loss = None

        else:
            B,T,C  = logits.shape
            logits = logits.view(B*T,C) ## Crossentropy takes in B,C
            targets = targets.view(B*T) ## Crossentropy takes in B,C

            loss = F.cross_entropy(logits, targets)

        return logits, loss

    def generate(self, idx, max_new_tokens):
        # idx is (B, T) array of indices in the current context
        for _ in range(max_new_tokens):
            ## crop idx to the last block size token
            idx_cond = idx[:, -block_size:]
            # get the predictions
            logits, loss = self(idx_cond)
            # focus only on the last time step --> this will give prediction for each time step
            logits = logits[:, -1, :]  # (B, C)
            probs = F.softmax(logits, dim = -1) # (B,C)
            ## sample from the distribution
            idx_next = torch.multinomial(probs, num_samples = 1) # (B, 1)

            idx = torch.cat((idx, idx_next), dim = 1) #(B, T +1)

        return idx


model = BigramLanguageModel(vocab_size)
m = model.to(device) ## weights will be moved to GPU




In [5]:


# create a pytorch optimizer
learning_rate = 0.00003

optimizer = torch.optim.AdamW(m.parameters(), lr = learning_rate)

for step in range(max_iters):

    ## every once in a while estimate the loss on train and val sets

    if step % eval_interval == 0:
        losses = estimate_loss()
        print(f"step {step}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

    # get a mini batch
    xb, yb = get_batch('train')
    # evaluate the loss
    logits, loss = m(xb, yb)
    optimizer.zero_grad(set_to_none = True)
    loss.backward()
    optimizer.step()

print(loss.item())


### Generate from the model
context = torch.zeros((1,1), dtype = torch.long, device = device)
print(decode(m.generate(context, max_new_tokens = 1000)[0].tolist()))


step 0: train loss 4.6914, val loss 4.6915
step 100: train loss 2.4588, val loss 2.4599
step 200: train loss 2.3704, val loss 2.3708
step 300: train loss 2.2736, val loss 2.2723
step 400: train loss 2.1444, val loss 2.1442
step 500: train loss 1.9940, val loss 1.9944
step 600: train loss 1.8694, val loss 1.8680
step 700: train loss 1.7767, val loss 1.7744
step 800: train loss 1.7074, val loss 1.7075
step 900: train loss 1.6516, val loss 1.6536
step 1000: train loss 1.6090, val loss 1.6090
step 1100: train loss 1.5683, val loss 1.5686
step 1200: train loss 1.5354, val loss 1.5362
step 1300: train loss 1.5063, val loss 1.5062
step 1400: train loss 1.4812, val loss 1.4817
step 1500: train loss 1.4581, val loss 1.4584
step 1600: train loss 1.4390, val loss 1.4366
step 1700: train loss 1.4159, val loss 1.4162
step 1800: train loss 1.4023, val loss 1.4018
step 1900: train loss 1.3853, val loss 1.3868
step 2000: train loss 1.3692, val loss 1.3680
step 2100: train loss 1.3554, val loss 1.3551


In [6]:
torch.save(m.state_dict(), '/model_state_dict.pth')

In [7]:
model1 = torch.load('/model_state_dict.pth')


  model1 = torch.load('/model_state_dict.pth')


In [11]:
save_path = '/model_state_dict.pth'
torch.save(m.state_dict(), save_path)

In [12]:
model1 = BigramLanguageModel(vocab_size)
m1 = model1.to(device) ## weights will be moved to GPU
loaded_state_dict = torch.load('/model_state_dict.pth', map_location=device)  # Or 'cpu' for CPU
m1.load_state_dict(loaded_state_dict)

  loaded_state_dict = torch.load('/model_state_dict.pth', map_location=device)  # Or 'cpu' for CPU


<All keys matched successfully>

In [14]:
model.eval()
context = torch.zeros((1,1), dtype = torch.long, device = device)
print(decode(m1.generate(context, max_new_tokens = 1000)[0].tolist()))

	
How unto large. Grally better Boshelf Carning France.

As I don't merely the most of people to get several students of the kist what Y Combinator who give you at a YC near? That's when startup, the power must people to work in finding growth half in "Applayers in a companyon by temptable to the main returnes. It knows when you'd usee to that getting to students open to do to get the pretty of how how big doefint bore where.
In Crobabily 29 (1) that practick are says, someone that is and level the bubby unintreditionalishmen. The deadly my bothfor having grades. But area or friced I almost distractions and with variation flie so grew rate the half of rame goals.

[1] Jessica Live, or not real VCs folloo Wozniars

Arpuble. By their leves in Lisp Kone 3warses mone, it's forgoing here. If you don't know world be acaptions, that one crowing that. Once you're topics floop mind themselves — for notice, in having example. And individiagring partners. This everyone else domain order.

I notic