In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(device)
block_size = 64
batch_size = 128
max_iters = 3000
learning_rate = 3e-4 #3e-3, 3e-4, 1e-3, 1e-4
eval_iters = 100
n_embd = 384
n_head = 8
n_layer = 8 #number of blocks
dropout = 0.2

cuda


In [2]:
chars = ""
with open('wizard_of_oz.txt','r', encoding='utf-8') as f:
    text  = f.read()
    chars = sorted(list(set(text)))

vocab_size = len(chars)
print(text[:200])
print(chars)
print(len(chars))

﻿DOROTHY AND THE WIZARD IN OZ

  BY

  L. FRANK BAUM

  AUTHOR OF THE WIZARD OF OZ, THE LAND OF OZ, OZMA OF OZ, ETC.

  ILLUSTRATED BY JOHN R. NEILL

  BOOKS OF WONDER WILLIAM MORROW & CO., INC. NEW Y
['\n', ' ', '!', '"', '&', "'", '(', ')', '*', ',', '-', '.', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '[', ']', '_', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '\ufeff']
81


In [3]:
string_to_int = {ch:i for i, ch in enumerate(chars)}
int_to_string = {i:ch for i,ch in enumerate(chars)}
encode = lambda s: [string_to_int[c] for c in s]
decode = lambda l: ''.join([int_to_string[i] for i in l])

encoded_hello = encode('hello')
decoded_hello = decode(encoded_hello)
print(decoded_hello)

hello


In [4]:
data = torch.tensor(encode(text), dtype=torch.long)
print(data[:100])

tensor([80, 28, 39, 42, 39, 44, 32, 49,  1, 25, 38, 28,  1, 44, 32, 29,  1, 47,
        33, 50, 25, 42, 28,  1, 33, 38,  1, 39, 50,  0,  0,  1,  1, 26, 49,  0,
         0,  1,  1, 36, 11,  1, 30, 42, 25, 38, 35,  1, 26, 25, 45, 37,  0,  0,
         1,  1, 25, 45, 44, 32, 39, 42,  1, 39, 30,  1, 44, 32, 29,  1, 47, 33,
        50, 25, 42, 28,  1, 39, 30,  1, 39, 50,  9,  1, 44, 32, 29,  1, 36, 25,
        38, 28,  1, 39, 30,  1, 39, 50,  9,  1])


In [5]:
n = int(0.8*len(data))
train_data = data[:n]
val_data = data[n:]

def get_batch(split):
    data = train_data if split == 'train' else val_data
    ix = torch.randint(len(data) - block_size, (batch_size,))
    #print(ix)
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
    x,y = x.to(device), y.to(device)
    return x,y

x,y = get_batch('train')
print('inputs:')
print(x)
print('targets:')
print(y)

inputs:
tensor([[ 1, 72, 76,  ..., 55, 58, 59],
        [60, 65, 58,  ..., 56, 54, 65],
        [ 1, 56, 62,  ..., 72, 64, 58],
        ...,
        [ 3, 32, 58,  ..., 56, 58,  1],
        [67, 73,  1,  ...,  0, 56, 68],
        [59,  1, 73,  ..., 68, 74, 65]], device='cuda:0')
targets:
tensor([[72, 76, 62,  ..., 58, 59, 68],
        [65, 58, 57,  ..., 54, 65, 65],
        [56, 62, 71,  ..., 64, 58, 57],
        ...,
        [32, 58, 71,  ..., 58,  1, 68],
        [73,  1, 61,  ..., 56, 68, 67],
        [ 1, 73, 61,  ..., 74, 65, 57]], device='cuda:0')


In [6]:

x = train_data[:block_size]
y = train_data[1:block_size+1]
for t in range(block_size):
    context = x[:t+1]
    target = y[t]
    print('when input is', context, 'target is', target)

when input is tensor([80]) target is tensor(28)
when input is tensor([80, 28]) target is tensor(39)
when input is tensor([80, 28, 39]) target is tensor(42)
when input is tensor([80, 28, 39, 42]) target is tensor(39)
when input is tensor([80, 28, 39, 42, 39]) target is tensor(44)
when input is tensor([80, 28, 39, 42, 39, 44]) target is tensor(32)
when input is tensor([80, 28, 39, 42, 39, 44, 32]) target is tensor(49)
when input is tensor([80, 28, 39, 42, 39, 44, 32, 49]) target is tensor(1)
when input is tensor([80, 28, 39, 42, 39, 44, 32, 49,  1]) target is tensor(25)
when input is tensor([80, 28, 39, 42, 39, 44, 32, 49,  1, 25]) target is tensor(38)
when input is tensor([80, 28, 39, 42, 39, 44, 32, 49,  1, 25, 38]) target is tensor(28)
when input is tensor([80, 28, 39, 42, 39, 44, 32, 49,  1, 25, 38, 28]) target is tensor(1)
when input is tensor([80, 28, 39, 42, 39, 44, 32, 49,  1, 25, 38, 28,  1]) target is tensor(44)
when input is tensor([80, 28, 39, 42, 39, 44, 32, 49,  1, 25, 38, 

In [7]:
class Head(nn.Module):
    #one head of self attention
    def __init__(self, head_size):
        super().__init__()
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))
        self.dropout = nn.Dropout(dropout)

    def forward(self,x ):
        B,T,C = x.shape
        k = self.key(x)
        q = self.query(x)
        #Compute attention scores
        wei = q @ k.transpose(-2,-1) * k.shape[-1]**-0.5 #(B,T,hs) @ (B,hs,T) -> (B,T,T)
        wei = wei.masked_fill(self.tril[:T,:T]==0, float('-inf')) # (B,T,T)
        wei = F.softmax(wei, dim=-1) #(B,T,T)
        wei = self.dropout(wei)
        #perform weighted aggregation of the values
        v = self.value(x) #(B,T,hs)
        out = wei @ v #(B,T,T) @ (B,T,hs) -> (B,T,hs)
        return out    

class MultiHeadAttention(nn.Module):
    # Multiple heads of self attention in parallel
    def __init__(self, num_heads, head_size):
        super().__init__()
        self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
        self.proj = nn.Linear(head_size * num_heads, n_embd)
        self.dropout = nn.Dropout(dropout)

    def forward(self,x ):
        out = torch.cat([h(x) for h in self.heads], dim=-1) #(B, T, F) -> (B, T, [h1, h1, h1, h1, h2, h2, h2, h2, h3, h3, h3, h3])
        out = self.dropout(self.proj(out))
        return out

class FeedForward(nn.Module):
    #Simple layer followed by non-linearity
    def __init__(self, n_embd):
        super().__init__()
        self.net = nn.Sequential(nn.Linear(n_embd, 4*n_embd),
                                 nn.ReLU(),
                                 nn.Linear(4*n_embd, n_embd),
                                 nn.Dropout(dropout),
                                )

    def forward(self, x):
        return self.net(x)
    
        
class Block(nn.Module):
    def __init__(self, n_embd, n_head):
        super().__init__()
        head_size = n_embd // n_head
        self.sa = MultiHeadAttention(n_head, head_size) #Self Attention
        self.ffwd = FeedForward(n_embd) #Feed Forward
        self.ln1 = nn.LayerNorm(n_embd) #Layer Norm 1
        self.ln2 = nn.LayerNorm(n_embd) #Layer Norm 2

    def forward(self, x):
        y = self.sa(x)
        x = self.ln1(x+y)
        y = self.ffwd(x)
        x = self.ln2(x+y)
        return x

class GPTLanguageModel(nn.Module):
    def __init__(self, vocab_size):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        self.position_emdedding_table = nn.Embedding(block_size, n_embd)
        self.blocks = nn.Sequential(*[Block(n_embd, n_head=n_head) for _ in range(n_layer)])
        self.ln_f = nn.LayerNorm(n_embd)
        self.lm_head = nn.Linear(n_embd, vocab_size)

    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)

    def forward(self, index, targets=None):
               
        B, T = index.shape

        # index and targets are both (B,T) tensor of integers        
        tok_emb = self.token_embedding_table(index) #(B,T,C)
        pos_emb = self.position_emdedding_table(torch.arange(T, device=device))#(T,C)
        x = tok_emb + pos_emb #(B,T,C)
        x = self.blocks(x) #(B,T,C)
        x = self.ln_f(x) #(B,T,C)
        logits = self.lm_head(x) #(B,T,vocab_size)

        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    def generate(self, index, max_new_tokens):
        for _ in range(max_new_tokens):
            logits, loss = self.forward(index)
            logits = logits[:, -1, :]
            probs = F.softmax(logits, dim=-1)
            index_next = torch.multinomial(probs, num_samples=1)
            index = torch.cat((index, index_next), dim=1)
        return index
model = GPTLanguageModel(vocab_size)
m = model.to(device)

        

In [8]:
@torch.no_grad()
def estimate_loss():
    out = {}
    model.eval()
    for split in ['train', 'val']:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            X,Y = get_batch(split)
            logits, loss = model(X,Y)
            losses[k] = loss.item()
        out[split] = losses.mean()
    model.train()
    return out

In [9]:
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
for iter in range(max_iters):
    if iter % eval_iters == 0:
        losses = estimate_loss()
        print(f'step: {iter}, train loss: {losses['train']:.3f}, val loss : {losses['train']:.3f}')
    xb, yb = get_batch('train')
    logits, loss = model.forward(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()
    
print(loss.item())

step: 0, train loss: 4.533, val loss : 4.533
step: 100, train loss: 2.303, val loss : 2.303
step: 200, train loss: 1.927, val loss : 1.927
step: 300, train loss: 1.667, val loss : 1.667
step: 400, train loss: 1.503, val loss : 1.503
step: 500, train loss: 1.388, val loss : 1.388
step: 600, train loss: 1.309, val loss : 1.309
step: 700, train loss: 1.241, val loss : 1.241
step: 800, train loss: 1.177, val loss : 1.177
step: 900, train loss: 1.119, val loss : 1.119
step: 1000, train loss: 1.067, val loss : 1.067
step: 1100, train loss: 1.025, val loss : 1.025
step: 1200, train loss: 0.972, val loss : 0.972
step: 1300, train loss: 0.928, val loss : 0.928
step: 1400, train loss: 0.882, val loss : 0.882
step: 1500, train loss: 0.836, val loss : 0.836
step: 1600, train loss: 0.793, val loss : 0.793
step: 1700, train loss: 0.751, val loss : 0.751
step: 1800, train loss: 0.711, val loss : 0.711
step: 1900, train loss: 0.669, val loss : 0.669
step: 2000, train loss: 0.637, val loss : 0.637
step

In [10]:
context = torch.zeros((1,1), dtype=torch.long, device=device)
generated_chars=decode(m.generate(context, max_new_tokens=100)[0].tolist())
print(generated_chars)

RuntimeError: The size of tensor a (64) must match the size of tensor b (65) at non-singleton dimension 2