In [None]:
!wget -q https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt

In [None]:
import torch

In [None]:
# Load the whole corpus into memory as one string.
with open("input.txt", mode="r", encoding="utf-8") as corpus_file:
    text = corpus_file.read()

In [None]:
# Report the corpus size in characters.
num_chars = len(text)
print("length of dataset characters: ", num_chars)

In [None]:
# The vocabulary is every distinct character of the corpus, in sorted order.
chars = sorted(set(text))
vocab_size = len(chars)
# First line: all vocabulary characters; second line: how many there are.
print(''.join(chars), vocab_size, sep="\n")

In [None]:
# Lookup tables between characters and integer token ids; ids follow the order of `chars`.
itos = dict(enumerate(chars))
stoi = {ch: token_id for token_id, ch in itos.items()}
def encode(text):
    """Translate a string into a list of integer token ids using the ``stoi`` table.

    Raises:
        KeyError: if ``text`` contains a character that is not a key of ``stoi``.
    """
    return [stoi[ch] for ch in text]

def decode(encoding_list):
    """Translate an iterable of integer token ids back into a string using ``itos``.

    Args:
        encoding_list: iterable of token ids (e.g. a list produced by ``encode``).

    Returns:
        The concatenation of the characters the ids map to; ``""`` for empty input.

    Raises:
        KeyError: if an id is not a key of ``itos``.
    """
    # str.join assembles the result in a single pass instead of growing a string with +=.
    return "".join(itos[token_id] for token_id in encoding_list)

# Round-trip a sample sentence through the tokenizer to sanity-check encode/decode.
encoded = encode("hello how are you?")
encoded_tensor = torch.tensor(encoded)  # built once and reused for both printouts
print(encoded_tensor)
decoded = decode(encoded)
print(decoded)
print(encoded_tensor.shape, encoded_tensor.dtype)

In [None]:
# Tokenize the entire corpus into one flat tensor of integer ids.
data = torch.tensor(encode(text), dtype=torch.int64)  # torch.int64 is the dtype torch.long aliases
print(data.shape, data.dtype)
# Peek at the first 100 token ids only.
print(data[0:100])

In [None]:
# Train/validation split: the first 90% of the token stream trains the model,
# the remaining tail is held out for validation.
n = int(0.9 * len(data))
train_data, val_data = data[:n], data[n:]


In [None]:
block_size = 8  # number of tokens in one context window
# One extra token so that every position in the window has a "next token" to predict.
train_sample = train_data[0 : block_size + 1]

In [None]:
# Show the sample chunk both as token ids and as the text it decodes to.
ans_list = train_sample.tolist()
ans_str = decode(ans_list)
print(ans_list, ans_str, sep="\n")

In [None]:
# Inputs are the first block_size tokens; targets are the same window shifted right by one.
x, y = train_data[:block_size], train_data[1 : block_size + 1]
# Each prefix of x is a training context whose target is the token that follows it.
for t in range(block_size):
    context, target = x[: t + 1], y[t]
    print(f"when input is {context} -> the target: {target}")

In [None]:
batch_size, block_size = 4, 8  # windows per batch, tokens per window
torch.manual_seed(1337)  # fixed seed so the randomly sampled batch is reproducible

def get_batch(split):
    """Sample a random batch of (input, target) windows from one of the data splits.

    ``split == "train"`` reads from ``train_data``; any other value reads from ``val_data``.

    Returns:
        ``(x, y)``: two tensors, each stacked from ``batch_size`` windows of ``block_size``
        tokens, where every row of ``y`` is the matching row of ``x`` shifted one position
        to the right in the source data.
    """
    source = train_data if split == "train" else val_data
    # Random window starts; the upper bound leaves room for the one-step-shifted target window.
    starts = torch.randint(len(source) - block_size, size=(batch_size,))
    inputs, targets = [], []
    for start in starts:
        inputs.append(source[start : start + block_size])
        targets.append(source[start + 1 : start + 1 + block_size])
    return torch.stack(inputs), torch.stack(targets)
xb, yb = get_batch('train')
# Print each tensor of the batch under its heading: shape first, then contents.
for heading, batch in (('inputs:', xb), ('targets:', yb)):
    print(heading)
    print(batch.shape)
    print(batch)

print('----')

In [None]:
# Spell out every (context -> target) training example packed into the batch.
for b in range(batch_size):
    row_inputs, row_targets = xb[b], yb[b]
    for t in range(block_size):
        context = row_inputs[: t + 1]
        target = row_targets[t]
        print(f"when input is {context.tolist()} the target: {target}")


In [None]:
# Batch contents on the first line(s), its shape on the last.
print(xb, xb.shape, sep="\n")

In [None]:
# Bare expression: the notebook displays the vocabulary size as this cell's output.
vocab_size

In [None]:
# Compare the largest token id in each tensor against vocab_size
# (the ids are used to index an embedding table with vocab_size rows).
print("vocab_size:", vocab_size)
for label, tokens in (("train max token:", train_data), ("val max token:", val_data)):
    print(label, int(tokens.max()))

for label, tokens in (("xb max:", xb), ("yb max:", yb)):
    print(label, int(tokens.max()))


In [None]:
import torch.nn as nn
from torch.nn import functional as F
# torch.manual_seed(1337)
n_embed = 32
class BigramLanguageModel(nn.Module):
    """Character-level language model: token + position embeddings feeding a linear head.

    Args:
        vocab_size: number of distinct token ids; sizes the token table and the output logits.
        n_embed: width of the token and position embeddings.
        context_size: longest sequence the position table supports. When omitted, the
            notebook-level ``block_size`` is read once, at construction time.
    """

    def __init__(self, vocab_size, n_embed=32, context_size=None):
        super().__init__()
        # Keep the context length on the instance so generate() always crops to the size of
        # the position table, even if the global block_size is reassigned afterwards.
        self.context_size = block_size if context_size is None else context_size
        self.token_embedding_table = nn.Embedding(vocab_size, n_embed)
        self.pos_embedding_table = nn.Embedding(self.context_size, n_embed)
        self.lm_head = nn.Linear(n_embed, vocab_size)

    def forward(self, idx, targets=None):
        """Compute next-token logits and, when targets are given, the cross-entropy loss.

        Args:
            idx: (B, T) tensor of integer token ids, with T no larger than ``context_size``.
            targets: optional (B, T) tensor of integer token ids.

        Returns:
            ``(logits, loss)``. Without targets, ``logits`` is (B, T, vocab_size) and ``loss``
            is ``None``. With targets, ``logits`` is flattened to (B*T, vocab_size) and
            ``loss`` is a scalar tensor.
        """
        B, T = idx.shape
        token = self.token_embedding_table(idx)  # (B, T, C), C = n_embed
        # Create the position ids on the input's device rather than assuming CUDA, so the
        # model works on CPU and on whichever GPU it was moved to.
        positions = torch.arange(T, device=idx.device)
        pos = self.pos_embedding_table(positions)  # (T, C)
        x = token + pos  # (T, C) broadcasts over the batch dimension -> (B, T, C)
        logits = self.lm_head(x)  # (B, T, vocab_size)

        if targets is None:
            return logits, None

        B, T, C = logits.shape
        # F.cross_entropy takes (N, C) logits with (N,) class indices, so fold B and T together.
        logits = logits.view(B * T, C)
        loss = F.cross_entropy(logits, targets.reshape(B * T))
        return logits, loss

    @torch.no_grad()  # sampling never backpropagates, so skip building an autograd graph
    def generate(self, idx, max_new_tokens):
        """Sample ``max_new_tokens`` tokens autoregressively and append them to ``idx``.

        Args:
            idx: (B, T) tensor of integer token ids used as the starting context.
            max_new_tokens: number of tokens to sample for each sequence.

        Returns:
            (B, T + max_new_tokens) tensor: the input ids followed by the sampled ids.
        """
        for _ in range(max_new_tokens):
            # The position table only covers context_size positions: keep the newest tokens.
            idx_cond = idx[:, -self.context_size:]
            logits, _loss = self(idx_cond)
            # Only the last time step predicts the next token.
            logits = logits[:, -1, :]  # (B, vocab_size)
            probs = F.softmax(logits, dim=-1)  # (B, vocab_size)
            idx_next = torch.multinomial(probs, num_samples=1)  # (B, 1)
            idx = torch.cat((idx, idx_next), dim=1)  # (B, T+1)
        return idx
    

# Use the GPU when one is available and fall back to the CPU otherwise.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Size the model from the dataset's vocabulary instead of a hard-coded 65.
m = BigramLanguageModel(vocab_size=vocab_size)
m = m.to(device=device)
logits, loss = m(xb.to(device=device), yb.to(device=device))
print(logits.shape)
print(loss)


# Start generation from a single token with id 0 (the first character of the vocab)
# and decode the 100 sampled tokens of the only sequence in the batch.
idx = torch.zeros((1, 1), dtype=torch.long, device=device)
print(decode(m.generate(idx, 100)[0].tolist()))


In [None]:
batch_size = 32  # get_batch reads this global, so later batches hold 32 windows
optimizer = torch.optim.AdamW(m.parameters(), lr=1e-3)
# get_batch slices CPU tensors, so every batch has to be moved to wherever the model's
# parameters live; otherwise the forward pass fails with a device mismatch.
model_device = next(m.parameters()).device
for steps in range(1000):
    xb, yb = get_batch('train')  # each (batch_size, block_size)
    xb, yb = xb.to(model_device), yb.to(model_device)

    # Forward pass (m(...) dispatches to forward) and one optimizer update.
    logits, loss = m(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

# Loss of the final training batch.
print(loss.item())

In [None]:
# Toy tensor for the averaging demo below: batch, time, channels.
B, T, C = 4, 8, 2
X = torch.randn((B, T, C))
X.shape

In [None]:
# Causal averaging expressed as a masked softmax followed by a matrix product.
tril = torch.ones(T, T).tril()  # lower-triangular ones: position t may see positions <= t
wei = torch.zeros(T, T).masked_fill(tril == 0, float('-inf'))  # -inf hides the future tokens
wei = wei.softmax(dim=-1)  # every row becomes a uniform average over its visible positions
xbow = wei @ X  # (T, T) @ (B, T, C) broadcasts over the batch
print(xbow.shape)

In [None]:
# version 4: self-attention!
torch.manual_seed(1337)
B, T, C = 4, 8, 32  # batch, time, channels
x = torch.randn(B, T, C)

# A single attention head: project every token to a key, a query and a value vector.
# The three layers are created in key -> query -> value order.
head_size = 16
key, query, value = (nn.Linear(C, head_size, bias=False) for _ in range(3))

k, q, v = key(x), query(x), value(x)  # each (B, T, head_size)

# Raw query-key affinities, (B, T, T). Note they are not scaled by head_size here.
wei = q @ k.transpose(-2, -1)

# Hide future positions, then normalise each row into attention weights.
tril = torch.tril(torch.ones(T, T))
wei = F.softmax(wei.masked_fill(tril == 0, float('-inf')), dim=-1)

# Weighted sum of the value vectors: (B, T, head_size).
out = wei @ v

out.shape