In [1]:
import torch
import torch.nn as nn
from torch.nn import functional as F

#hyperparameters
batch_size = 64
block_size = 256
max_iters  = 5000
eval_intervals = 500
learning_rate=3e-4
device = 'cuda' if torch.cuda.is_available() else 'cpu'
eval_iters= 200
n_embed = 384
n_heads = 6
n_layer = 6
dropout = 0.2

#head dimension = 384/6 = 64 which is the standard dimension

torch.manual_seed(1337)

with open('input.txt','r',encoding = 'utf-8' ) as f:
    text = f.read()

chars = sorted(list(set(text)))
vocab_size = len(chars)

#mapping character to integer
stoi = { ch:i for i,ch in enumerate(chars) } #look up table from character to integer ch:i
iots = {i:ch for i,ch in enumerate(chars)} # vice versa from above
encode = lambda s: [stoi[c] for c in s]
decode = lambda l: ''.join([iots[i] for i in l])

#train and test split
data = torch.tensor(encode(text), dtype=torch.long)
n= int (0.9 * len(data))
train_data = data[:n]
val_data = data[n:]

#data loading
def get_batch(split):
    data = train_data if split == 'train' else val_data
    ix= torch.randint(len(data)-block_size, (batch_size,))  # 4 numbers randomly generated between 0 and (len(data)-block_size)
    x = torch.stack([data[i:i+block_size] for i in ix]) #stack the 1D data in rows
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
    x,y = x.to(device),y.to(device)
    return x, y

@torch.no_grad() #telling pytorch that we are not calling backward in this function
#this makes efficient use of memory as intermediate values don't have to be stored
def estimate_loss():
    out = {}
    model.eval()
    for split in ['train', 'val']:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            X,Y = get_batch(split)
            logits , loss = model(X,Y)
            losses[k]=loss.item()
        out[split]= losses.mean()
    model.train
    return out

class Head(nn.Module):
    #for single attention
    def __init__(self,head_size):
        super().__init__()
        self.key=nn.Linear(n_embed,head_size,bias=False)
        self.query=nn.Linear(n_embed,head_size,bias=False)
        self.value=nn.Linear(n_embed,head_size,bias=False)
        self.register_buffer('tril',torch.tril(torch.ones(block_size,block_size))) #to create tril
        self.dropout = nn.Dropout(dropout)
    def forward(self,x):
        B,T,C = x.shape
        k = self.key(x)
        q = self.query(x)
        wei = q @ k.transpose(-2,-1)*C**-0.5 #to normalize
        wei=wei.masked_fill(self.tril[:T,:T]==0,float('-inf'))
        wei =F.softmax(wei,dim=-1) #B,T,T
        wei = self.dropout(wei)
        v = self.value(x)
        out = wei @ v
        return out

class MultiHeadAttention(nn.Module):
    def __init__(self, num_heads, head_size):
        super().__init__()
        self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
        self.projection = nn.Linear(num_heads*head_size, n_embed)
        self.dropout = nn.Dropout(dropout)
    def forward(self,x):
        out =  torch.cat([h(x) for h in self.heads],dim =-1) #concat over the channel dimension
        out = self.projection(out)
        return out


class FeedForward(nn.Module):
    def __init__(self, n_embed):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(n_embed,4*n_embed), #per token
            nn.ReLU(),
            nn.Linear(4*n_embed,n_embed), #this is for projection in feed forward network
            nn.Dropout(dropout),
        )
    def forward(self,x):
        return self.net(x)
# we multiplied by 4 in linear part of ffn is based on attention is all you need paper


class Block(nn.Module):
    #communication followed by computation
    def __init__(self, n_embed, n_heads):
        super().__init__()
        head_size = n_embed//n_heads
        self.sa = MultiHeadAttention(n_heads,head_size)
        self.ffwd = FeedForward(n_embed)
        self.ln1= nn.LayerNorm(n_embed)
        self.ln2= nn.LayerNorm(n_embed)
    def forward(self, x):
        x= x + self.sa(self.ln1(x))
        x= x + self.ffwd(self.ln2(x))
        return x



class BigramLanguageModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size,n_embed)
        self.lm_head = nn.Linear(n_embed, vocab_size)
        self.blocks = nn.Sequential(


            *[Block(n_embed,n_heads = n_heads) for _ in range(n_layer)]
        )
        self.ln_f = nn.LayerNorm(n_embed)
        self.sa_head = MultiHeadAttention(4,n_embed//4) # 4 heads of 8 dimensional self attention
        self.feedforward = FeedForward(n_embed)
        self.position_embedding_table = nn.Embedding(block_size, n_embed)
    def forward(self,idx,targets=None): #here input is named as idx and target as targets
        B,T = idx.shape
        token_emb = self.token_embedding_table(idx) #B,T,C where B for batch=4 T for time=8 and C for channel=65
        pos_emb = self.position_embedding_table(torch.arange(T,device=device)) #T,C
        x= token_emb +pos_emb #B,T,C
        # x= self.sa_head(x)
        # x= self.feedforward(x) #B,T,C
        x = self.blocks(x)
        x= self.ln_f(x)
        logits=self.lm_head(x) #B,T,vocab_size #decoder
        if targets is None:
            loss = None
        else:
            B,T,C = logits.shape
            logits= logits.view(B*T,C)
            targets=targets.view(-1) #-1 here means B*T
            loss = F.cross_entropy(logits, targets)
        return logits , loss
    #to generate from model from B,T to B,T+1  B,T+2 and so on
    def generate(self,idx,max_new_tokens):
        #idx is (B,T) array of indices
        for _ in range(max_new_tokens):
            idx_cond = idx[:,-block_size:] #croping the contect that we feed into self so we don't pass more than box size element
            logits, loss = self(idx_cond)
            logits = logits[:,-1,:] #becomes B,C by plucking out last element from time dimension because they are used for future predictions
            probs = F.softmax(logits, dim=-1) #B,C
            idx_next = torch.multinomial(probs, num_samples= 1) #B,1
            idx= torch.cat((idx,idx_next),dim=1) #B,T+1
        return idx
model= BigramLanguageModel()
m = model.to(device)

#create optimizer
optimizer = torch.optim.AdamW(m.parameters(),lr=learning_rate)

for iter in range(max_iters):
    if iter % eval_intervals ==0:
        losses = estimate_loss()
        print(f"step {iter}: training loss{losses['train']:.4f}, val loss{losses['val']:.4f}")
    xb,yb= get_batch('train')
    logits , loss = model(xb,yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

#generate from model
context = torch.zeros((1,1),dtype = torch.long, device=device)
print(decode(m.generate(context,max_new_tokens=500)[0].tolist())) #0 index to unplug the single batch dimension that exists which gives us time steps i.e 1D array of all indices


step 0: training loss4.3744, val loss4.3799
step 500: training loss1.9200, val loss2.0288
step 1000: training loss1.5380, val loss1.7243
step 1500: training loss1.3891, val loss1.6158
step 2000: training loss1.2988, val loss1.5605
step 2500: training loss1.2205, val loss1.5373
step 3000: training loss1.1525, val loss1.5437
step 3500: training loss1.0768, val loss1.5815
step 4000: training loss1.0071, val loss1.6435
step 4500: training loss0.8900, val loss1.7504


YORK:
I being wind, is not to be senator,
Or to the desire taste such burnight of her
Such thrival, and the head of York in peace
Between times, they nothing else tender come
They livel but a time like higher base lives,
Season'd with childis upon the how year's fall.
But then to be gone with tears
And in thy drep in sovereignt rich hence;
But one post-for those days run with treasons.

ROMEO:
Is it my true Prince, so it is it and true,
Have denied it me state, by trany, tell me,
If he may vow,
