In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F

In [3]:
with open('input.txt') as file:
    text = file.read()

In [4]:
print("length of dataset in characters: ", len(text))

length of dataset in characters:  1115394


In [5]:
chars = sorted(list(set(text)))
vocab_size = len(chars)
print(vocab_size)
print(chars)

65
['\n', ' ', '!', '$', '&', "'", ',', '-', '.', '3', ':', ';', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']


In [6]:
char_to_index = {i:j for j,i in enumerate(chars)}
index_to_char = {i:j for i,j in enumerate(chars)}
encode = lambda s:[char_to_index[i] for i in s]
decode = lambda s:''.join([index_to_char[i] for i in s])

In [7]:
encode('siva sankar')

[57, 47, 60, 39, 1, 57, 39, 52, 49, 39, 56]

In [8]:
decode([57, 47, 60, 39, 1, 57, 39, 52, 49, 39, 56])

'siva sankar'

In [186]:
batch_size = 16 # how many independent sequences will we process in parallel?
block_size = 32 # what is the maximum context length for predictions?
max_iters = 5000 #training max weight updates
eval_interval = 100 #evalution after every 
learning_rate = 1e-3 #learing rate
device = 'cuda' if torch.cuda.is_available() else 'cpu' #device
device = 'cpu'
n_embd = 64 # w2v or embedding vector size
n_heads = 4 # multihead_attention no.of head
blocks = 4  # no.of multihead_attention blocks
dropout = 0.0 #drop out
head_size = n_embd // n_heads # q,k,v matrix output length

In [187]:
class Initial_embedding(nn.Module):
    def __init__(self):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size,n_embd) #embedding layer

    def forward(self,idx):
        # here idx (B,T)
        x = self.embedding(idx)  # (B,T,n_embd)
        return x

In [196]:
class Position_embedding(nn.Module):
    def __init__(self):
        super().__init__()
        self.position = nn.Embedding(block_size,n_embd)
    def forward(self,idx):
        # here  
        B,T,C = idx.shape
        idx += self.position(torch.arange(T,device=device)) #(B,T,n_embd) + (T,n_emnd) = (B,T,n_embd)
        return idx

In [197]:
class HeadAttention(nn.Module):
    def __init__(self):
        super().__init__()
        self.query = nn.Linear(n_embd,head_size,bias=False)
        self.key = nn.Linear(n_embd,head_size,bias=False)
        self.values = nn.Linear(n_embd,head_size,bias=False)
        self.tril = torch.tril(torch.ones(block_size,block_size)).to(device)

    def forward(self,idx):

        B,T,n_embd = idx.shape
        q = self.query(idx)  ## (B,T,n_embd) * (n_embd,head_size) = (B,T,head_size)
        k = self.key(idx)    ## (B,T,n_embd) * (n_embd,head_size) = (B,T,head_size)
        v = self.values(idx) ## (B,T,n_embd) * (n_embd,head_size) = (B,T,head_size)
        x = q @ k.transpose(-2,-1)/head_size**0.5   # (B,T,head_size) * (B,head_size,T) = (B,T,T)
        x = x.masked_fill(self.tril[:T,:T]==0,float('-inf'))  
        x = F.softmax(x,dim=-1)
        x = x @ v            ## (B,T,T) * (B,T,head_size) = (B,T,head_size)

        return x

In [198]:
class MultiHeadAttention(nn.Module):
    def __init__(self):
        super().__init__()
        self.heads = nn.ModuleList([HeadAttention() for i in range(n_heads)])
        self.linear = nn.Linear(n_embd,n_embd)
        self.dropout = nn.Dropout(dropout)

    def forward(self,x):
        x = torch.cat([h(x) for h in self.heads], dim=-1)
        x = self.linear(x)
        x = self.dropout(x)

        return x

In [199]:
class FeedFoward(nn.Module):
    def __init__(self):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(n_embd, 4 * head_size),
            nn.ReLU(),
            nn.Linear(4 * head_size, n_embd),
            nn.Dropout(dropout),
        )
    def forward(self,x):
        return self.net(x)

In [200]:
class Block(nn.Module):
    def __init__(self):
        super().__init__()
        self.normlayer = nn.LayerNorm(n_embd)
        self.multihead = MultiHeadAttention()
        self.feedforward = FeedFoward()

    def forward(self,x):
        x1 = self.multihead(x)
        x1 += x
        x1 = self.normlayer(x1)
        x1 = self.feedforward(x1)
        x1 += x 
        x1 = self.normlayer(x1)

        return x1

In [201]:
class GPT(nn.Module):
    def __init__(self):
        super().__init__()
        self.embedding = Initial_embedding()
        self.position_embedding = Position_embedding()
        self.blocks = nn.Sequential(*[Block() for i in range(blocks)])
        self.linear = nn.Linear(n_embd,vocab_size)

    def forward(self,x,targets=None):
        x = self.embedding(x)
        x = self.position_embedding(x)
        x = self.blocks(x)
        x = self.linear(x)

        if targets is None:
            loss = None
        else:
            B,T,C = x.shape
            x = x.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(x, targets)

        return x,loss

    def generate(self, idx, max_new_tokens):
        # idx is (B, T) array of indices in the current context
        for _ in range(max_new_tokens):
            # crop idx to the last block_size tokens
            idx_cond = idx[:, -block_size:]
            # get the predictions
            logits, loss = self(idx_cond)
            # focus only on the last time step
            logits = logits[:, -1, :] # becomes (B, C)
            # apply softmax to get probabilities
            probs = F.softmax(logits, dim=-1) # (B, C)
            # sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)
            # append sampled index to the running sequence
            idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)
        return idx

### train test split

In [22]:
import torch
data = torch.tensor(encode(text))
print(data.shape)
print(data[:100])

torch.Size([1115394])
tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 14, 43, 44,
        53, 56, 43,  1, 61, 43,  1, 54, 56, 53, 41, 43, 43, 42,  1, 39, 52, 63,
         1, 44, 59, 56, 58, 46, 43, 56,  6,  1, 46, 43, 39, 56,  1, 51, 43,  1,
        57, 54, 43, 39, 49,  8,  0,  0, 13, 50, 50, 10,  0, 31, 54, 43, 39, 49,
         6,  1, 57, 54, 43, 39, 49,  8,  0,  0, 18, 47, 56, 57, 58,  1, 15, 47,
        58, 47, 64, 43, 52, 10,  0, 37, 53, 59])


In [23]:
upto = int(len(data)*90/100)
train_data = data[:upto]
valid_data = data[upto:]
train_data.shape,valid_data.shape

(torch.Size([1003854]), torch.Size([111540]))

In [24]:
torch.manual_seed(100) # seed

def get_batch(data):
    data = train_data if data=='train' else valid_data
    start_points = torch.randint(len(data) - block_size, (batch_size,))
    x = torch.stack([data[start:start+block_size] for start in start_points])
    y = torch.stack([data[start+1:start+block_size+1] for start in start_points])
    return x,y

In [206]:
model = GPT()

model = model.to(device)

In [208]:
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
for iter in range(max_iters):
    # sample a batch of data
    xb, yb = get_batch('train')


    # evaluate the loss
    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()
    if iter % eval_interval == 0:
        xb,yb = get_batch('valid')
        logits,valid_loss = model(xb,yb)
        print(f"iteration {iter} train_loss {loss} valid_loss {valid_loss}")

iteration 0 train_loss 1.6059319972991943 valid_loss 1.8289523124694824
iteration 100 train_loss 1.7760330438613892 valid_loss 1.9158116579055786
iteration 200 train_loss 1.771547794342041 valid_loss 2.128838539123535
iteration 300 train_loss 1.8115777969360352 valid_loss 1.75994074344635
iteration 400 train_loss 1.8020707368850708 valid_loss 1.7162811756134033
iteration 500 train_loss 1.8282545804977417 valid_loss 1.9052622318267822
iteration 600 train_loss 1.7830768823623657 valid_loss 1.8486415147781372
iteration 700 train_loss 1.680596113204956 valid_loss 1.850701928138733
iteration 800 train_loss 1.742267370223999 valid_loss 1.9102754592895508
iteration 900 train_loss 1.7285118103027344 valid_loss 1.9047157764434814
iteration 1000 train_loss 1.6784862279891968 valid_loss 2.0332984924316406
iteration 1100 train_loss 1.673144817352295 valid_loss 1.754084825515747
iteration 1200 train_loss 1.6554232835769653 valid_loss 1.8107984066009521
iteration 1300 train_loss 1.7195836305618286 v

In [224]:
text = 'hello'
text = encode(text)
print(decode(model.generate(torch.tensor([text]),max_new_tokens=1000)[0].tolist()))

hellow see alouder beneing'd,
Not is marrowed noble more so shand all:
On he'lls to and the peopt-hacine contacinen'd, I wicks
Had call very massure to pary him. Would pribled by, thought, whence Genessure armone of youll I begge to from tight,
Before it doth
To the beater; hone, my lord, all thy wring sofder
Wliccent, was out humes; whicher since.

CLAs:
What'derneral it you ending!

DUCHESS:
Erely haven when I'ld throught he burthers,
You stroady, my her, again it inspulcient to die.

ERWIETEN ELIZA:
Farder only  will men may they be trudge,
On Mieck; why then, that consul,
Where are lawly the life bloody.

LEONTES:
O, with there offor, then at himself, I pray
Rever, reyet with daughters to for can me,
And ibsuing, then the's auld towere, then,
Thir rewards wivis fleom my it,
good none will words be Edward well, your shame,
In hoot provent doth the days
Of whas you think husband of lord,
If that the say thank their equling of a corcend the rougness,
Grater would it be goody to with a