## GPT FROM SCRATCH

### Dependencies

In [63]:
import torch
import torch.nn as nn
from torch.nn import functional as F
import datasets
import s3fs

### Hyper-parameters

In [64]:
batch_size = 16 
block_size = 32
max_iters = 5000
eval_interval = 500
learning_rate = 3e-4
device = 'cuda' if torch.cuda.is_available() else 'cpu'
eval_iters = 200
n_embedding = 384
n_heads = 6
n_layers = 6
dropout = 0.2


### Data from S3 bucket

In [123]:
# Create an S3 filesystem object
fs = s3fs.S3FileSystem()

# Specify the S3 path to your text file
s3_path = 's3://gpt-training-data-text-file-bucket/input.txt'

# Read the text file directly
with fs.open(s3_path, 'r', encoding='utf-8') as f:
    text = f.read()

In [124]:
print(text[:1000])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.

All:
We know't, we know't.

First Citizen:
Let us kill him, and we'll have corn at our own price.
Is't a verdict?

All:
No more talking on't; let it be done: away, away!

Second Citizen:
One word, good citizens.

First Citizen:
We are accounted poor citizens, the patricians good.
What authority surfeits on would relieve us: if they
would yield us but the superfluity, while it were
wholesome, we might guess they relieved us humanely;
but they think we are too dear: the leanness that
afflicts us, the object of our misery, is as an
inventory to particularise their abundance; our
sufferance is a gain to them Let us revenge this with
our pikes, ere we become rakes: for the gods know I
speak this in hunger for bread, not in thirst for revenge.



### Train and Validation data split

In [42]:
# Train and test splits
data = torch.tensor(encode(text), dtype=torch.long)
n = int(0.9*len(data)) # first 90% will be train, rest val
train_data = data[:n]
val_data = data[n:]

### Encoding and Decoding

In [73]:
charecters = sorted(list(set(text)))
vocab_size = len(charecters)

# characters to integers mapping
ctoi = {ch:i for i,ch in enumerate(charecters)}

# integers to characters  map
itoc = {i:ch for i,ch in enumerate(charecters)} 

# encoding and decoding
encode = lambda s: [ctoi[c] for c in s]
decode = lambda I: ''.join([itoc[i] for i in I])


### Data loading

In [44]:
torch.manual_seed(1337)

def get_batch_data(Data):
    """
        creating the input and target batchs 
    """
    
    if Data == 'train':
        data = train_data
    else:
        data = val_data
        
        
    ix = torch.randint(len(data)-block_size, (batch_size,))
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
    x,y = x.to(device), y.to(device)
    
    return x,y


### Loss Estimation

In [54]:
@torch.no_grad() # this disables the gradient calculation
def estimate_loss():
    output = {}
    model.eval() # starting the model evaluation step
    
    for split in ['train','val']:
        losses = torch.zeros(eval_iters)
        for j in range(eval_iters):
            X,Y = get_batch_data(split)
            logits, loss = model(X, Y)
            losses[j] = loss.item()
        output[split] = losses.mean()
        
    model.train() # continuing the training
    
    return output

### Single-Head Self Attention

In [55]:
class Head(nn.Module):
    """
        Single head self attention
    """
    def __init__(self, head_size):
        super().__init__()
        
        # defining the learned weighted metrics
        self.key = nn.Linear(n_embedding, head_size, bias= False)
        self.value = nn.Linear(n_embedding, head_size, bias=False)
        self.query = nn.Linear(n_embedding, head_size, bias = False)
        
        self.register_buffer('tril', torch.tril(torch.ones(block_size,block_size)))
        
        self.dropout = nn.Dropout(dropout)
        
        
    def forward(self, x):
        """
            Input : (B, T, C) 
            Output : (B, T, head_size)
        """
        B,T,C = x.shape
        k = self.key(x)
        q = self.query(x)
        
        weights = q @ k.transpose(-2,-1)*k.shape[-1]**-0.5
        weights = weights.masked_fill(self.tril[:T,:T]==0, float('-inf'))
        weights = F.softmax(weights, dim=-1)
        weights = self.dropout(weights)
        
        v = self.value(x)
        output = weights @ v
        return output

### Multi-Head Self-Attention

In [56]:
class MultiHeadSelfAttention(nn.Module):
    """
        This creates multiple head of self-attention using Head class
    """
    def __init__(self, num_heads, head_size):
        super().__init__()
        self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
        
        # Transformes concatenated output into fixed embedding size 
        self.proj = nn.Linear(head_size*num_heads, n_embedding)      
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, x):
        output = torch.cat([head(x) for head in self.heads], dim=-1)
        output = self.dropout(self.proj(output))
        
        return output 

### Feed-Forward Neural Network

In [57]:
class FeedForwardNN(nn.Module):
    """
        Introduce non-linearity and richer representations.
        Transform dimensions to facilitate downstream tasks.
        Act as feature extractors to refine representations.
    """
    def __init__(self, n_embedding):
        super().__init__()
        self.ff_net = nn.Sequential(
            nn.Linear(n_embedding, 4*n_embedding),
            nn.ReLU(),
            nn.Linear(4*n_embedding, n_embedding),
            nn.Dropout(dropout),
        )
        
    def forward(self, x):
        return self.ff_net(x)

### Attention and FF Block

In [58]:
class Block(nn.Module):
    """
        One Masked Multi-attention and Feed Forward NN included in this
    """
    def __init__(self, n_embedding, n_heads):
        super().__init__()
        
        head_size = n_embedding//n_heads
        self.S_A = MultiHeadSelfAttention(n_heads, head_size)
        self.ffnn = FeedForwardNN(n_embedding)
        
        # layer normalization
        self.ln1 = nn.LayerNorm(n_embedding)
        self.ln2 = nn.LayerNorm(n_embedding)
        
    def forward(self, x):
        
        x = x + self.S_A(self.ln1(x))
        x = x + self.ffnn(self.ln2(x))
        
        return x

### GPT Model 

In [66]:
class GPTLangiageModel(nn.Module):
    """
        implementing a decoder only model. Based on 
    """
    
    def __init__(self):
        super().__init__()
        
        self.token_embedding_table = nn.Embedding(vocab_size, n_embedding)
        self.position_embedding_table = nn.Embedding(block_size, n_embedding)
        self.blocks = nn.Sequential(*[Block(n_embedding, n_heads = n_heads) for _ in range(n_layers)])
        self.ln_final = nn.LayerNorm(n_embedding)
        self.lm_head = nn.Linear(n_embedding, vocab_size)
        
        self.apply(self._init_weights)
        
    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
                
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
            
            
    def forward(self, idx, targets=None):
        B,T = idx.shape
        
        token_embd = self.token_embedding_table(idx)
        pos_embd = self.position_embedding_table(torch.arange(T, device=device))
        x = token_embd + pos_embd
        x = self.blocks(x)
        x = self.ln_final(x)
        logits = self.lm_head(x)
        
        if targets is None:
            loss = None
            
        else:
            B,T,C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)
            
        return logits, loss
    
    def generate(self, idx, max_new_tokens):
        for _ in range(max_new_tokens):
            idx_cond = idx[:, -block_size:]
            logits, loss = self(idx_cond)
            logits = logits[:,-1,:]
            probs = F.softmax(logits, dim= -1)
            
            idx_next = torch.multinomial(probs, num_samples=1)
            
            idx = torch.cat((idx, idx_next), dim= 1)
            
        return idx

### Training - English

In [97]:
model = GPTLangiageModel()
m = model.to(device)

parameters = sum(p.numel() for p in m.parameters())/1e6
print('no of model parameters :', parameters, 'M parameters')

no of model parameters : 10.702913 M parameters


In [84]:
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

In [70]:
for iter in range(max_iters):
    
    if iter% eval_interval == 0 or iter == max_iters - 1:
        print("-------------")
        losses = estimate_loss()
        print(f"step {iter}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")
        
    xb, yb = get_batch_data('train')
    
    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()
    

-------------
step 0: train loss 4.2084, val loss 4.2108
-------------
step 500: train loss 2.1915, val loss 2.2019
-------------
step 1000: train loss 2.0039, val loss 2.0909
-------------
step 1500: train loss 1.9052, val loss 1.9984
-------------
step 2000: train loss 1.8381, val loss 1.9632
-------------
step 2500: train loss 1.7756, val loss 1.9147
-------------
step 3000: train loss 1.7193, val loss 1.8746
-------------
step 3500: train loss 1.6650, val loss 1.8287
-------------
step 4000: train loss 1.6545, val loss 1.8153
-------------
step 4500: train loss 1.6299, val loss 1.7999
-------------
step 4999: train loss 1.6032, val loss 1.7887


### Text Generation

In [74]:
context = torch.zeros((1,1), dtype = torch.long, device=device)
print(decode(m.generate(context, max_new_tokens=1000)[0].tolist()))




Like me:
Unlendong prosenave heavence of goose!
But disgn the trum mount, to RiChalt that venteel; give.
Give my melt to Blear sting; and that my but plets,
In thinking made am our head you, the packin drim thee,
Bream comest wright MoreuStremplain sens
And sight up somemen him tworl hands.

KING RICHARD I:
Pehought you will be bein him on-famp, this and to ut
But for at plaid; Good at blanish
As tothing him truff me doats, must thee!
And aliver-thou arabow.

SARNCE:
And shrave pennort; then  greasen and make young thing: he should
Spirist upring aad the true. I why war man in dear, like no
deservisTouch thou too m ciky pland that's all
herd hearl, I pray thy omplice flses to marrion your harb,
And first no, is stukesse fear:
Capest you master time, unwerch thought it
Come alintime frong touch thever brong goodngciust. Romeo Randur's all men deams;'
That I live my lorancely Rome.

KING RICH

PAURENCE:

CLAURENCEN:
I did postirens, uppeM my briem, and show the compunty Cwarenciess him

## Building A GPT Model using Tamil bharathiyaar poems

### Model

In [98]:
class GPTLangiageModel_Tamil(nn.Module):
    """
        implementing a decoder only model. 
    """
    
    def __init__(self):
        super().__init__()
        
        self.token_embedding_table = nn.Embedding(vocab_size, n_embedding)
        self.position_embedding_table = nn.Embedding(block_size, n_embedding)
        self.blocks = nn.Sequential(*[Block(n_embedding, n_heads = n_heads) for _ in range(n_layers)])
        self.ln_final = nn.LayerNorm(n_embedding)
        self.lm_head = nn.Linear(n_embedding, vocab_size)
        
        self.apply(self._init_weights)
        
    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
                
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
            
            
    def forward(self, idx, targets=None):
        B,T = idx.shape
        
        token_embd = self.token_embedding_table(idx)
        pos_embd = self.position_embedding_table(torch.arange(T, device=device))
        x = token_embd + pos_embd
        x = self.blocks(x)
        x = self.ln_final(x)
        logits = self.lm_head(x)
        
        if targets is None:
            loss = None
            
        else:
            B,T,C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)
            
        return logits, loss
    
    def generate(self, idx, max_new_tokens):
        for _ in range(max_new_tokens):
            idx_cond = idx[:, -block_size:]
            logits, loss = self(idx_cond)
            logits = logits[:,-1,:]
            probs = F.softmax(logits, dim= -1)
            
            idx_next = torch.multinomial(probs, num_samples=1)
            
            idx = torch.cat((idx, idx_next), dim= 1)
            
        return idx

### Data

In [106]:
# Create an S3 filesystem object
fs = s3fs.S3FileSystem()

# Specify the S3 path to your text file
s3_path = 's3://gpt-training-data-text-file-bucket/bharathiyaar_poems.txt'

# Read the text file directly
with fs.open(s3_path, 'r', encoding='utf-8') as f:
    text = f.read()

In [114]:
print(text[300000:450000])

ே 
ஓரழகாக விளங்கிடு முள்ளத்தை 
யொப்பதோர் செல்வமுண்டோ ? 




பார்க்கு மரங்களெல்லாம் நந்தலாலா ! - நின்றன் 
பச்சை நிறம் தோன்றுதையே நந்தலாலா ! " 




பாரதியார் சமயம் 
ஐம்பெரும் பூதங்களையும் அசரசர பேதமான யாவையும் 
வகுத்து மறையாதி நூலையும் வகுத்த பரம்பொருள் சைவமுதலான 
அளவில் சமயங்களையும் வகுத்தனர் . மோனஉருவாய் நின்று சமயம் 
களுக்கெல்லாம் மேற்பட்ட சமரச சமயத்தையும் வகுத்தான் என் 




பாரதியார் சமயம் 




43 




பது சான்றோர் கண்ட மெய்யுணர்வாகும் . நம் கவிஞர் பெருமான் 
ஒருவாறு வேதமார்க்கத்தின்பாற்பட்டு அறநெறி தழைத்தோங்கும் 
உயிர்ப்பண்பு உடையவரெனத் தெரிய வருகின்றது . எனினும் , அவ 
ரின் பாடல்களைத் துருவி ஆராயுமிடத்து அறச்சமயமே சமயம் மற் 
றெல்லாம் புறச்சமயங்களே என்று பறைசாற்றுகின்றவராகவும் தெரி 
கின்றது . தொல்காப்பியனார் , வள்ளுவனார் , இளங்கோவடிகள் 
போன்ற நல்லிசைப் புலவர்களின் சமயக் கொள்கைகள் இன்ன 
தென்று அறுதியிட்டுக் கூறமுடியாத நிலைமையில் இன்னும் இருக் 
கின்றனவென்று 


கூறுவது மிகையாகாது . தமிழ்மறை தந்த தவ 
ஞானச் செல்வராகிய வள்ளுவனாரின் குறட்பாக்களில் சிற்சில இடங் 
களில் , தாமரைக் கண்ணான் உலகு எ

### Training

In [119]:
model_tamil = GPTLangiageModel_Tamil()
m_tamil = model_tamil.to(device)

parameters = sum(p.numel() for p in m_tamil.parameters())/1e6
print('no of model parameters :', parameters, 'M parameters')

no of model parameters : 10.702913 M parameters


In [117]:
optimizer = torch.optim.AdamW(model_tamil.parameters(), lr=learning_rate)
for iter in range(max_iters):
    
    if iter% eval_interval == 0 or iter == max_iters - 1:
        print("-------------")
        losses = estimate_loss()
        print(f"step {iter}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")
        
    xb, yb = get_batch_data('train')
    
    logits, loss = model_tamil(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()
    

-------------
step 0: train loss 4.2533, val loss 4.2556
-------------
step 500: train loss 4.2541, val loss 4.2541
-------------
step 1000: train loss 4.2555, val loss 4.2534
-------------
step 1500: train loss 4.2551, val loss 4.2530
-------------
step 2000: train loss 4.2554, val loss 4.2561
-------------
step 2500: train loss 4.2556, val loss 4.2556
-------------
step 3000: train loss 4.2534, val loss 4.2554
-------------
step 3500: train loss 4.2539, val loss 4.2527
-------------
step 4000: train loss 4.2531, val loss 4.2556
-------------
step 4500: train loss 4.2543, val loss 4.2550
-------------
step 4999: train loss 4.2544, val loss 4.2554


### Text generation

In [121]:
context = torch.zeros((1,1), dtype = torch.long, device=device)
print(decode(m_tamil.generate(context, max_new_tokens=1000)[0].tolist()))