In [1]:
# Download the text
!wget -nc https://huggingface.co/datasets/aaru2330/Mahabharath/resolve/main/Mahabharata.txt

File ‘Mahabharata.txt’ already there; not retrieving.



In [2]:
# read the text file
with open('Mahabharata.txt', 'r', encoding='utf-8') as f:
    text = f.read()

In [3]:
print(f"The length of dataset in characters: {len(text)}")

The length of dataset in characters: 14921047


In [4]:
print(text[:1000])

The Complete Mahabharata in English
The Mahabharata
of
Krishna-Dwaipayana Vyasa

BOOK 1
ADI PARVA
Translated into English Prose from the Original Sanskrit Text by Kisari Mohan Ganguli [1883-1896]
Scanned at sacred-texts.com, 2003. Proofed at Distributed Proofing, Juliet Sutherland, Project Manager. Additional proofing
and formatting at sacred-texts.com, by J. B. Hare.
TRANSLATOR'S PREFACE
The object of a translator should ever be to hold the mirror upto his author. That being so, his chief duty is to represent so far as
practicable the manner in which his author's ideas have been expressed, retaining if possible at the sacrifice of idiom and taste
all the peculiarities of his author's imagery and of language as well. In regard to translations from the Sanskrit, nothing is easier
than to dish up Hindu ideas, so as to make them agreeable to English taste. But the endeavour of the present translator has been
to give in the following pages as literal a rendering as possible of the great wo

# Tokenzing the dataset

In [5]:
# Let observe the vocabulary
chars = sorted(list(set(text)))
vocab_size = len(chars)
print(''.join(chars))
print(vocab_size)


 !"#&'(),-.0123456789:;?ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]_`abcdefghijklmnopqrstuvwxyz—
84


In [6]:
# Mapping characters into integers for encoding/decoding
# character level tokenizers
stoi = { ch:i for i,ch in enumerate(chars)}
itos = { i:ch for i,ch in enumerate(chars)}
encode = lambda s: [stoi[c] for c in s] # encode: string --> list of integers
decode = lambda l: ''.join([itos[i] for i in l]) # decode: list of integers --> string

print(encode("hi arjun"))
print(decode(encode("hi arjun")))

[64, 65, 2, 57, 74, 66, 77, 70]
hi arjun


In [7]:
!pip install -q torch

In [8]:
# Encoding the entire text using pytorch
import torch
data = torch.tensor(encode(text), dtype=torch.long)
print(data.shape, data.type)
print(data[1000:]) # GPT view of the 1000 chars 

  cpu = _conversion_method_template(device=torch.device("cpu"))


torch.Size([14921047]) <built-in method type of Tensor object at 0x7b454070a300>
tensor([74, 67,  2,  ...,  0,  0,  1])


# Training the dataset

In [9]:
# Split the train/validation sets
n = int(0.8*len(data))
train_data = data[:n]
val_data = data[n:]

Block Size

In [10]:
# This is the training example length sent to the model 
block_size = 8
train_data[:block_size+1]

tensor([45, 64, 61,  2, 28, 71, 69, 72, 68])

In this single block of 9 characters, we have 8 individual examples packed.

In [11]:
x = train_data[:block_size]
y = train_data[1:block_size+1]
for t in range(block_size):
    context = x[:t+1]
    target = y[t]
    print(f"when the input is {context}, the target is: {target}")

when the input is tensor([45]), the target is: 64
when the input is tensor([45, 64]), the target is: 61
when the input is tensor([45, 64, 61]), the target is: 2
when the input is tensor([45, 64, 61,  2]), the target is: 28
when the input is tensor([45, 64, 61,  2, 28]), the target is: 71
when the input is tensor([45, 64, 61,  2, 28, 71]), the target is: 69
when the input is tensor([45, 64, 61,  2, 28, 71, 69]), the target is: 72
when the input is tensor([45, 64, 61,  2, 28, 71, 69, 72]), the target is: 68


Batch Sizes for GPU optimization

In [12]:
torch.manual_seed(2026)
batch_size = 4 # independent sequences processed parallely
block_size = 8 # Maximum context length of predicitons

def get_batch(split):
    # generates a small batch of data of inputs x and targets y
    data = train_data if split == 'train' else val_data
    ix = torch.randint(len(data) - block_size, (batch_size, ))
    x = torch.stack([data[i: i+block_size] for i in ix])
    y = torch.stack([data[i+1: i+block_size+1] for i in ix])
    
    return x, y

xb, yb = get_batch('train')
print("Inputs:")
print(xb.shape)
print(xb)
print("Targets:")
print(yb.shape)
print(yb)

print('----------')

for b in range(batch_size): # batch dimension
    for t in range(block_size): # time dimension
        context = xb[b, : t+1]
        target = yb[b, t]
        print(f"When the input is: {context.tolist()}, the target is: {target}")



Inputs:
torch.Size([4, 8])
tensor([[65, 70, 60, 77, 68, 63, 61,  2],
        [ 2, 77, 70, 76, 71,  2, 64, 65],
        [79, 57, 74, 74, 65, 71, 74, 75],
        [61, 60, 12,  2, 45, 64, 61,  2]])
Targets:
torch.Size([4, 8])
tensor([[70, 60, 77, 68, 63, 61,  2, 65],
        [77, 70, 76, 71,  2, 64, 65, 69],
        [57, 74, 74, 65, 71, 74, 75,  2],
        [60, 12,  2, 45, 64, 61,  2, 69]])
----------
When the input is: [65], the target is: 70
When the input is: [65, 70], the target is: 60
When the input is: [65, 70, 60], the target is: 77
When the input is: [65, 70, 60, 77], the target is: 68
When the input is: [65, 70, 60, 77, 68], the target is: 63
When the input is: [65, 70, 60, 77, 68, 63], the target is: 61
When the input is: [65, 70, 60, 77, 68, 63, 61], the target is: 2
When the input is: [65, 70, 60, 77, 68, 63, 61, 2], the target is: 65
When the input is: [2], the target is: 77
When the input is: [2, 77], the target is: 70
When the input is: [2, 77, 70], the target is: 76
When

In [13]:
print(xb) # Input to the transformer

tensor([[65, 70, 60, 77, 68, 63, 61,  2],
        [ 2, 77, 70, 76, 71,  2, 64, 65],
        [79, 57, 74, 74, 65, 71, 74, 75],
        [61, 60, 12,  2, 45, 64, 61,  2]])


# Let us start training

###  Defining the neural model

In [14]:
import torch
import torch.nn as nn
from torch.nn import functional as F

class BiGramLanguageModel(nn.Module):

    def __init__(self, vocab_size):
        super().__init__()
        # Each token directly reads off the logits of the next token from a lookup table
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)
    
    def forward(self, idx, targets=None):

        # idx and targets are both (B, T) tensor of integers
        # Basically we are passing the input idx to get the logits for next token prediction
        logits = self.token_embedding_table(idx) # (B, T, C) = (batch_size, time = block_size, channel = vocab_size)

        if targets is None: # If no targets provided, we are in generation mode
            loss = None
        else:
            B, T, C = logits.shape
            # Reshape the logits and targets to calculate the loss as CrossEntropyLoss expects (N, C) and (N,) shape respectively
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss
    
    def generate(self, idx, max_new_tokens):
        # idx is (B, T) array of indices in the current context
        for _ in range(max_new_tokens):
            # get the predictions
            logits, _ = self(idx)
            
            # focus only on the last time step
            logits = logits[:, -1, :] # (B, C) # last element in time dimension

            # apply softmax to get probabilities
            probs = F.softmax(logits, dim=-1) # (B, C)

            # sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1) # (B, 1) # single prediciton in each batch

            # append sampled index to the running sequence
            idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)
        
        return idx

m = BiGramLanguageModel(vocab_size)
logits, loss = m(xb, yb)
print(logits.shape)
print(loss)

idx = torch.zeros((1, 1), dtype=torch.long)
print(decode(m.generate(idx, max_new_tokens=100)[0].tolist())) # [0] to get the first batch element

torch.Size([32, 84])
tensor(4.8717, grad_fn=<NllLossBackward0>)


m`D,h9.i_Tdxa5aC3Bk0eE]3eamQIFrSqZQq`]qtUqSgEokos!CnT0Mom`#,e?zYV#KpL`fF(OFclN-Z"59`4Z\GX,,qs,ds]


### Create optimizer

In [15]:
# Create a pytorch optimizer and train the model
optimizer = torch.optim.AdamW(m.parameters(), lr=1e-3)

In [16]:
batch_size = 32
for step in range(10000):
    # sample a batch of data
    xb, yb = get_batch('train')

    # evaluate the loss
    logits, loss = m(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

    if step % 500 == 0:
        print(f"Loss at step {step}: {loss.item()}")

Loss at step 0: 4.889804840087891
Loss at step 500: 4.363897323608398
Loss at step 1000: 3.8658130168914795
Loss at step 1500: 3.575252056121826
Loss at step 2000: 3.1285407543182373
Loss at step 2500: 3.0089097023010254
Loss at step 3000: 2.8657565116882324
Loss at step 3500: 2.661139488220215
Loss at step 4000: 2.683713912963867
Loss at step 4500: 2.607184410095215
Loss at step 5000: 2.4870002269744873
Loss at step 5500: 2.4521849155426025
Loss at step 6000: 2.5520248413085938
Loss at step 6500: 2.455198049545288
Loss at step 7000: 2.404175281524658
Loss at step 7500: 2.4161956310272217
Loss at step 8000: 2.3851940631866455
Loss at step 8500: 2.350325345993042
Loss at step 9000: 2.4114227294921875
Loss at step 9500: 2.56941294670105


In [17]:
idx = torch.zeros((1, 1), dtype=torch.long)
print(decode(m.generate(idx, max_new_tokens=500)[0].tolist())) # [


surof. stioffys I Drtherot wifof wiomeathrsahangoldglan? tthecovatherf. Rale hatr frbuly Kavas."
b6.Federnd-wh s witesawhes ble o is, ald ara, a, s, otmarsecucind. f
be, NqSEComemice by
F"in oupon fowild by-inge.em wibs l ctrthofrende. Saulisscceenthen s ispestheees tid, whmpred ondf wazlllis Thiots Whilorive, r LVites d kend o!"
ine horel-dhaind t\543
K2EGaelle Aplourse. irto o patof
withitond sonin wrn s tr s, tetoleo pr ingn acean by h med pr his o gsshanennd s bonvaifo malgronthean this ty o


# Self-Attention Block
The trick is matrix multiplication

In [18]:
torch.manual_seed(2026)
B, T, C = 4, 8, 2
x = torch.randn(B, T, C)
x.shape

torch.Size([4, 8, 2])

In [19]:
# x[b, t] = mean_{i<=t} x[b, i]

xbow = torch.zeros((B, T, C))
for b in range(B):
    for t in range(T):
        xprev = x[b, :t+1] # (t, C) includes the t-th element
        xbow[b, t] = torch.mean(xprev, dim=0)

In [20]:
x[0]

tensor([[-0.1839,  0.7296],
        [ 0.6242,  0.1225],
        [-0.0314, -0.7410],
        [-1.3574, -0.0220],
        [-0.5450,  0.2193],
        [-1.4955,  0.3433],
        [-0.1916, -0.9030],
        [ 0.9982,  0.0361]])

In [21]:
xbow[0]

tensor([[-0.1839,  0.7296],
        [ 0.2201,  0.4261],
        [ 0.1363,  0.0370],
        [-0.2371,  0.0223],
        [-0.2987,  0.0617],
        [-0.4982,  0.1086],
        [-0.4544, -0.0359],
        [-0.2728, -0.0269]])

In [22]:
# version 2
wei = torch.tril(torch.ones(T, T))
wei = wei / wei.sum(dim=1, keepdim=True)
xbow2 = wei @ x # (#B, T, T) @ (B, T, C) --> (B, T, C)
torch.allclose(xbow, xbow2) 

True

 > Note: Even though wei is (T, T) and x is (B, T, C), python will transform (T, T) --> (B, T, T). Batch matrix multiplication was done in a weighted fashion.

In [23]:
xbow[0], xbow2[0] # Identical batches

(tensor([[-0.1839,  0.7296],
         [ 0.2201,  0.4261],
         [ 0.1363,  0.0370],
         [-0.2371,  0.0223],
         [-0.2987,  0.0617],
         [-0.4982,  0.1086],
         [-0.4544, -0.0359],
         [-0.2728, -0.0269]]),
 tensor([[-0.1839,  0.7296],
         [ 0.2201,  0.4261],
         [ 0.1363,  0.0370],
         [-0.2371,  0.0223],
         [-0.2987,  0.0617],
         [-0.4982,  0.1086],
         [-0.4544, -0.0359],
         [-0.2728, -0.0269]]))

> Using softmax, we do weighted aggregation of lower triangular matrix

In [24]:
# version 3
tril = torch.tril(torch.ones(T, T))
wei = torch.zeros((T, T))
wei = wei.masked_fill(tril == 0, float('-inf'))
wei = F.softmax(wei, dim=-1)
xbow3 = wei @ x # (#B, T, T) @ (B, T, C) --> (B, T, C)

torch.allclose(xbow, xbow3) 

True

In [25]:
xbow3[0]

tensor([[-0.1839,  0.7296],
        [ 0.2201,  0.4261],
        [ 0.1363,  0.0370],
        [-0.2371,  0.0223],
        [-0.2987,  0.0617],
        [-0.4982,  0.1086],
        [-0.4544, -0.0359],
        [-0.2728, -0.0269]])

Self Attention

Every single token will emit two vectors: key and query vector
1. query: what am i looking for?
2. key: what do i contain?

In [26]:
# version 4: self-attention

torch.manual_seed(2026)
B, T, C = 4, 8, 32 # batch size, time steps, channels
x = torch.randn(B, T, C) # private information to an individual token

# single head performer self-attention
head_size = 16
key = nn.Linear(C, head_size, bias=False)
query = nn.Linear(C, head_size, bias=False)
value = nn.Linear(C, head_size, bias=False)
k = key(x)   # (B, T, 16)
q = query(x) # (B, T, 16)

# No communication across batches
# compute attention scores
wei = q @ k.transpose(-2, -1) # (B, T, 16) @ (B, 16, T) --> (B, T, T)

tril = torch.tril(torch.ones(T, T))
# wei = torch.zeros((T, T))
wei = wei.masked_fill(tril == 0, float('-inf'))
wei = F.softmax(wei, dim=-1)

v = value(x) # (B, T, 16) # value communicated for this x
out = wei @ v # (#B, T, T) @ (B, T, 16) --> (B, T, 16)

out.shape

torch.Size([4, 8, 16])

1. Self-attention: key, query, value come from same source --> x
2. Cross-attention: queries can be from x, key, value can come from encoder block (not from x). Separate source of information for K, Q, V

In encoder-style models, this masking step (`wei = wei.masked_fill(tril == 0, float('-inf'))`) is not used, allowing every token to attend to all other tokens in the sequence.
This enables bidirectional context, where each token representation is computed using both past and future tokens.

In [27]:
wei[0]

tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.8011, 0.1989, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.6295, 0.3089, 0.0616, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.4263, 0.4093, 0.0583, 0.1061, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.1675, 0.0691, 0.3941, 0.0677, 0.3017, 0.0000, 0.0000, 0.0000],
        [0.1371, 0.1077, 0.0833, 0.1481, 0.2006, 0.3233, 0.0000, 0.0000],
        [0.0495, 0.0313, 0.0128, 0.0485, 0.0687, 0.5069, 0.2822, 0.0000],
        [0.2275, 0.1231, 0.0986, 0.2681, 0.0625, 0.0149, 0.0845, 0.1209]],
       grad_fn=<SelectBackward0>)

### Scaled Dot-Product Attention
$$
\text{Attention}(Q, K, V)
= \text{softmax}\left(\frac{QK^\top}{\sqrt{d_k}}\right)V
$$


d_k is the head size.

In [42]:
k = torch.randn(B, T, head_size)
q = torch.randn(B, T, head_size)
wei = q @ k.transpose(-2, -1) # * head_size**-0.5

In [43]:
print(f"Variance of k                 : {k.var().item():.4f}")
print(f"Variance of q                 : {q.var().item():.4f}")
print(f"Variance of wei               : {wei.var().item():.4f}")
print(f"Variance of wei (adjusted)    : {(wei * head_size**-0.5).var().item():.4f}")


Variance of k                 : 1.0863
Variance of q                 : 0.9044
Variance of wei               : 15.6958
Variance of wei (adjusted)    : 0.9810


This wei is fed into softmax, so it is important that wei is fairly diffused. 
If wei has larger variance, then softmax can converge to One-Hot vectors.

In [59]:
torch.softmax(wei[0][0], dim=-1)

tensor([7.5937e-04, 3.1476e-05, 1.4650e-04, 3.0975e-05, 4.5887e-04, 9.9828e-01,
        1.7238e-04, 1.2519e-04])

In [60]:
torch.softmax(wei[0][0] / head_size**-0.5, dim=-1)

tensor([3.3482e-13, 9.8840e-19, 4.6382e-16, 9.2687e-19, 4.4644e-14, 1.0000e+00,
        8.8904e-16, 2.4736e-16])

### Two ideas help in optimizing learning in deep learning networks
1. Skip-connections (residuals)
2. LayerNorm (similar to BatchMore)

In [61]:
class LayerNorm1d: 

  def __init__(self, dim, eps=1e-5, momentum=0.1):
    self.eps = eps
    self.gamma = torch.ones(dim)
    self.beta = torch.zeros(dim)

  def __call__(self, x):
    # calculate the forward pass
    xmean = x.mean(1, keepdim=True) # batch mean
    xvar = x.var(1, keepdim=True) # batch variance
    xhat = (x - xmean) / torch.sqrt(xvar + self.eps) # normalize to unit variance
    self.out = self.gamma * xhat + self.beta
    return self.out

  def parameters(self):
    return [self.gamma, self.beta]

torch.manual_seed(1337)
module = LayerNorm1d(100)
x = torch.randn(32, 100) # batch size 32 of 100-dimensional vectors
x = module(x)
x.shape

torch.Size([32, 100])

In [63]:
x[:,0].mean(), x[:,0].std() # mean,std of one feature across all batch inputs

(tensor(0.1469), tensor(0.8803))

In [64]:
x[0,:].mean(), x[0,:].std() # mean,std of a single input from the batch, of its features

(tensor(-9.5367e-09), tensor(1.0000))