# ThyGPT


A char level transformer model


## Help


## Data loading


In [2]:
# downloading dataset
!curl -o input.txt https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt


  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 1089k  100 1089k    0     0  1806k      0 --:--:-- --:--:-- --:--:-- 1806k


In [3]:
# Loading dataset
with open("input.txt", "r", encoding="utf-8") as f:
    text = f.read()


In [4]:
print(f"Length of dataset: {len(text)}")


Length of dataset: 1115394


In [5]:
print(text[:1000])


First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.

All:
We know't, we know't.

First Citizen:
Let us kill him, and we'll have corn at our own price.
Is't a verdict?

All:
No more talking on't; let it be done: away, away!

Second Citizen:
One word, good citizens.

First Citizen:
We are accounted poor citizens, the patricians good.
What authority surfeits on would relieve us: if they
would yield us but the superfluity, while it were
wholesome, we might guess they relieved us humanely;
but they think we are too dear: the leanness that
afflicts us, the object of our misery, is as an
inventory to particularise their abundance; our
sufferance is a gain to them Let us revenge this with
our pikes, ere we become rakes: for the gods know I
speak this in hunger for bread, not in thirst for revenge.



In [6]:
# creating the vocab
chars = sorted(set("".join(text)))
vocab = len(chars)
print(vocab)
print("".join(chars))


65

 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz


In [7]:
# Creating a maping from chars to integer
stoi = {s: i for i, s in enumerate(chars)}
itos = {v: k for k, v in stoi.items()}

# Encoder decoder
encode = lambda s: [
    stoi[c] for c in s
]  # takes a string and convert it to a list of number
decode = lambda l: "".join(
    itos[i] for i in l
)  # takes a list of number and convert it to a string
print(encode("hi there"))
print(decode(encode("hi there")))


[46, 47, 1, 58, 46, 43, 56, 43]
hi there


In [8]:
# Encoding the compelete dataset
import torch

data = torch.tensor(encode(text))
print(data.shape, data.dtype)
print(data[:100])


torch.Size([1115394]) torch.int64
tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 14, 43, 44,
        53, 56, 43,  1, 61, 43,  1, 54, 56, 53, 41, 43, 43, 42,  1, 39, 52, 63,
         1, 44, 59, 56, 58, 46, 43, 56,  6,  1, 46, 43, 39, 56,  1, 51, 43,  1,
        57, 54, 43, 39, 49,  8,  0,  0, 13, 50, 50, 10,  0, 31, 54, 43, 39, 49,
         6,  1, 57, 54, 43, 39, 49,  8,  0,  0, 18, 47, 56, 57, 58,  1, 15, 47,
        58, 47, 64, 43, 52, 10,  0, 37, 53, 59])


In [9]:
# spliting dataset into train/val
split_index = int(0.9 * len(data))
train = data[:split_index]
val = data[split_index:]


In [10]:
print(train.shape, val.shape)


torch.Size([1003854]) torch.Size([111540])


In [11]:
block_size = 8  # size of chunk's
train[: block_size + 1]
# here there is quite a lot of information packed together
# 18 is followed by 47 , 18, 47 is followed by 56 .....


tensor([18, 47, 56, 57, 58,  1, 15, 47, 58])

In [12]:
X = train[:block_size]
y = train[1 : block_size + 1]

for t in range(block_size):
    context = X[: t + 1]
    target = y[t]
    print(f"When input tensor is {context} then target tensor is: {target}")


When input tensor is tensor([18]) then target tensor is: 47
When input tensor is tensor([18, 47]) then target tensor is: 56
When input tensor is tensor([18, 47, 56]) then target tensor is: 57
When input tensor is tensor([18, 47, 56, 57]) then target tensor is: 58
When input tensor is tensor([18, 47, 56, 57, 58]) then target tensor is: 1
When input tensor is tensor([18, 47, 56, 57, 58,  1]) then target tensor is: 15
When input tensor is tensor([18, 47, 56, 57, 58,  1, 15]) then target tensor is: 47
When input tensor is tensor([18, 47, 56, 57, 58,  1, 15, 47]) then target tensor is: 58


In [13]:
# creating a function to give data in batch
torch.manual_seed(1337)


def get_batch(split: str, batch_size: int = 4, block_size: int = 8):
    data = train if split == "train" else val
    ix = torch.randint(0, len(data) - block_size - 1, (batch_size,))
    X = torch.stack([data[i : i + block_size] for i in ix])
    y = torch.stack([data[i + 1 : i + block_size + 1] for i in ix])
    return X, y


In [14]:
BATCH_SIZE = 4
BLOCK_SIZE = 8
SEED = 1337


In [15]:
xb, yb = get_batch(split="train", batch_size=BATCH_SIZE, block_size=BLOCK_SIZE)
print("________")
print(f"Our input features: {xb.shape}\n{xb}")  # this is our input to the transformers
print(f"Target Values: {yb.shape}\n{yb}")


________
Our input features: torch.Size([4, 8])
tensor([[53, 59,  6,  1, 58, 56, 47, 40],
        [49, 43, 43, 54,  1, 47, 58,  1],
        [13, 52, 45, 43, 50, 53,  8,  0],
        [ 1, 39,  1, 46, 53, 59, 57, 43]])
Target Values: torch.Size([4, 8])
tensor([[59,  6,  1, 58, 56, 47, 40, 59],
        [43, 43, 54,  1, 47, 58,  1, 58],
        [52, 45, 43, 50, 53,  8,  0, 26],
        [39,  1, 46, 53, 59, 57, 43,  0]])


In [16]:
# Creating the bigram language model

import torch
from torch import nn
from torch.nn import functional as F


class BiagramLanguageModel(nn.Module):
    def __init__(self, vocab_size):
        super().__init__()
        self.token_embedding = nn.Embedding(vocab_size, vocab_size)

    def forward(self, x, target=None):
        logits = self.token_embedding(x)  # b, T, C [Batch, Token, Channel]
        if target is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B * T, C)
            target = target.view(B * T)
            loss = F.cross_entropy(logits, target)

        return logits, loss

    # When we are doing the forward pass we are getting the shape as [batch_size, token_size, channels]
    # because what we are doing is getting the values from the lookup table of size[65, 65]
    # For each token, you’re predicting logits over 65 possible tokens.
    # so for ex if our token value is 34 then we will get the complete values along the row at index 34

    def generate(self, idx, max_length):
        for _ in range(max_length):
            # pass the index to the model to get the logits
            # the shape of the logits will be [batch_size, Token_size, Channels]
            # for the first time the shape of idx will be [1,1] which when passed through the
            # forward pass will gives us [1,1,65]

            logits, loss = self(idx)

            # Now we will reshape the logits for calculating the probabilities
            # now the shape of logits will be [batch_size * token_size, channels]

            logits = logits[:, -1, :]
            prob = torch.softmax(logits, dim=-1)  # B*T, C
            new_idx = torch.multinomial(input=prob, num_samples=1)  # B, 1
            idx = torch.cat((idx, new_idx), dim=1)  # B, T + 1
        return idx


torch.manual_seed(SEED)
m = BiagramLanguageModel(vocab_size=vocab)
logits, loss = m(xb, yb)
print(logits.shape)

new_preds = m.generate(idx=torch.zeros((1, 1), dtype=torch.long), max_length=100)
print(new_preds.shape)
print(decode(new_preds[0].tolist()))


torch.Size([32, 65])
torch.Size([1, 101])

SKIcLT;AcELMoTbvZv C?nq-QE33:CJqkOKH-q;:la!oiywkHjgChzbQ?u!3bLIgwevmyFJGUGp
wnYWmnxKWWev-tDqXErVKLgJ


In [17]:
optimizer = torch.optim.AdamW(params=m.parameters(), lr=1e-3)


In [18]:
# Training the model
BATCH_SIZE = 32
torch.manual_seed(SEED)
for step in range(10000):
    m.train()
    xb, yb = get_batch("train", BATCH_SIZE, block_size)

    logits, loss = m(xb, yb)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

print(loss.item())


2.4774329662323


In [19]:
print(
    decode(
        m.generate(idx=torch.zeros((1, 1), dtype=torch.long), max_length=500)[
            0
        ].tolist()
    )
)

# All gi-brish



Ope llled ilok d avetiqu oulartouef this ain wircewe ore; my
We k woTiar'll wan, OFFon d y fthen ixpeyere we tus!
msetpecondall alll casove; t bre mit ar. thanouig!

T: at ldaysicome?
NTh haveo and ink! I sul Bed hernong! KRTI E:
Fot d mome garesth oust he ovee perrear hanorerin gew,
Gost;
MEMe thous mace ffean ck trboeds y hadowenarng pongoxisiche s the nd cke fowe fo ES:
S iss o:
DO,
Whas GHUS:
Fofours DOLAGofthiowit wit inenqu anary te selee enges ke tha in spe yor s 's lo st ICRKE:
's
Wheyer


> 💡 NOTE:  
> **When performing operations that involve the dim parameter (like sum), it’s crucial to understand how it affects the tensor dimensions.**  
> **Suppose we have a tensor of shape [2, 2, 2] — that is, 2 matrices, each of shape [2, 2].**  
> **• Summing along dim=0 means adding corresponding elements across the 2 matrices. The batch dimension is reduced, so the result has shape [2, 2].**  
> **• Summing along dim=1 means summing the rows within each matrix. Each matrix’s rows are combined, reducing the row dimension. The resulting shape is [2, 1, 2].**  
> **• Summing along dim=2 means summing the columns within each row of each matrix, reducing the column dimension. The result will have shape [2, 2, 1].**
>
> **In short, dim=n collapses that specific axis by applying the operation along it, reducing its size.**


## Self Attention Mathematical Trick


In [20]:
# Approach 1:
# The main objective is to ensure that the model only attends to **past** and **current** tokens — not future ones.
# For example, in the sequence [1, 2, 3, 4, 5, 6, 7, 8], when predicting token 3,
# the model should only have access to tokens 1 and 2 (not 4–8).

# One simple way to incorporate past information is by averaging the embeddings of previous tokens
# For example:
#   [1]         → [1]
#   [1, 2]      → [(1+2)/2]
#   [1, 2, 3]   → [(1+2+3)/3]
# This allows the model to have a context-aware representation at each position.

B, T, C = 4, 8, 2  # B: batch size, T: sequence length, C: embedding dimension

# Randomly initialize a batch of token embeddings (e.g., from an embedding layer)
bow = torch.randn((B, T, C))  # "bag-of-words" embeddings

# Create an empty tensor to store the averaged representations
xbow = torch.zeros_like(bow)

print("Original Embeddings (bow[0]):")
print(bow[0])

# For each batch and each time step, compute the mean of all previous (and current) embeddings
for b in range(B):  # Loop over each batch
    for t in range(T):  # Loop over each time step
        xprev = bow[b, : t + 1]  # Get all previous + current token embeddings
        xbow[b, t] = torch.mean(xprev, dim=0)  # Average them and assign to xbow

print("Context-aware Averaged Embeddings (xbow[0]):")
print(xbow[0])


Original Embeddings (bow[0]):
tensor([[ 0.7352, -0.9397],
        [ 0.5135,  0.5951],
        [-1.0797, -0.9238],
        [ 1.1126, -0.0741],
        [ 0.1068, -0.0124],
        [-0.6453, -1.7736],
        [ 1.0020,  0.4353],
        [-0.8329,  0.4699]])
Context-aware Averaged Embeddings (xbow[0]):
tensor([[ 0.7352, -0.9397],
        [ 0.6243, -0.1723],
        [ 0.0563, -0.4228],
        [ 0.3204, -0.3356],
        [ 0.2777, -0.2710],
        [ 0.1238, -0.5214],
        [ 0.2493, -0.3847],
        [ 0.1140, -0.2779]])


In [21]:
# Approach 2:
# Create a lower triangular matrix of shape [T, T]
# This ensures that each position only has access to itself and the tokens before it
wei = torch.tril(torch.ones((T, T)))

# Normalize each row so the sum equals 1
# This gives us a simple averaging mask, where past tokens are averaged equally
wei = wei / wei.sum(1, keepdim=True)

# Perform matrix multiplication with the bag-of-words (bow) embeddings
# This applies the weighted average of past token embeddings for each position
xbow_fast = wei @ bow

# Verify that the faster implementation produces the same result as the previous one
print(
    torch.allclose(xbow_fast, xbow)
)  # Should return True if both methods are equivalent


True


In [28]:
# Approach 3:
# In Approach 3 we will be using softmax and converting the 0 in the lower triangular matrix to inf reason being
# right now we are initializing the wei with zeros but what these values are in actual is they are sort of interaction
# strength and in actual they will have some values
# we don't want the future wei to interact with our current token

# Creating a lower triangular matrix of shape[T,T]
# This ensures us that each position has access to position before it not the future
trill = torch.tril(torch.ones((T, T)))

# Creating a wei[INteraction strength matrix] of zeros shape[T,T]
wei = torch.zeros((T, T))

# Masking the wei matrix similar to trill matrix and updating the values of zeros to -inf
# This tell us that absolutely in no case that the present position should interact with future position
wei = wei.masked_fill(trill == 0, float("-inf"))

# Doing softmax to normalize our values in wei
wei = torch.softmax(wei, dim=-1)

# Matrix multiplication to average the each position with its previous position so that
# each position has some level of info about the previous positions
wei = wei @ bow

# Checking
print(torch.allclose(wei, xbow))


True


In [29]:
from torch import nn


In [38]:
# Self Attention (single head)
torch.manual_seed(SEED)

# Batch_size, Time, Channel 
B, T, C = 4, 8, 32

# Input Tensor 
bow = torch.randn((B, T, C))                              # Shape: (4 8, 32) 4 matrixs of size 8(chars) represented by 32 numbers

# Single Head Size 
head_size = 16

# Linear layer having input features 32, and out features 16
key = nn.Linear(C, head_size, bias= False)             
query = nn.Linear(C, head_size, bias= False)
value = nn.Linear(C, head_size, bias=False)
# Passing the input through the linear layer 
k = key(bow)                                             # Shape: (4, 8, 16) 
q = query(bow)                                           # Shape: (4, 8, 16)
v = value(bow)
# Now the wei - which is our matrix that tell us how much one token is related to other is dependend on the input 
wei = k @ q.transpose(-2, -1)                            # Shape (4, 8, 16) @ (4, 16, 8) -> (4, 8, 8)
# The wei matrix now contains informations about how each token in the context is how strongly related to one another 


# Now we need to make sure that each position has access to previous token not the future 
# For this we will mask and normallize the wei 
trill = torch.tril(torch.ones(T, T))

wei = wei.masked_fill(trill == 0, float("-inf"))

# Normalizing using the softmax 
wei = torch.softmax(wei, dim=1)
out = wei @ v                                          # Shape (4, 8, 8) @ (4, 8, 32) -> (4, 8, 32)


