# nanoGPT Implementation: Detailed Explanation

## 1. Importing Required Libraries

In [5]:
import torch
import torch.nn as nn
from torch.nn import functional as F

torch: The core PyTorch library for deep learning operations.

torch.nn: Contains various neural network building blocks like layers, activation functions, and loss functions.

torch.nn.functional: Provides a functional API for layers and activation functions, often used in models without explicit layer objects.

## 2. Defining Hyperparameters

In [9]:
# --- Batching ---
batch_size = 16  # sequences processed in parallel per step
block_size = 32  # longest context (in tokens) the model may look at

# --- Optimisation schedule ---
max_iters = 5000  # number of training steps
eval_interval = 100  # evaluate every this many steps
learning_rate = 1e-3  # optimizer step size
eval_iters = 200  # batches averaged per loss estimate

# --- Hardware ---
# Prefer the GPU whenever PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# --- Model size ---
n_embd = 64  # width of every token representation
n_head = 4  # attention heads per Transformer block
n_layer = 4  # number of stacked Transformer blocks
dropout = 0.0  # dropout probability (0.0 switches dropout off)

These hyperparameters control the model's complexity, training efficiency, and optimization strategy.

The device variable ensures that the model runs on GPU if available, which speeds up training.

## 3. Setting Random Seed

In [None]:
# Seed PyTorch's global random number generator with a fixed value so that
# random draws made through torch (e.g. torch.randint) repeat from run to run.
torch.manual_seed(1337)


Fixes the random number generator's seed for reproducibility.

Ensures the same results across multiple runs.

## 4. Data Loading: Reading Dataset

In [None]:
# Read the entire corpus into memory as a single UTF-8 decoded string.
with open('input.txt', 'r', encoding='utf-8') as f:
    text = f.read()

Reads the input.txt file (a text dataset).

This dataset is used to train the language model.


## 5. Character-Level Tokenization

In [None]:
# Vocabulary: every distinct character of the corpus, in sorted order so that
# the character <-> index assignment is deterministic.
chars = sorted(set(text))
vocab_size = len(chars)

stoi = {ch: i for i, ch in enumerate(chars)}  # String-to-Index mapping
itos = dict(enumerate(chars))  # Index-to-String mapping


def encode(s):
    """Return the list of integer token ids for the characters of ``s``.

    Raises:
        KeyError: If ``s`` contains a character that is not in the vocabulary.
    """
    return [stoi[c] for c in s]


def decode(l):
    """Return the string spelled by the iterable of integer token ids ``l``.

    Raises:
        KeyError: If an id is not in the vocabulary.
    """
    return ''.join(itos[i] for i in l)

Extracts unique characters in the dataset, forming a vocabulary.

Creates mappings to convert characters to integers (stoi) and vice versa (itos).

Defines encode() and decode() functions to convert text to numerical tokens and back.

## 6. Splitting Dataset into Train & Validation Sets

In [None]:
# Tokenize the whole corpus once into a 1-D tensor of int64 token ids.
data = torch.tensor(encode(text), dtype=torch.long)
# Index of the first validation token: the leading 90% trains, the rest validates.
n = int(0.9 * data.size(0))
train_data, val_data = data[:n], data[n:]

The dataset is converted into a PyTorch tensor for efficient processing.

Splits data into 90% training and 10% validation to evaluate generalization.

## 7. Function to Generate Training Batches

In [None]:
def get_batch(split):
    """Draw one random batch of input/target windows from a data split.

    Args:
        split: 'train' selects the training data; any other value selects
            the validation data.

    Returns:
        A pair ``(x, y)`` of tensors on ``device``, each stacking
        ``batch_size`` windows of ``block_size`` tokens. ``y`` holds the same
        windows as ``x`` shifted one token forward.
    """
    if split == 'train':
        source = train_data
    else:
        source = val_data

    # One random start offset per sequence; the upper bound leaves room for
    # the target window, which extends one token past the input window.
    starts = torch.randint(len(source) - block_size, (batch_size,))

    inputs = []
    targets = []
    for start in starts:
        inputs.append(source[start:start + block_size])
        targets.append(source[start + 1:start + block_size + 1])

    return torch.stack(inputs).to(device), torch.stack(targets).to(device)

Randomly selects sequences of length block_size from the dataset.

Prepares:

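- Inputs (x): `batch_size` sequences, each made of `block_size` consecutive tokens.

- Targets (y): The same sequences shifted one position forward, so each target is the token that immediately follows the corresponding input token.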

Moves data to the specified device (CPU/GPU).

## 8. Loss Estimation Function

In [None]:
@torch.no_grad()
def estimate_loss():
    """Average the model's loss over ``eval_iters`` random batches per split.

    The model is put into evaluation mode for the measurement and switched
    back to training mode before returning. Gradients are not tracked.

    Returns:
        A dict with keys 'train' and 'val', each mapping to a scalar tensor
        holding the mean loss for that split.
    """
    model.eval()
    averages = {}
    for split in ('train', 'val'):
        batch_losses = []
        for _ in range(eval_iters):
            xb, yb = get_batch(split)
            _, batch_loss = model(xb, yb)
            batch_losses.append(batch_loss.item())
        averages[split] = torch.tensor(batch_losses).mean()
    model.train()
    return averages

Uses @torch.no_grad() to disable gradient calculation (saves memory).

Computes average loss for both training and validation sets over eval_iters batches.

Switches the model to evaluation mode (model.eval()) and back to training mode (model.train()).

## 9. Transformer Components

### (a) Single Attention Head

In [40]:
class Head(nn.Module):
    """One head of causal (masked) self-attention.

    Args:
        head_size: Output width of the key, query and value projections, and
            therefore of this head's output.
    """

    def __init__(self, head_size):
        super().__init__()
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)
        # Lower-triangular matrix of ones used as the causal mask. Registered
        # as a buffer: it follows the module across devices but is not a
        # trainable parameter.
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Apply masked self-attention.

        Args:
            x: Tensor of shape (B, T, n_embd) with T <= block_size.

        Returns:
            Tensor of shape (B, T, head_size).
        """
        T = x.shape[1]
        k = self.key(x)
        q = self.query(x)
        # Scaled dot-product affinities between every query and every key: (B, T, T).
        wei = q @ k.transpose(-2, -1) * k.shape[-1] ** -0.5
        # Hide future positions: after softmax they receive exactly zero weight.
        wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf'))
        wei = F.softmax(wei, dim=-1)
        wei = self.dropout(wei)
        # Weighted sum of the value vectors.
        return wei @ self.value(x)

Implements a single self-attention head.

key, query, value projections extract different representations of input embeddings.

tril ensures causality (prevents attending to future tokens).

Dropout prevents overfitting.

### (b) Multi-Head Attention

In [None]:
class MultiHeadAttention(nn.Module):
    """Several attention heads run side by side, then mixed by a linear layer.

    Args:
        num_heads: Number of independent ``Head`` modules.
        head_size: Output width of each head.
    """

    def __init__(self, num_heads, head_size):
        super().__init__()
        self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
        # The concatenated head outputs are num_heads * head_size wide; sizing
        # the projection from that (rather than assuming it equals n_embd)
        # keeps the layer valid when n_embd is not a multiple of num_heads.
        self.proj = nn.Linear(num_heads * head_size, n_embd)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Run every head on ``x``, concatenate on the last axis, project to n_embd."""
        out = torch.cat([h(x) for h in self.heads], dim=-1)
        return self.dropout(self.proj(out))

Combines multiple attention heads for richer feature extraction.

Concatenates outputs of all attention heads.

Projects the output to match the embedding size.

### (c) Feed Forward Network

In [None]:
class FeedFoward(nn.Module):
    """Position-wise two-layer MLP: Linear -> ReLU -> Linear -> Dropout.

    Args:
        n_embd: Width of the input and output; the hidden layer is four
            times as wide.
    """

    def __init__(self, n_embd):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(n_embd, 4 * n_embd),
            nn.ReLU(),
            nn.Linear(4 * n_embd, n_embd),
            nn.Dropout(dropout),
        )

    def forward(self, x):
        """Apply the MLP to the last axis of ``x``; the shape is unchanged."""
        return self.net(x)

Uses two linear layers with ReLU activation.

Expands then compresses the feature space (n_embd → 4 * n_embd → n_embd).

### (d) Transformer Block

In [None]:
class Block(nn.Module):
    """Transformer block: self-attention, then a feed-forward network.

    Each sub-layer is preceded by LayerNorm and wrapped in a residual
    connection.

    Args:
        n_embd: Embedding width.
        n_head: Number of attention heads; each head gets
            ``n_embd // n_head`` channels.
    """

    def __init__(self, n_embd, n_head):
        super().__init__()
        head_size = n_embd // n_head
        self.sa = MultiHeadAttention(n_head, head_size)
        self.ffwd = FeedFoward(n_embd)
        self.ln1 = nn.LayerNorm(n_embd)
        self.ln2 = nn.LayerNorm(n_embd)

    def forward(self, x):
        """Return ``x`` after both residual sub-layers; the shape is unchanged."""
        x = x + self.sa(self.ln1(x))
        x = x + self.ffwd(self.ln2(x))
        return x

Implements self-attention followed by feed-forward layers.

Uses Layer Normalization (ln1, ln2) to stabilize training.

## 10. Bigram Language Model

In [None]:
class BigramLanguageModel(nn.Module):
    """Character-level Transformer language model.

    Token and position embeddings are summed, passed through ``n_layer``
    Transformer blocks and a final LayerNorm, then mapped to one logit per
    vocabulary entry.
    """

    def __init__(self):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        self.position_embedding_table = nn.Embedding(block_size, n_embd)
        self.blocks = nn.Sequential(*[Block(n_embd, n_head=n_head) for _ in range(n_layer)])
        self.ln_f = nn.LayerNorm(n_embd)
        self.lm_head = nn.Linear(n_embd, vocab_size)

    def forward(self, idx, targets=None):
        """Compute next-token logits and, when targets are given, the loss.

        Args:
            idx: Integer tensor of token ids, shape (B, T), with T no larger
                than the position-embedding table.
            targets: Optional integer tensor of the same shape as ``idx``
                holding the expected next token at every position.

        Returns:
            ``(logits, loss)`` where ``logits`` has shape (B, T, vocab_size)
            and ``loss`` is the mean cross-entropy, or ``None`` when
            ``targets`` is ``None``.
        """
        T = idx.shape[1]
        tok_emb = self.token_embedding_table(idx)
        pos_emb = self.position_embedding_table(torch.arange(T, device=idx.device))
        x = tok_emb + pos_emb  # position embeddings broadcast over the batch
        x = self.blocks(x)
        x = self.ln_f(x)
        logits = self.lm_head(x)

        if targets is None:
            return logits, None

        # cross_entropy expects (N, C) logits against (N,) class indices.
        loss = F.cross_entropy(logits.reshape(-1, logits.shape[-1]), targets.reshape(-1))
        return logits, loss

    @torch.no_grad()
    def generate(self, idx, max_new_tokens):
        """Extend ``idx`` by sampling ``max_new_tokens`` tokens one at a time.

        Args:
            idx: Integer tensor of shape (B, T) holding the starting context.
            max_new_tokens: Number of tokens to append to every sequence.

        Returns:
            Integer tensor of shape (B, T + max_new_tokens).
        """
        max_context = self.position_embedding_table.num_embeddings
        for _ in range(max_new_tokens):
            # The position table only covers max_context positions, so crop.
            logits, _ = self(idx[:, -max_context:])
            # Only the last position predicts the next token.
            probs = F.softmax(logits[:, -1, :], dim=-1)
            idx_next = torch.multinomial(probs, num_samples=1)
            idx = torch.cat((idx, idx_next), dim=1)
        return idx

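Despite its name, this class is the full Transformer language model, not a plain bigram model. It sums token and position embeddings.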

Stacks multiple transformer blocks.

Applies LayerNorm before final classification layer.

## 11. Training & Text Generation

In [None]:
# Build the model, move its parameters to the target device and create the
# AdamW optimizer over them.
model = BigramLanguageModel()
m = model.to(device)  # same object as `model`; the generation cell calls it `m`
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

# `step` rather than `iter`, so the builtin iter() is not shadowed.
for step in range(max_iters):
    # Every eval_interval steps, and on the last step, report the averaged losses.
    if step % eval_interval == 0 or step == max_iters - 1:
        losses = estimate_loss()
        print(f"step {step}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

    xb, yb = get_batch('train')
    logits, loss = model(xb, yb)
    # Clear gradients left over from the previous step before backpropagating.
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

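Optimizes using AdamW: each iteration draws a training batch, computes the loss, clears the previous gradients, backpropagates, and updates the weights.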

Generates text from the trained model.

## Generate from the model

In [None]:
# Starting context: a (1, 1) tensor holding the single token id 0, created on `device`.
context = torch.zeros((1, 1), dtype=torch.long, device=device)
# Request 2000 new tokens, take the first row of the result, convert it to a
# Python list of ids and decode it to text.
# `m` is expected to be the trained model — TODO confirm where it is bound.
print(decode(m.generate(context, max_new_tokens=2000)[0].tolist()))

### Conclusion
Implements GPT-like text generation with a transformer-based approach.

Uses multi-head attention and feed-forward layers for character-level modeling.

Trains with backpropagation and the AdamW optimizer.