<a href="https://colab.research.google.com/github/pranj-mach/Poem_GPT/blob/main/poem_GPT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("charunisa/english-poems-dataset")

print("Path to dataset files:", path)

Downloading from https://www.kaggle.com/api/v1/datasets/download/charunisa/english-poems-dataset?dataset_version_number=1...


100%|██████████| 31.8k/31.8k [00:00<00:00, 34.2MB/s]

Extracting files...
Path to dataset files: /root/.cache/kagglehub/datasets/charunisa/english-poems-dataset/versions/1





In [2]:
import os

poems = ""
for filename in os.listdir(path):  # path = downloaded folder
    if filename.endswith(".txt"):  # only read poem files
        with open(os.path.join(path, filename), 'r', encoding='utf-8') as f:
            poems += f.read() + "\n"  # concatenate poem + newline
print(poems)

I have wished a bird would fly away,
And not sing by my house all day;

Have clapped my hands at him from the door
When it seemed as if I could bear no more.

The fault must partly have been in me.
The bird was not to blame for his key.

And of course there must be something wrong
In wanting to silence any song.

He halted in the wind, and — what was that
Far in the maples, pale, but not a ghost?
He stood there bringing March against his thought,
And yet too ready to believe the most.

"Oh, that's the Paradise-in-bloom," I said;
And truly it was fair enough for flowers
had we but in us to assume in march
Such white luxuriance of May for ours.

We stood a moment so in a strange world,
Myself as one his own pretense deceives;
And then I said the truth (and we moved on).
A young beech clinging to its last year's leaves.

The farmhouse lingers, though averse to square
With the new city street it has to wear
A number in. But what about the brook
That held the house as in an elbow-crook?
I a

In [3]:
# Build the vocabulary
chars = sorted(list(set(poems)))
vocab_size = len(chars)
print("All unique characters:", chars)
print("Vocab size:", vocab_size)


All unique characters: ['\n', ' ', '!', '"', "'", '(', ')', ',', '-', '.', '0', ':', ';', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '[', ']', '`', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '—', '‘', '’', '“', '”', '…']
Vocab size: 75


In [4]:
# char-to-index and index-to-char dictionaries
stoi = { ch:i for i,ch in enumerate(chars) }
itos = { i:ch for i,ch in enumerate(chars) }


def encode(s):
    return [stoi[c] for c in s]

def decode(l):
    return ''.join([itos[i] for i in l])


In [5]:
import torch

# Encode the entire poem into a tensor of integers
data = torch.tensor(encode(poems), dtype=torch.long)

# Define the block size and batch size
block_size = 128  # number of characters per input sequence
batch_size = 64   # how many sequences per batch

# Split into train and val (90-10)
n = int(0.8 * len(data))
train_data = data[:n]
val_data = data[n:]

# Batch function
def get_batch(split="train"):
    source = train_data if split == "train" else val_data
    ix = torch.randint(len(source) - block_size, (batch_size,))
    x = torch.stack([source[i:i+block_size] for i in ix])
    y = torch.stack([source[i+1:i+block_size+1] for i in ix])
    return x, y


In [6]:
x, y = get_batch("train")
print("Input shape:", x.shape)   # (64, 128)
print("Target shape:", y.shape)  # (64, 128)

# Let's peek at one sample
print("Sample input (decoded):")
print(decode(x[0].tolist()))

print("Sample target (decoded):")
print(decode(y[0].tolist()))


Input shape: torch.Size([64, 128])
Target shape: torch.Size([64, 128])
Sample input (decoded):
fountain at my fond heart's door,
Whose only business was to flow;
And flow it did; not taking heed
Of its own bounty, or my nee
Sample target (decoded):
ountain at my fond heart's door,
Whose only business was to flow;
And flow it did; not taking heed
Of its own bounty, or my need


In [7]:
import torch.nn as nn
import torch.nn.functional as F

class TransformerBlock(nn.Module):
    def __init__(self, n_embd, n_heads):
        super().__init__()
        self.attn = nn.MultiheadAttention(embed_dim=n_embd, num_heads=n_heads, batch_first=True)
        self.ln1 = nn.LayerNorm(n_embd)
        self.ffwd = nn.Sequential(
            nn.Linear(n_embd, 4 * n_embd),
            nn.GELU(),
            nn.Linear(4 * n_embd, n_embd)
        )
        self.ln2 = nn.LayerNorm(n_embd)

    def forward(self, x):
        attn_out, _ = self.attn(x, x, x, need_weights=False)
        x = x + attn_out
        x = self.ln1(x)
        ffwd_out = self.ffwd(x)
        x = x + ffwd_out
        x = self.ln2(x)
        return x


class PoetryGPT(nn.Module):
    def __init__(self, vocab_size, block_size, n_embd=128, n_heads=4, n_layers=4):
        super().__init__()
        self.token_embedding = nn.Embedding(vocab_size, n_embd)
        self.position_embedding = nn.Embedding(block_size, n_embd)
        self.blocks = nn.ModuleList([TransformerBlock(n_embd, n_heads) for _ in range(n_layers)])
        self.ln_f = nn.LayerNorm(n_embd)
        self.head = nn.Linear(n_embd, vocab_size)
        self.block_size = block_size

    def forward(self, idx):
        B, T = idx.shape
        tok_emb = self.token_embedding(idx)
        pos = torch.arange(T, device=idx.device)
        pos_emb = self.position_embedding(pos)
        x = tok_emb + pos_emb
        for block in self.blocks:
            x = block(x)
        x = self.ln_f(x)
        logits = self.head(x)
        return logits


In [8]:
model = PoetryGPT(vocab_size=vocab_size, block_size=block_size)
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = model.to(device)

optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)


In [9]:
max_iters = 1000
eval_interval = 100
loss_fn = nn.CrossEntropyLoss()

for step in range(max_iters):
    model.train()
    xb, yb = get_batch("train")
    xb, yb = xb.to(device), yb.to(device)

    logits = model(xb)
    B, T, C = logits.shape
    loss = loss_fn(logits.view(B*T, C), yb.view(B*T))

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    if step % eval_interval == 0:
        model.eval()
        with torch.no_grad():
            val_xb, val_yb = get_batch("val")
            val_xb, val_yb = val_xb.to(device), val_yb.to(device)
            val_logits = model(val_xb)
            val_loss = loss_fn(val_logits.view(B*T, C), val_yb.view(B*T))
        print(f"Step {step} | Train Loss: {loss.item():.4f} | Val Loss: {val_loss.item():.4f}")


Step 0 | Train Loss: 4.5115 | Val Loss: 3.9082
Step 100 | Train Loss: 2.3519 | Val Loss: 2.4090
Step 200 | Train Loss: 0.0471 | Val Loss: 0.0744
Step 300 | Train Loss: 0.0264 | Val Loss: 0.0342
Step 400 | Train Loss: 0.0219 | Val Loss: 0.0292
Step 500 | Train Loss: 0.0214 | Val Loss: 0.0240
Step 600 | Train Loss: 0.0214 | Val Loss: 0.0234
Step 700 | Train Loss: 0.0188 | Val Loss: 0.0255
Step 800 | Train Loss: 0.0202 | Val Loss: 0.0255
Step 900 | Train Loss: 0.0191 | Val Loss: 0.0270
