In [None]:
# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.
import kagglehub
# Download the Kaggle dataset; `path` is the local directory that later cells
# build their file paths from.
KAGGLE_DATASET = 'shusrith/wikipedia-data-bpe'
path = kagglehub.dataset_download(KAGGLE_DATASET)
print('Data source import complete.')

Downloading from https://www.kaggle.com/api/v1/datasets/download/shusrith/wikipedia-data-bpe?dataset_version_number=2...


 67%|██████▋   | 1.77G/2.65G [01:28<00:44, 21.2MB/s]

In [None]:
!pip install datasets

In [None]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, random_split
from datasets import load_dataset
from tokenizers import ByteLevelBPETokenizer
import numpy as np
from tqdm import tqdm
import os
import math
import time

# Configuration
class Config:
    """Hyperparameters, runtime settings and paths shared by the notebook cells."""

    # --- model shape ---
    vocab_size = 50_000
    seq_length = 128
    num_heads = 12
    num_layers = 12
    embd_dim = 768

    # --- optimisation loop ---
    batch_size = 32
    learning_rate = 6e-4
    epochs = 10
    eval_interval = 10_000   # run validation every this many iterations of an epoch
    eval_iters = 200
    warmup_iters = 2_000     # length of the linear learning-rate warmup
    min_lr = 6e-5            # floor for the decayed learning rate
    grad_clip = 1.0          # max gradient norm
    weight_decay = 0.1
    beta1 = 0.9
    beta2 = 0.95

    # --- runtime / filesystem ---
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    checkpoint_dir = '.'
    dataset_path = f'{path}/train'   # `path` comes from the dataset download cell

In [None]:
# Make sure the checkpoint directory exists before any cell tries to save into it;
# exist_ok=True makes this cell safe to re-run.
os.makedirs(Config.checkpoint_dir, exist_ok=True)

import torch
import torch.nn as nn

class FF(nn.Module):
    """Position-wise feed-forward network: Linear -> GELU -> Linear.

    The hidden layer is 8x wider than the model dimension; the output has the
    same trailing dimension as the input.
    """

    def __init__(self, embd_dim):
        super().__init__()
        hidden_dim = 8 * embd_dim
        self.linear1 = nn.Linear(embd_dim, hidden_dim)
        self.linear2 = nn.Linear(hidden_dim, embd_dim)
        self.gelu = nn.GELU()

    def forward(self, x):
        """Apply the expand / activate / project sequence to ``x``."""
        expanded = self.linear1(x)
        activated = self.gelu(expanded)
        return self.linear2(activated)


class MultiHeadAttention(nn.Module):
    """Causal multi-head self-attention built on ``nn.MultiheadAttention``.

    Args:
        num_heads: number of attention heads.
        embd_dim: model (embedding) dimension; inputs are ``(batch, seq, embd_dim)``
            because the wrapped module is created with ``batch_first=True``.
    """

    def __init__(self, num_heads, embd_dim):
        super().__init__()
        self.attn = nn.MultiheadAttention(
            embed_dim=embd_dim,
            num_heads=num_heads,
            batch_first=True
        )

    def forward(self, x):
        """Self-attend over ``x`` so that position ``t`` only sees positions ``<= t``.

        Returns a tensor with the same shape as ``x``.
        """
        _, T, _ = x.shape
        # True marks pairs that must NOT attend: everything strictly above the
        # diagonal, i.e. future positions. Built directly as bool to skip the
        # float allocation + cast.
        causal_mask = torch.ones(T, T, dtype=torch.bool, device=x.device).triu(diagonal=1)
        # The attention weights are never used, so do not ask for them:
        # need_weights=False avoids materialising the (T, T) weight matrices and
        # lets PyTorch route through scaled_dot_product_attention. is_causal=True
        # is a hint that the supplied mask is the standard causal mask.
        attn_output, _ = self.attn(
            x, x, x,
            attn_mask=causal_mask,
            need_weights=False,
            is_causal=True,
        )
        return attn_output


class Decode(nn.Module):
    """One pre-norm transformer block.

    Each sub-layer (self-attention, then feed-forward) is applied to a
    layer-normalised copy of the input, passed through dropout, and added back
    onto the residual stream.
    """

    def __init__(self, num_heads, embd_dim):
        super().__init__()
        drop_prob = 0.2
        self.attn = MultiHeadAttention(num_heads, embd_dim)
        self.norm1 = nn.LayerNorm(embd_dim)
        self.norm2 = nn.LayerNorm(embd_dim)
        self.ff = FF(embd_dim)
        self.dropout1 = nn.Dropout(drop_prob)
        self.dropout2 = nn.Dropout(drop_prob)

    def forward(self, x):
        """Return ``x`` after the attention and feed-forward residual updates."""
        attn_branch = self.dropout1(self.attn(self.norm1(x)))
        x = x + attn_branch
        ff_branch = self.dropout2(self.ff(self.norm2(x)))
        return x + ff_branch


class Decoder(nn.Module):
    """Token + learned position embeddings followed by a stack of ``Decode`` blocks.

    Args:
        vocab_size: number of rows in the token embedding table.
        seq_length: number of rows in the position embedding table.
        num_layers: how many ``Decode`` blocks to stack.
        num_heads: attention heads per block.
        embd_dim: model dimension.
    """

    def __init__(self, vocab_size, seq_length, num_layers, num_heads, embd_dim):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embd_dim)
        self.pos_embedding = nn.Embedding(seq_length, embd_dim)
        blocks = []
        for _ in range(num_layers):
            blocks.append(Decode(num_heads, embd_dim))
        self.layers = nn.ModuleList(blocks)
        self.norm = nn.LayerNorm(embd_dim)

    def forward(self, x):
        """Embed the token ids in ``x`` and run them through every block.

        The result is passed through a final LayerNorm before being returned.
        """
        token_count = x.size(1)
        # 0..T-1, repeated for every row of the batch.
        position_ids = torch.arange(0, token_count, device=x.device)
        position_ids = position_ids.unsqueeze(0).expand_as(x)

        hidden = self.embedding(x) + self.pos_embedding(position_ids)
        for block in self.layers:
            hidden = block(hidden)
        return self.norm(hidden)

class GPT3(nn.Module):
    """Decoder-only language model: ``Decoder`` stack plus a linear vocabulary head.

    Args:
        vocab_size: size of the token vocabulary (also the width of the logits).
        seq_length: maximum context length (size of the position embedding table).
        num_heads: attention heads per block.
        num_layers: number of transformer blocks.
        embd_dim: model dimension.
    """

    def __init__(self, vocab_size, seq_length, num_heads, num_layers, embd_dim):
        super().__init__()
        # Decoder's positional order is (..., num_layers, num_heads, ...), which
        # differs from this constructor's order. Pass by keyword so the two
        # values cannot be silently swapped.
        self.dec = Decoder(
            vocab_size=vocab_size,
            seq_length=seq_length,
            num_layers=num_layers,
            num_heads=num_heads,
            embd_dim=embd_dim,
        )
        self.out = nn.Linear(embd_dim, vocab_size)
        self.seq_length = seq_length
        self.vocab_size = vocab_size

    def forward(self, x):
        """Map token ids ``x`` to next-token logits with a trailing ``vocab_size`` dim."""
        return self.out(self.dec(x))

    def generate(self, input_ids, max_length=50, temperature=0.9):
        """Sample ``max_length`` new tokens after the prompt in ``input_ids``.

        Args:
            input_ids: tensor of token ids; only the first row is reported in the
                returned list (the row is read via ``input_ids.tolist()[0]``).
            max_length: number of tokens to sample.
            temperature: logits are divided by this value before the softmax.

        Returns:
            A Python list containing the prompt's token ids followed by the
            sampled token ids.
        """
        was_training = self.training
        # Run on whatever device the model lives on instead of assuming CUDA.
        device = next(self.parameters()).device
        output = input_ids.tolist()[0]
        # The position embedding only covers seq_length positions, so never feed
        # more than the most recent seq_length tokens.
        context = input_ids.to(device)[:, -self.seq_length:]

        self.eval()
        try:
            with torch.no_grad():
                for _ in range(max_length):
                    logits = self(context)[:, -1, :] / temperature
                    probs = nn.functional.softmax(logits, dim=-1)
                    next_token = torch.multinomial(probs, num_samples=1)
                    output.append(int(next_token[0, 0]))
                    # Grow the context until it fills the window, then slide it.
                    context = torch.cat([context, next_token], dim=1)[:, -self.seq_length:]
        finally:
            # Restore the mode the caller had, rather than forcing train mode.
            self.train(was_training)
        return output

In [None]:
class GPTDataset(Dataset):
    """Fixed-length next-token-prediction samples cut from ``.npz`` token shards.

    Every ``.npz`` file in ``tokenized_data_path`` is expected to hold an array
    under the key ``'batch_arrays'``. The arrays are concatenated and split into
    non-overlapping windows of ``seq_length`` tokens; each sample is the window
    (input) and the same window shifted by one token (target).

    NOTE(review): the code treats the concatenated array as a flat token stream
    (it reports ``len(self.data)`` as the token count) — confirm the shards are 1-D.

    Args:
        tokenized_data_path: directory containing the ``.npz`` shards.
        seq_length: number of tokens per sample.
        max_files: load at most this many shards (``None`` loads all of them).

    Raises:
        ValueError: if no shard is found, none can be loaded, the arrays cannot
            be concatenated, or there are too few tokens for a single sample.
    """

    def __init__(self, tokenized_data_path, seq_length, max_files=10):
        self.seq_length = seq_length

        print(f"Loading data from {tokenized_data_path}...")

        # os.listdir returns names in arbitrary order, so sort before truncating:
        # otherwise both the chosen subset and the concatenation order can change
        # between runs. This is a plain lexicographic sort (deterministic, not
        # numeric).
        npz_files = sorted(
            f for f in os.listdir(tokenized_data_path) if f.endswith('.npz')
        )
        if max_files is not None:
            npz_files = npz_files[:max_files]
        if not npz_files:
            raise ValueError("No .npz files found in directory")

        shards = []
        for filename in npz_files:
            filepath = os.path.join(tokenized_data_path, filename)
            try:
                with np.load(filepath) as f:
                    arr = f['batch_arrays']
                    if len(arr) > 0:
                        shards.append(arr)
                    else:
                        print(f"Skipping empty array in {filename}")
            except Exception as e:
                # Best effort: report the unreadable shard and carry on.
                print(f"Error loading {filename}: {e}")
                continue

        if not shards:
            raise ValueError("No valid data loaded - all files were empty or corrupted")

        try:
            self.data = np.concatenate(shards)
        except ValueError as e:
            raise ValueError(f"Error concatenating arrays: {e}") from e
        print(f"\nTotal tokens loaded: {len(self.data):,}")

        # One extra token is needed so the last window still has a target.
        self.total_sequences = (len(self.data) - 1) // self.seq_length
        if self.total_sequences <= 0:
            raise ValueError(
                f"Not enough data for seq_length={seq_length}. "
                f"Need ≥{seq_length+1} tokens, got {len(self.data)}"
            )
        print(f"Available sequences: {self.total_sequences:,}\n")

    def __len__(self):
        """Number of non-overlapping samples available."""
        return self.total_sequences

    def __getitem__(self, idx):
        """Return ``(input, target)`` long tensors of length ``seq_length``.

        Negative indices count from the end. Raises ``IndexError`` when ``idx``
        is outside ``[-len(self), len(self))``.
        """
        if idx < 0:
            idx += self.total_sequences
        if not 0 <= idx < self.total_sequences:
            raise IndexError(f"Sequence {idx} out of range")

        start = idx * self.seq_length
        end = start + self.seq_length + 1  # +1 for target

        chunk = self.data[start:end]
        return (
            torch.tensor(chunk[:-1], dtype=torch.long),  # input
            torch.tensor(chunk[1:], dtype=torch.long)   # target
        )

In [None]:
# Tokenizer built from the vocabulary/merges files shipped with the dataset.
tokenizer = ByteLevelBPETokenizer(
    f"{path}/vocab/vocab.json",
    f"{path}/vocab/merges.txt"
)

# Load dataset
full_dataset = GPTDataset(Config.dataset_path, Config.seq_length)

# Split into train (90%) and validation (10%).
train_size = int(0.9 * len(full_dataset))
val_size = len(full_dataset) - train_size

# Seed the split so the same sequences land in the validation set on every
# run; an unseeded split would mix previously-trained sequences into
# validation whenever the notebook is re-run (e.g. to resume from a checkpoint).
SPLIT_SEED = 42
train_dataset, val_dataset = random_split(
    full_dataset,
    [train_size, val_size],
    generator=torch.Generator().manual_seed(SPLIT_SEED),
)

# Create data loaders
train_loader = DataLoader(
    train_dataset, batch_size=Config.batch_size, shuffle=True
)
val_loader = DataLoader(
    val_dataset, batch_size=Config.batch_size, shuffle=False
)

In [None]:
# Build the model from the shared Config values and move it to the configured device.
model = GPT3(
    vocab_size=Config.vocab_size,
    seq_length=Config.seq_length,
    num_heads=Config.num_heads,
    num_layers=Config.num_layers,
    embd_dim=Config.embd_dim
).to(Config.device)

In [None]:
def count_parameters(model):
    """Return the total number of elements across ``model``'s trainable parameters.

    Parameters with ``requires_grad == False`` are not counted.
    """
    trainable_elements = 0
    for param in model.parameters():
        if param.requires_grad:
            trainable_elements += param.numel()
    return trainable_elements

# Report the model size.
total_params = count_parameters(model)
print(f"Total number of parameters: {total_params:,}")

# AdamW over every model parameter; hyperparameters come from Config.
adamw_settings = dict(
    lr=Config.learning_rate,
    weight_decay=Config.weight_decay,
    betas=(Config.beta1, Config.beta2),
)
optimizer = torch.optim.AdamW(model.parameters(), **adamw_settings)

In [None]:
def get_lr(it, config=None):
    """Learning rate for global iteration ``it``.

    Schedule:
      * ``it < warmup_iters``: linear warmup from 0 up to ``learning_rate``.
      * afterwards: exponential decay by a factor of 10 every
        ``10 * warmup_iters`` iterations, never dropping below ``min_lr``.

    The previous version also carried a cosine-decay tail, but it could only
    be reached when ``it == warmup_iters`` (where it evaluates to
    ``learning_rate``), so it never shaped the schedule; it has been removed
    and the schedule that actually ran is kept.

    Args:
        it: zero-based global iteration count.
        config: optional object exposing ``learning_rate``, ``warmup_iters`` and
            ``min_lr``; defaults to the notebook-wide ``Config``.

    Returns:
        The learning rate as a float.
    """
    cfg = Config if config is None else config

    if it < cfg.warmup_iters:
        return cfg.learning_rate * it / cfg.warmup_iters

    decay_exponent = (it - cfg.warmup_iters) / (cfg.warmup_iters * 10)
    return max(cfg.min_lr, cfg.learning_rate * (0.1 ** decay_exponent))

In [None]:
from torch.amp import autocast

def train_step(batch):
    """Forward one training batch under CUDA autocast.

    Args:
        batch: an ``(inputs, targets)`` pair of token-id tensors.

    Returns:
        ``(loss, accuracy)`` where ``loss`` is the cross-entropy tensor (still
        attached to the graph, so the caller can backpropagate) and ``accuracy``
        is the fraction of positions whose arg-max prediction equals the target.
    """
    inputs, targets = (tensor.to(Config.device) for tensor in batch)

    with autocast("cuda"):
        logits = model(inputs)
        n_batch, n_steps, n_vocab = logits.shape
        # Collapse batch and time so every position is one classification example.
        flat_logits = logits.view(n_batch * n_steps, n_vocab)
        flat_targets = targets.view(n_batch * n_steps)
        loss = nn.functional.cross_entropy(flat_logits, flat_targets)

    with torch.no_grad():
        predicted = torch.argmax(flat_logits, dim=1)
        n_correct = (predicted == flat_targets).sum().item()
        accuracy = n_correct / flat_targets.size(0)

    return loss, accuracy

In [None]:
def eval_step(batch):
    """Forward one validation batch (no autocast) and score it.

    Args:
        batch: an ``(inputs, targets)`` pair of token-id tensors.

    Returns:
        ``(loss, accuracy)``: the cross-entropy loss tensor and the fraction of
        positions whose arg-max prediction equals the target.
    """
    inputs, targets = (tensor.to(Config.device) for tensor in batch)

    logits = model(inputs)
    n_batch, n_steps, n_vocab = logits.shape
    # Collapse batch and time so every position is one classification example.
    flat_logits = logits.view(n_batch * n_steps, n_vocab)
    flat_targets = targets.view(n_batch * n_steps)

    loss = nn.functional.cross_entropy(flat_logits, flat_targets)

    predicted = torch.argmax(flat_logits, dim=1)
    n_correct = (predicted == flat_targets).sum().item()

    return loss, n_correct / flat_targets.size(0)

In [None]:
from tqdm import tqdm

@torch.no_grad()
def evaluate():
    """Run the model over the whole validation loader.

    Puts the model in eval mode (the caller is responsible for switching back
    to train mode) and returns ``(mean_loss, mean_accuracy)``, each the
    unweighted mean of the per-batch values.
    """
    model.eval()
    batch_losses, batch_accuracies = [], []

    progress = tqdm(val_loader, desc="Evaluating", leave=False)
    for batch in progress:
        loss, accuracy = eval_step(batch)
        loss_value = loss.item()

        batch_losses.append(loss_value)
        batch_accuracies.append(accuracy)

        progress.set_postfix({
            'loss': f"{loss_value:.4f}",
            'accuracy': f"{accuracy:.4f}"
        })

    mean_loss = np.mean(batch_losses)
    mean_accuracy = np.mean(batch_accuracies)
    return mean_loss, mean_accuracy

In [None]:
from torch.amp import GradScaler

# Training history. Kept at module level because the plotting cell at the end
# of the notebook reads these four names directly; train() refills them.
train_losses, train_accuracies = [], []   # one entry per training iteration
val_losses, val_accuracies = [], []       # one entry per evaluation run


def _build_checkpoint(epoch, best_val_loss, config_dict):
    """Snapshot model/optimizer state plus bookkeeping in a dict for torch.save."""
    return {
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'epoch': epoch,
        'best_val_loss': best_val_loss,
        'config': config_dict,
    }


def train():
    """Train ``model`` for ``Config.epochs`` epochs with mixed precision.

    For each iteration: set the scheduled learning rate, run ``train_step``,
    backpropagate through the gradient scaler, clip gradients to
    ``Config.grad_clip`` and step the optimizer. Validation runs every
    ``Config.eval_interval`` iterations of an epoch and on the epoch's last
    iteration; the best-scoring model is written to ``best_model.pth`` and a
    checkpoint ``epoch_{n}.pth`` is written after every epoch.

    Side effects: fills the module-level ``train_losses``, ``train_accuracies``,
    ``val_losses`` and ``val_accuracies`` lists (cleared at the start of each
    call) and writes checkpoint files into ``Config.checkpoint_dir``.
    """
    best_val_loss = float('inf')

    # Start from an empty history so re-running train() does not append to a
    # previous run's curves.
    for history in (train_losses, train_accuracies, val_losses, val_accuracies):
        history.clear()

    scaler = GradScaler("cuda")

    # Plain-value snapshot of Config (dunder attributes and callables excluded).
    config_dict = {
        k: v for k, v in vars(Config).items()
        if not k.startswith('__') and not callable(v)
    }

    for epoch in range(Config.epochs):
        model.train()
        pbar = tqdm(train_loader, desc=f"Epoch {epoch + 1}/{Config.epochs}")

        for it, batch in enumerate(pbar):
            # The schedule is indexed by the global step, not the per-epoch one.
            lr = get_lr(it + epoch * len(train_loader))
            for param_group in optimizer.param_groups:
                param_group['lr'] = lr

            loss, accuracy = train_step(batch)

            optimizer.zero_grad(set_to_none=True)
            scaler.scale(loss).backward()

            # Unscale first so clipping sees the true gradient magnitudes.
            scaler.unscale_(optimizer)
            torch.nn.utils.clip_grad_norm_(model.parameters(), Config.grad_clip)

            scaler.step(optimizer)
            scaler.update()

            loss_value = loss.item()
            train_losses.append(loss_value)
            train_accuracies.append(accuracy)

            pbar.set_postfix({
                'loss': loss_value,
                'accuracy': accuracy,
                'lr': lr
            })

            if (it + 1) % Config.eval_interval == 0 or it == len(train_loader) - 1:
                val_loss, val_accuracy = evaluate()
                # evaluate() returns NumPy scalars; store plain floats so the
                # history and the checkpoint hold only built-in Python numbers.
                val_loss, val_accuracy = float(val_loss), float(val_accuracy)
                val_losses.append(val_loss)
                val_accuracies.append(val_accuracy)

                print(f"\nStep {it}:")
                print(f"Train Loss: {loss_value:.4f} | Train Acc: {accuracy:.4f}")
                print(f"Val Loss: {val_loss:.4f} | Val Acc: {val_accuracy:.4f}")

                if val_loss < best_val_loss:
                    best_val_loss = val_loss
                    torch.save(
                        _build_checkpoint(epoch, best_val_loss, config_dict),
                        os.path.join(Config.checkpoint_dir, 'best_model.pth'),
                        pickle_protocol=4
                    )
                    print("Model saved!")

                # evaluate() leaves the model in eval mode.
                model.train()

        # Save at end of epoch
        torch.save(
            _build_checkpoint(epoch, best_val_loss, config_dict),
            os.path.join(Config.checkpoint_dir, f'epoch_{epoch}.pth'),
            pickle_protocol=4
        )



In [None]:
# Enable the flash backend for scaled dot-product attention on CUDA.
# NOTE(review): this only matters for attention calls that actually go through
# scaled_dot_product_attention — confirm the model's attention path does.
torch.backends.cuda.enable_flash_sdp(True)

In [None]:
# Run the full training loop, then store the final weights (state_dict only)
# next to the other checkpoints.
train()

final_weights_path = os.path.join(Config.checkpoint_dir, 'final_model.pth')
torch.save(model.state_dict(), final_weights_path)

In [None]:

import matplotlib.pyplot as plt

# Two side-by-side panels: loss on the left, accuracy on the right.
fig, (loss_ax, acc_ax) = plt.subplots(1, 2, figsize=(12, 4))

panels = (
    (loss_ax, 'Loss', train_losses, val_losses),
    (acc_ax, 'Accuracy', train_accuracies, val_accuracies),
)
for ax, title, train_curve, val_curve in panels:
    ax.plot(train_curve, label='Train')
    # Validation is recorded less often than training, so spread its points
    # evenly across the training-iteration axis.
    val_positions = np.linspace(0, len(train_curve), len(val_curve))
    ax.plot(val_positions, val_curve, label='Validation')
    ax.set_title(title)
    ax.legend()

fig.savefig(os.path.join(Config.checkpoint_dir, 'training_curves.png'))
plt.show()