<a href="https://colab.research.google.com/github/pranukrish/CMPE297-SpecialTopics/blob/main/Assignment3/NanoGPT_Pytorch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import GPT2Tokenizer, AdamW, get_linear_schedule_with_warmup
from datasets import load_dataset

In [None]:
# Fetch a slice of the BookCorpus training split from the HuggingFace hub,
# together with the 'gpt2-medium' tokenizer used to encode it.
BOOK_SPLIT = 'train[:5%]'  # only the first 5% of the split, for demonstration
dataset = load_dataset(path='bookcorpus', split=BOOK_SPLIT)
tokenizer = GPT2Tokenizer.from_pretrained('gpt2-medium')

In [None]:
# Define a custom dataset
class BookDataset(Dataset):
    """Next-token-prediction pairs built from raw text strings.

    Every text is tokenized (truncated to ``max_length`` tokens) and stored as
    an ``(input, target)`` pair where the target is the input shifted left by
    one token. Both tensors keep the 2-D layout returned by the tokenizer
    (presumably ``(1, seq_len - 1)`` for a single string -- confirm against
    the tokenizer in use).

    Args:
        texts: Iterable of strings to encode.
        tokenizer: Callable accepting ``(text, truncation=, max_length=,
            return_tensors=)`` and returning an object with an ``input_ids``
            tensor attribute.
        max_length: Maximum number of tokens kept per text.
    """

    def __init__(self, texts, tokenizer, max_length=512):
        self.tokenizer = tokenizer
        self.inputs = []
        self.targets = []
        for text in texts:
            token_ids = tokenizer(
                text, truncation=True, max_length=max_length, return_tensors="pt"
            ).input_ids
            # A shifted pair needs at least two tokens. Shorter texts would
            # yield empty tensors, and a batch made only of those produces a
            # NaN loss, so they are skipped.
            if token_ids.shape[-1] < 2:
                continue
            self.inputs.append(token_ids[:, :-1])
            self.targets.append(token_ids[:, 1:])

    def __len__(self):
        """Return the number of stored (input, target) pairs."""
        return len(self.inputs)

    def __getitem__(self, idx):
        """Return the ``(input, target)`` tensor pair at position ``idx``."""
        return self.inputs[idx], self.targets[idx]

In [None]:
# Randomly partition the examples: 80% for training, the remainder for validation.
num_examples = len(dataset)
train_size = int(0.8 * num_examples)
val_size = num_examples - train_size
split_lengths = [train_size, val_size]
train_dataset, val_dataset = torch.utils.data.random_split(dataset, lengths=split_lengths)

In [None]:
# Collect the raw strings row by row. random_split returns torch Subset
# objects, which only accept integer indices, so `train_dataset['text']`
# raises instead of selecting the text column.
train_texts = [row['text'] for row in train_dataset]
val_texts = [row['text'] for row in val_dataset]

train_data = BookDataset(train_texts, tokenizer)
val_data = BookDataset(val_texts, tokenizer)

In [None]:
# Padding values used when batching sequences of different lengths.
PAD_TOKEN_ID = tokenizer.eos_token_id  # GPT-2 has no pad token; reuse end-of-text
# -100 is the default ignore_index of torch.nn.CrossEntropyLoss, so padded
# target positions do not contribute to the loss.
# NOTE(review): confirm `criterion` uses the default ignore_index.
IGNORE_INDEX = -100


def collate_lm_batch(batch):
    """Pad a list of (input, target) pairs into two (batch, seq) tensors.

    The default collate function stacks samples and therefore fails when the
    sequences differ in length. Each sample is flattened to 1-D first, so any
    leading size-1 dimension from the tokenizer is dropped.
    """
    input_seqs = [pair[0].reshape(-1) for pair in batch]
    target_seqs = [pair[1].reshape(-1) for pair in batch]
    padded_inputs = torch.nn.utils.rnn.pad_sequence(
        input_seqs, batch_first=True, padding_value=PAD_TOKEN_ID
    )
    padded_targets = torch.nn.utils.rnn.pad_sequence(
        target_seqs, batch_first=True, padding_value=IGNORE_INDEX
    )
    return padded_inputs, padded_targets


train_loader = DataLoader(train_data, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_lm_batch)
val_loader = DataLoader(val_data, batch_size=BATCH_SIZE, collate_fn=collate_lm_batch)

In [None]:
# Training configuration: epochs, gradient clipping, optimizer and LR schedule
NUM_EPOCHS = 5
CLIP = 0.5  # maximum gradient norm passed to clip_grad_norm_ in the training loop

# Use PyTorch's AdamW: the transformers.AdamW class is deprecated in favour of
# torch.optim.AdamW. eps and weight_decay are set explicitly to the values the
# transformers implementation used by default, so the optimisation settings
# stay the same.
optimizer = torch.optim.AdamW(model.parameters(), lr=LR, eps=1e-6, weight_decay=0.0)

# Linear decay to zero over every optimizer step of the run, without warmup.
total_training_steps = len(train_loader) * NUM_EPOCHS
scheduler = get_linear_schedule_with_warmup(
    optimizer, num_warmup_steps=0, num_training_steps=total_training_steps
)

# Lowest validation loss seen so far; drives checkpointing.
best_val_loss = float('inf')

In [None]:
# One pass over train_loader per epoch, followed by a single validation pass,
# a best-model checkpoint, and a summary line.
for epoch in range(NUM_EPOCHS):
    # Training
    model.train()  # re-enable training mode; validation below switches to eval
    train_loss = 0.0
    for inputs, targets in train_loader:
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs.view(-1, VOCAB_SIZE), targets.view(-1))
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), CLIP)
        optimizer.step()
        scheduler.step()  # the schedule is defined per optimizer step
        train_loss += loss.item()
    train_loss /= len(train_loader)

    # Validation runs once per epoch, outside the batch loop. Running it after
    # every batch is needlessly expensive and leaves the model in eval mode
    # for the remaining training batches of the epoch.
    model.eval()
    val_loss = 0.0
    with torch.no_grad():
        for inputs, targets in val_loader:
            outputs = model(inputs)
            val_loss += criterion(outputs.view(-1, VOCAB_SIZE), targets.view(-1)).item()
    val_loss /= len(val_loader)

    # Checkpoint: keep only the weights with the best validation loss so far
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        torch.save(model.state_dict(), 'best_model.pth')

    print(f"Epoch {epoch+1}/{NUM_EPOCHS}, Train Loss: {train_loss}, Val Loss: {val_loss}")