# LLaMA Training on Kaggle

This notebook trains a LLaMA model from scratch on Kaggle's free GPU.

## Setup Instructions

1. Create a new Kaggle notebook
2. Enable GPU: Settings > Accelerator > GPU T4 x2
3. Upload this repository as a Kaggle dataset, or clone it from GitHub (replace `YOUR_USERNAME` in the clone cell below with your account name)
4. Run all cells

In [None]:
# Clone repository (if not uploaded as dataset)
# YOUR_USERNAME is a placeholder: replace it with the GitHub account that hosts
# the repository before running this cell.
!git clone https://github.com/YOUR_USERNAME/llm-from-scratch.git
# Work from inside the clone; the later cells import `scripts.*` and `utils.*`,
# presumably resolved relative to the repository root.
%cd llm-from-scratch

In [None]:
# Install dependencies
# `-q` keeps pip's output short. Later cells import torch, pytorch_lightning,
# datasets and tokenizers directly; wandb is only used when W&B logging is enabled.
# NOTE(review): Kaggle images presumably ship a CUDA-enabled torch already —
# confirm that listing torch here does not replace it with a different build.
!pip install -q torch torchinfo pytorch-lightning wandb datasets regex tokenizers

In [None]:
# Report whether CUDA is usable and, if it is, the name and memory of device 0.
import torch

has_cuda = torch.cuda.is_available()
print(f"CUDA available: {has_cuda}")

if has_cuda:
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    # total_memory is reported in bytes; convert to GiB for display.
    total_gib = torch.cuda.get_device_properties(0).total_memory / 1024**3
    print(f"GPU Memory: {total_gib:.2f} GB")
else:
    print("GPU: None")
    print("No GPU")

## Prepare Tokenizer

In [None]:
from pathlib import Path

from datasets import load_dataset
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.trainers import BpeTrainer

from scripts.config import Config

# Notebook-level settings for the tokenizer stage.
model_type = "llama"
tokenizer_dir = "./tokenizers"
num_samples = 100_000  # upper bound on the rows used to fit the tokenizer

# Project configuration for the LLaMA preset; the dataset name and vocabulary
# size printed below are read from it.
config = Config.for_llama()

# Ensure the output directory exists, then derive the tokenizer file inside it.
tokenizer_root = Path(tokenizer_dir)
tokenizer_root.mkdir(parents=True, exist_ok=True)
tokenizer_path = tokenizer_root / f"{model_type}_tokenizer.json"

print(f"Preparing tokenizer for {model_type}")
data_config = config.data
print(f"Dataset: {data_config.dataset_name}")
print(f"Vocab size: {data_config.vocab_size}")

In [None]:
# Fetch the "train" split in non-streaming mode so it supports len() and select().
print("Loading dataset...")
dataset = load_dataset(
    config.data.dataset_name,
    config.data.dataset_config,
    split="train",
    streaming=False,
)

# Keep at most `num_samples` rows, taken from the start of the split.
sample_count = min(num_samples, len(dataset))
dataset = dataset.select(range(sample_count))
print(f"Using {len(dataset)} samples for tokenizer training")

In [None]:
# Train tokenizer
def text_iterator():
    """Yield the text field of each row of the module-level ``dataset``, in order.

    The field to extract is named by ``config.data.text_column``; both
    ``dataset`` and ``config`` are read from the notebook's global namespace.
    """
    text_column = config.data.text_column
    yield from (row[text_column] for row in dataset)


print("Training tokenizer...")
# BPE model; input that cannot be represented by the learned vocabulary maps to "<unk>".
tokenizer = Tokenizer(BPE(unk_token="<unk>"))
# Pre-split the raw text with the library's Whitespace pre-tokenizer before BPE runs.
tokenizer.pre_tokenizer = Whitespace()

# NOTE(review): per the tokenizers docs, special tokens are assigned ids in list
# order — confirm that the id of "<eos>" matches config.inference.eos_idx, which
# the inference cell uses as its stop condition.
special_tokens = ["<pad>", "<unk>", "<bos>", "<eos>"]
trainer = BpeTrainer(
    vocab_size=config.data.vocab_size,
    special_tokens=special_tokens,
    show_progress=True,
)

# text_iterator() yields one string per dataset row, so len(dataset) is the exact
# number of sequences; passing it lets the progress bar display a meaningful total.
tokenizer.train_from_iterator(
    text_iterator(),
    trainer=trainer,
    length=len(dataset),
)
# Persist the trained tokenizer as a single JSON file for the later cells.
tokenizer.save(str(tokenizer_path))
print(f"Tokenizer saved to: {tokenizer_path}")

## Train Model

In [None]:
import os
from datetime import datetime

import pytorch_lightning as L
from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint
from pytorch_lightning.loggers import WandbLogger

from scripts.config import Config
from scripts.lightning_module import LlamaLightningModule
from utils.training_pipeline import get_data_module

# Per-run settings. The timestamp suffix gives every run its own name, which is
# also used as the checkpoint sub-directory further down.
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
run_name = f"llama_kaggle_{timestamp}"
checkpoint_dir = "./checkpoints"
use_wandb = False  # Set to True if you want to use W&B

# Start from the LLaMA preset and attach the run-specific values.
config = Config.for_llama()
config.run_name, config.checkpoint_dir, config.tokenizer_dir = (
    run_name,
    checkpoint_dir,
    tokenizer_dir,
)
config.wandb.enabled = use_wandb

# Kaggle-specific overrides: smaller batches for memory, one epoch for a quick demo.
config.data.batch_size = 16
config.training.num_epochs = 1

# Make sure the checkpoint directory exists before training starts.
os.makedirs(checkpoint_dir, exist_ok=True)

config.validate()

print("Training configuration:")
summary = (
    ("Run name", run_name),
    ("Batch size", config.data.batch_size),
    ("Model dim", config.model.model_dim),
    ("Num layers", config.model.num_layers),
    ("Max epochs", config.training.num_epochs),
)
print("\n".join(f"  {label}: {value}" for label, value in summary))

In [None]:
# Setup training
# Request the caching allocator's "expandable_segments" mode through the environment.
# NOTE(review): torch was imported and CUDA was queried in an earlier cell —
# confirm the allocator still reads this variable when it is set this late, and
# that the installed PyTorch honours the name PYTORCH_ALLOC_CONF (older releases
# document PYTORCH_CUDA_ALLOC_CONF).
os.environ["PYTORCH_ALLOC_CONF"] = "expandable_segments:True"
# "high" lets float32 matmuls use faster, reduced-precision kernels where available.
torch.set_float32_matmul_precision("high")
# Let cuDNN benchmark candidate algorithms and reuse the fastest one.
torch.backends.cudnn.benchmark = True

# Prepare data
print("Preparing data...")
# Build the project data module from the config and run its "fit"-stage setup
# here, before the trainer is created.
data_module = get_data_module(config)
data_module.setup(stage="fit")

# Initialize model
print("Initializing model...")
# Lightning wrapper around the model; the inference cell reaches the underlying
# network through its `.model` attribute.
pl_module = LlamaLightningModule(config)

In [None]:
# Callbacks. Both watch the logged "val_loss" metric and treat lower as better.
early_stopping = EarlyStopping(
    monitor="val_loss",
    mode="min",
    patience=config.training.early_stopping_patience,
    verbose=True,
)
# Keeps only the single best checkpoint, written under <checkpoint_dir>/<run_name>/.
best_checkpoint = ModelCheckpoint(
    dirpath=f"{config.checkpoint_dir}/{config.run_name}",
    filename="best_model",
    monitor="val_loss",
    mode="min",
    save_top_k=1,
    verbose=True,
)
callbacks = [early_stopping, best_checkpoint]

# Logger (optional): stays None unless W&B logging is enabled in the config.
logger = None
if config.wandb.enabled:
    logger = WandbLogger(
        project=config.wandb.project,
        entity=config.wandb.entity,
        name=config.run_name,
        log_model=config.wandb.log_model,
    )
    # Record the full run configuration alongside the W&B run.
    logger.experiment.config.update(config.to_dict())

In [None]:
# Initialize trainer
# Epoch count, gradient clipping, precision and validation cadence all come from
# the config; callbacks and logger were built in the previous cell.
trainer = L.Trainer(
    max_epochs=config.training.num_epochs,
    callbacks=callbacks,
    logger=logger,
    gradient_clip_val=config.training.gradient_clip_val,
    precision=config.training.precision,
    val_check_interval=config.training.val_check_interval,
    log_every_n_steps=50,
    enable_progress_bar=True,
    enable_model_summary=True,
)

# Train
print("Starting training...")
trainer.fit(pl_module, datamodule=data_module)

print("Training completed!")
# Ask the checkpoint callback where the best checkpoint was actually written
# instead of assuming the file name. `best_model_path` is an empty string when
# no checkpoint was saved (e.g. validation never produced "val_loss").
checkpoint_callback = trainer.checkpoint_callback
saved_path = checkpoint_callback.best_model_path if checkpoint_callback else ""
if saved_path:
    checkpoint_path = saved_path
    print(f"Best checkpoint saved at: {checkpoint_path}")
else:
    # Keep `checkpoint_path` defined for later cells, but do not claim success.
    checkpoint_path = f"{config.checkpoint_dir}/{config.run_name}/best_model.ckpt"
    print(f"Warning: no checkpoint was recorded; expected location: {checkpoint_path}")

## Test Inference

In [None]:
from tokenizers import Tokenizer

# Load trained model for inference
checkpoint_path = f"{config.checkpoint_dir}/{config.run_name}/best_model.ckpt"
model = LlamaLightningModule.load_from_checkpoint(checkpoint_path, config=config)
model.eval()
# Use the GPU when one is present, otherwise fall back to CPU so this cell does
# not crash on a session without an accelerator.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Load tokenizer
tokenizer = Tokenizer.from_file(str(tokenizer_path))

# Test generation
prompt = "The future of artificial intelligence"
encoding = tokenizer.encode(prompt)
# Wrap the ids in a list so the tensor has a leading batch dimension of 1.
input_ids = torch.tensor([encoding.ids], device=device)

print(f"Prompt: {prompt}")
print("Generating...")

# Simple greedy generation: repeatedly append the highest-scoring next token.
max_new_tokens = 50
with torch.no_grad():
    for _ in range(max_new_tokens):
        # presumably returns logits shaped (batch, seq, vocab) — TODO confirm;
        # the indexing below takes the last sequence position.
        outputs = model.model(input_ids)
        next_token = outputs[:, -1, :].argmax(dim=-1, keepdim=True)
        input_ids = torch.cat([input_ids, next_token], dim=1)

        # Stop as soon as the end-of-sequence id is produced (batch size is 1,
        # so .item() is valid).
        if next_token.item() == config.inference.eos_idx:
            break

# Decode
generated_ids = input_ids[0].cpu().tolist()
generated_text = tokenizer.decode(generated_ids)
print(f"\nGenerated: {generated_text}")

## Download Checkpoint

Download the trained model checkpoint to use locally.

In [None]:
# Create archive for download
# Bundle the checkpoints/ and tokenizers/ directories (relative to the current
# working directory) into a single gzip-compressed tarball.
!tar -czf llama_model.tar.gz checkpoints/ tokenizers/
print("Model and tokenizer saved to llama_model.tar.gz")
print("Download this file from Kaggle Output section")