# Multi-GPU Training Guide for AI Lab Platform

This notebook demonstrates how to effectively use all 4 NVIDIA RTX 2080 Ti GPUs for deep learning training.

## Contents
1. GPU Setup and Verification
2. DataParallel Training (Easy Mode)
3. DistributedDataParallel Training (Performance Mode)
4. Multi-GPU Best Practices
5. Monitoring and Debugging
6. Summary

## 1. GPU Setup and Verification

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import numpy as np
import time
import os

# Report the PyTorch build and what CUDA hardware is visible to it
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
gpu_count = torch.cuda.device_count()
print(f"Number of GPUs: {gpu_count}")

# Describe every visible GPU: name, total memory in GiB, compute capability
for gpu_index in range(gpu_count):
    # Fetch the properties once and reuse them for all three fields
    props = torch.cuda.get_device_properties(gpu_index)
    total_gb = props.total_memory / 1024**3
    print(f"\nGPU {gpu_index}: {torch.cuda.get_device_name(gpu_index)}")
    print(f"  Memory: {total_gb:.1f} GB")
    print(f"  Capability: {props.major}.{props.minor}")

## 2. DataParallel Training (Easy Mode)

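DataParallel is the easiest way to use multiple GPUs. It runs in a single process: on every forward pass it splits each input batch across the GPUs, runs a replica of the model on each chunk, and gathers the outputs back on the first GPU.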

In [None]:
# Define a simple model
class SimpleModel(nn.Module):
    """Fully connected classifier: three Linear layers with ReLU and dropout.

    Args:
        input_size: Number of features in each input sample.
        hidden_size: Width of both hidden layers.
        output_size: Number of output values (logits) per sample.
    """

    def __init__(self, input_size=1024, hidden_size=2048, output_size=10):
        super().__init__()
        # The position of each module in this list becomes its index inside
        # the Sequential container (and therefore its state_dict key).
        stack = [
            nn.Linear(input_size, hidden_size),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(hidden_size, hidden_size),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(hidden_size, output_size),
        ]
        self.layers = nn.Sequential(*stack)

    def forward(self, x):
        """Pass ``x`` through the layer stack and return the result."""
        logits = self.layers(x)
        return logits

# Build the model; wrap it in DataParallel only when several GPUs are visible
model = SimpleModel()
num_gpus = torch.cuda.device_count()
if num_gpus > 1:
    print(f"Using {num_gpus} GPUs with DataParallel")
    model = nn.DataParallel(model)

# Move the (possibly wrapped) model onto the GPU
model = model.cuda()

print("Model created and moved to GPU(s)")

In [None]:
# Create synthetic dataset for demonstration
def create_synthetic_dataset(num_samples=10000, input_size=1024, num_classes=10):
    """Build a random classification dataset.

    Args:
        num_samples: Number of (features, label) pairs to generate.
        input_size: Number of features per sample.
        num_classes: Number of distinct labels; labels fall in
            ``[0, num_classes)``.

    Returns:
        A ``TensorDataset`` pairing a ``(num_samples, input_size)`` tensor of
        standard-normal features with a ``(num_samples,)`` tensor of integer
        labels.
    """
    # Features are drawn before labels so seeded runs stay reproducible.
    features = torch.randn(num_samples, input_size)
    labels = torch.randint(low=0, high=num_classes, size=(num_samples,))
    return TensorDataset(features, labels)

# Create datasets
train_dataset = create_synthetic_dataset(10000)
val_dataset = create_synthetic_dataset(2000)

# Create data loaders with larger batch size for multi-GPU
# Rule of thumb: batch_size = single_gpu_batch_size * num_gpus
# device_count() is 0 when no GPU is visible; clamp to 1 so the batch size
# never becomes 0, which DataLoader rejects with a ValueError.
batch_size = 256 * max(1, torch.cuda.device_count())
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=4)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, num_workers=4)

print(f"Using batch size: {batch_size}")

In [None]:
# Training function with GPU utilization monitoring
def train_epoch(model, loader, criterion, optimizer, epoch):
    """Train ``model`` for one pass over ``loader`` and print progress.

    Every 10th batch, the allocated/reserved memory of each visible GPU is
    printed together with the current batch loss. After the pass, the epoch
    duration, average loss, accuracy and throughput are printed.

    Args:
        model: Module to train; switched to training mode here.
        loader: Sized iterable yielding ``(data, target)`` tensor pairs.
        criterion: Loss function called as ``criterion(output, target)``.
        optimizer: Optimizer stepping the model's parameters.
        epoch: Epoch number, used only in the printed summary.

    Returns:
        Tuple ``(avg_loss, accuracy)``: mean per-batch loss and accuracy in
        percent. Both are ``0.0`` when the loader yields no batches.
    """
    model.train()

    # Send batches to the device that holds the model's parameters instead of
    # assuming the current CUDA device. For a DataParallel model this is its
    # first GPU; for a CPU model this keeps everything on the CPU.
    first_param = next(model.parameters(), None)
    if first_param is not None:
        device = first_param.device
    else:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    total_loss = 0.0
    correct = 0
    total = 0
    num_batches = 0

    start_time = time.perf_counter()

    for batch_idx, (data, target) in enumerate(loader):
        data, target = data.to(device), target.to(device)

        optimizer.zero_grad()
        output = model(data)
        loss = criterion(output, target)
        loss.backward()
        optimizer.step()

        batch_loss = loss.item()
        total_loss += batch_loss
        num_batches += 1
        _, predicted = output.max(1)
        total += target.size(0)
        correct += predicted.eq(target).sum().item()

        if batch_idx % 10 == 0:
            # Monitor GPU memory usage
            if torch.cuda.is_available():
                for i in range(torch.cuda.device_count()):
                    allocated = torch.cuda.memory_allocated(i) / 1024**3
                    reserved = torch.cuda.memory_reserved(i) / 1024**3
                    print(f"GPU {i}: {allocated:.1f}GB allocated, {reserved:.1f}GB reserved", end=" | ")
            print(f"\nBatch {batch_idx}/{len(loader)}, Loss: {batch_loss:.4f}")

    epoch_time = time.perf_counter() - start_time
    # Guard the averages so an empty loader reports zeros instead of raising
    # ZeroDivisionError.
    avg_loss = total_loss / num_batches if num_batches else 0.0
    accuracy = 100. * correct / total if total else 0.0
    # Throughput counts the samples actually processed in this pass.
    samples_per_second = total / epoch_time if epoch_time > 0 else 0.0

    print(f"\nEpoch {epoch} completed in {epoch_time:.1f}s")
    print(f"Average Loss: {avg_loss:.4f}, Accuracy: {accuracy:.2f}%")
    print(f"Training speed: {samples_per_second:.1f} samples/second")

    return avg_loss, accuracy

In [None]:
# Set up the loss and optimizer, then run three training epochs
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3)

print("Starting DataParallel training...\n")
for epoch in (0, 1, 2):
    # Epoch numbers shown to the user are 1-based
    train_loss, train_acc = train_epoch(
        model=model,
        loader=train_loader,
        criterion=criterion,
        optimizer=optimizer,
        epoch=epoch + 1,
    )

## 3. DistributedDataParallel Training (Performance Mode)

DistributedDataParallel (DDP) is more efficient than DataParallel, especially for larger models. Save the following script and run it with `torchrun`.

In [None]:
# Example DDP script - save as train_ddp.py and run with:
# torchrun --nproc_per_node=4 train_ddp.py
#
# This cell only prints the template text; it does not write train_ddp.py or
# start any processes. Inside the template, `YourModel` and the training loop
# are placeholders to fill in. The script reads the LOCAL_RANK environment
# variable, so it is meant to be started by a launcher that provides it
# (the torchrun command above).

print('''
Save the following as train_ddp.py:

import torch
import torch.nn as nn
import torch.distributed as dist
from torch.nn.parallel import DistributedDataParallel as DDP
import os

def setup():
    dist.init_process_group("nccl")
    torch.cuda.set_device(int(os.environ["LOCAL_RANK"]))

def cleanup():
    dist.destroy_process_group()

def train():
    setup()
    rank = dist.get_rank()
    device = rank % torch.cuda.device_count()
    
    model = YourModel().to(device)
    ddp_model = DDP(model, device_ids=[device])
    
    # Training loop here
    
    cleanup()

if __name__ == "__main__":
    train()
''')

## 4. Multi-GPU Best Practices

In [None]:
# Best Practice: Mixed Precision Training
from torch.cuda.amp import GradScaler, autocast

def train_mixed_precision(model, loader, criterion, optimizer, max_batches=6):
    """Use mixed precision to speed up training and reduce memory usage.

    Runs forward passes under ``autocast`` and uses a ``GradScaler`` to scale
    the loss before the backward pass and to drive the optimizer step.

    Args:
        model: Module to train; switched to training mode here.
        loader: Iterable yielding ``(data, target)`` tensor pairs.
        criterion: Loss function called as ``criterion(output, target)``.
        optimizer: Optimizer stepping the model's parameters.
        max_batches: Maximum number of batches to process. The default of 6
            keeps this a short demo; pass ``None`` to consume the whole
            loader.

    Returns:
        None. The loss is printed every 10th batch.
    """
    # Local import so the function does not depend on the notebook's import cell.
    from itertools import islice

    model.train()
    # NOTE(review): newer PyTorch releases appear to prefer torch.amp over
    # torch.cuda.amp for these two helpers - confirm against the installed
    # version before switching.
    scaler = GradScaler()

    # Send batches to the device that holds the model's parameters instead of
    # assuming the current CUDA device.
    first_param = next(model.parameters(), None)
    if first_param is not None:
        device = first_param.device
    else:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # islice stops after max_batches items without requesting an extra batch.
    for batch_idx, (data, target) in enumerate(islice(loader, max_batches)):
        data, target = data.to(device), target.to(device)

        optimizer.zero_grad()

        # Use autocast for mixed precision
        with autocast():
            output = model(data)
            loss = criterion(output, target)

        # Scale loss and backward
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()

        if batch_idx % 10 == 0:
            print(f"Batch {batch_idx}, Loss: {loss.item():.4f}")

# Run the mixed-precision demo with the model, loader, loss and optimizer
# created in the earlier cells.
print("Mixed precision training example:")
train_mixed_precision(model, train_loader, criterion, optimizer)

## 5. Monitoring and Debugging

In [None]:
# GPU Memory and Utilization Monitor
def get_gpu_memory_info():
    """Print a memory report for every visible GPU.

    For each device the report shows the device name plus allocated,
    reserved and total memory in GiB (bytes / 1024**3). "Free" is computed
    as total minus reserved. Nothing is returned.
    """
    bytes_per_gib = 1024**3
    for gpu_index in range(torch.cuda.device_count()):
        allocated_gb = torch.cuda.memory_allocated(gpu_index) / bytes_per_gib
        reserved_gb = torch.cuda.memory_reserved(gpu_index) / bytes_per_gib
        device_props = torch.cuda.get_device_properties(gpu_index)
        total_gb = device_props.total_memory / bytes_per_gib
        free_gb = total_gb - reserved_gb

        print(f"GPU {gpu_index}: {torch.cuda.get_device_name(gpu_index)}")
        print(f"  Allocated: {allocated_gb:.2f} GB")
        print(f"  Reserved:  {reserved_gb:.2f} GB")
        print(f"  Total:     {total_gb:.2f} GB")
        print(f"  Free:      {free_gb:.2f} GB")
        print()

# Print the current memory report for every visible GPU.
get_gpu_memory_info()

## Summary

### Key Takeaways:
1. **DataParallel** - Easy to use, good for prototyping
2. **DistributedDataParallel** - Better performance, recommended for production
3. **Mixed Precision** - Up to ~2x speedup (model- and hardware-dependent) with minimal code changes
4. **Batch Size** - Scale linearly with number of GPUs

### Your RTX 2080 Ti Setup:
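- 4 GPUs × 11GB = 44GB total VRAM (not pooled: with DataParallel/DDP each GPU holds a full copy of the model, so the model itself must fit in 11GB)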
- Excellent for most deep learning models
- Use mixed precision for best performance

Happy training! 🚀