# GPU Smoke Test

Interactive GPU verification and quick benchmark for aws-bootstrap instances.

Run each cell top-to-bottom to verify the CUDA stack, exercise FP32/FP16 operations,
train a small CNN on MNIST, and visualise loss and memory usage.

For the full CLI benchmark (CNN + Transformer, configurable precision/batch-size),
run `python ~/gpu_benchmark.py` from a terminal.

In [None]:
import sys

import torch


# Software stack: interpreter, PyTorch, and the CUDA/cuDNN versions PyTorch
# reports. Printed before the CUDA check so the versions are visible even
# when that check fails.
print(
    f"Python  : {sys.version.split()[0]}",
    f"PyTorch : {torch.__version__}",
    f"CUDA    : {torch.version.cuda}",
    f"cuDNN   : {torch.backends.cudnn.version()}",
    sep="\n",
)

# Stop here if PyTorch cannot see a CUDA device.
assert torch.cuda.is_available(), "CUDA is not available!"

# Hardware summary for device 0 (total memory is shown in units of 1024**3 bytes).
props = torch.cuda.get_device_properties(0)
print(
    f"\nGPU           : {props.name}",
    f"Compute cap.  : {props.major}.{props.minor}",
    f"Total memory  : {props.total_memory / (1024**3):.1f} GB",
    f"SM count      : {props.multi_processor_count}",
    sep="\n",
)

## CUDA Smoke Tests

In [None]:
import torch


N = 1024


def _timed_mm(a, b, warmup=3):
    """Return ``(torch.mm(a, b), elapsed_ms)`` measured with CUDA events.

    ``warmup`` untimed calls run first so that one-time start-up work done on
    the first use of an operation (e.g. lazy library initialisation) is not
    attributed to the measured call.
    """
    for _ in range(warmup):
        torch.mm(a, b)
    torch.cuda.synchronize()  # drain the warm-up work before starting the clock

    start = torch.cuda.Event(enable_timing=True)
    end = torch.cuda.Event(enable_timing=True)
    start.record()
    out = torch.mm(a, b)
    end.record()
    torch.cuda.synchronize()  # both events must have completed before elapsed_time()
    return out, start.elapsed_time(end)


# --- FP32 matmul ---
a32 = torch.randn(N, N, device="cuda")
b32 = torch.randn(N, N, device="cuda")
c32, fp32_ms = _timed_mm(a32, b32)
print(f"FP32 matmul ({N}x{N}): {fp32_ms:.2f} ms")

# --- FP16 matmul ---
a16 = a32.half()
b16 = b32.half()
c16, fp16_ms = _timed_mm(a16, b16)
print(f"FP16 matmul ({N}x{N}): {fp16_ms:.2f} ms")

# Correctness check: FP16 result should be close to FP32.
diff = (c32 - c16.float()).abs().max().item()
# Scale the tolerance to the output magnitude. A fixed bound of N is larger
# than any entry of c32 for these inputs, so it could not detect an all-zero
# FP16 result; 5% of the largest FP32 entry is still a loose bound but can fail.
tol = 0.05 * c32.abs().max().item()
print(f"Max abs diff FP32 vs FP16: {diff:.4f}")
assert diff < tol, f"Unexpectedly large diff: {diff}"
print("PASSED")

In [None]:
import torch
import torch.nn as nn


# AMP autocast: Linear + Conv2d
# Both modules keep FP32 parameters and receive FP32 inputs; under autocast
# their outputs are expected to come back in the lower-precision dtype.
linear = nn.Linear(512, 512).cuda()
conv = nn.Conv2d(3, 64, 3, padding=1).cuda()

x_lin = torch.randn(32, 512, device="cuda")
x_conv = torch.randn(4, 3, 32, 32, device="cuda")

# Request float16 explicitly so the dtype checked below is unambiguous.
amp_dtype = torch.float16
with torch.amp.autocast(device_type="cuda", dtype=amp_dtype):
    y_lin = linear(x_lin)
    y_conv = conv(x_conv)

torch.cuda.synchronize()
print(f"Linear  output: {y_lin.shape}, dtype={y_lin.dtype}")
print(f"Conv2d  output: {y_conv.shape}, dtype={y_conv.dtype}")

# Without this check the cell would report success even if autocast had no effect.
for name, out in (("Linear", y_lin), ("Conv2d", y_conv)):
    assert out.dtype == amp_dtype, f"{name} output is {out.dtype}, expected {amp_dtype}"
print("AMP autocast PASSED")

In [None]:
import torch


# Allocate one large tensor, confirm the allocator accounts for it, then free
# it and confirm the accounting returns to the starting level.
# Sizes are reported in binary units (1024**2 / 1024**3 bytes), matching the
# other cells of this notebook.
torch.cuda.empty_cache()
torch.cuda.reset_peak_memory_stats()

before = torch.cuda.memory_allocated()
big = torch.randn(4096, 4096, device="cuda")  # 4096 * 4096 * 4 bytes = 64 MB
expected = big.numel() * big.element_size()
allocated = torch.cuda.memory_allocated()
total = torch.cuda.get_device_properties(0).total_memory

print(f"Before alloc : {before / (1024**2):.1f} MB")
print(f"After alloc  : {allocated / (1024**2):.1f} MB")
print(f"Total GPU mem: {total / (1024**3):.1f} GB")
# The tensor is still alive here, so at least its payload must be accounted for.
assert allocated - before >= expected, "Allocation not reflected in memory_allocated()!"

del big
torch.cuda.empty_cache()
after_free = torch.cuda.memory_allocated()
print(f"After free   : {after_free / (1024**2):.1f} MB")
# Allow 1 MB of slack over the starting level.
assert after_free <= before + 1024**2, "Memory not freed!"
print("Memory alloc/free PASSED")

## Quick Training Benchmark

Train a small CNN on MNIST for 5 epochs and collect the loss per batch.

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader
from torchvision import datasets, transforms


class MNISTConvNet(nn.Module):
    """Small convolutional classifier: 1-channel images in, 10 logits out.

    ``features`` stacks five 3x3 Conv -> BatchNorm -> ReLU stages
    (64, 64, 128, 128, 256 channels) with a 2x2 max-pool after the second and
    fourth stage and an adaptive average pool to 1x1 at the end.
    ``classifier`` flattens the 256 pooled features and maps them to 10
    outputs through one 256-unit hidden layer with dropout.
    """

    @staticmethod
    def _conv_stage(in_channels, out_channels):
        """Return the layers of one 3x3 Conv -> BatchNorm -> ReLU stage."""
        return [
            nn.Conv2d(in_channels, out_channels, 3, padding=1),
            nn.BatchNorm2d(out_channels),
            nn.ReLU(inplace=True),
        ]

    def __init__(self):
        super().__init__()
        stage = self._conv_stage
        # Layers are created left to right, so indices inside the Sequential
        # (and therefore state_dict keys) follow the listing order.
        self.features = nn.Sequential(
            *stage(1, 64),
            *stage(64, 64),
            nn.MaxPool2d(2),
            *stage(64, 128),
            *stage(128, 128),
            nn.MaxPool2d(2),
            *stage(128, 256),
            nn.AdaptiveAvgPool2d(1),
        )
        head = [
            nn.Flatten(),
            nn.Linear(256, 256),
            nn.ReLU(inplace=True),
            nn.Dropout(0.5),
            nn.Linear(256, 10),
        ]
        self.classifier = nn.Sequential(*head)

    def forward(self, x):
        """Run the feature extractor, then the classifier head, on ``x``."""
        pooled = self.features(x)
        return self.classifier(pooled)


device = torch.device("cuda")
# Reset so the peak reported after training reflects this cell onwards only.
torch.cuda.reset_peak_memory_stats()

# Normalisation constants are presumably the usual MNIST mean/std; they are
# not derived in this notebook.
transform = transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))])
train_set = datasets.MNIST("/tmp/data", train=True, download=True, transform=transform)
loader = DataLoader(train_set, batch_size=256, shuffle=True, num_workers=2, pin_memory=True)

model = MNISTConvNet().to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
scaler = torch.amp.GradScaler("cuda")

NUM_EPOCHS = 5
losses = []  # one entry per batch, accumulated across all epochs

for epoch in range(1, NUM_EPOCHS + 1):
    model.train()
    epoch_loss = 0.0
    for images, labels in loader:
        images = images.to(device, non_blocking=True)
        labels = labels.to(device, non_blocking=True)
        optimizer.zero_grad(set_to_none=True)
        # Forward pass and loss under autocast; backward/step go through the
        # gradient scaler.
        with torch.amp.autocast(device_type="cuda"):
            loss = F.cross_entropy(model(images), labels)
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()
        # .item() copies the scalar back to the host, so read it once per batch
        # and reuse the Python float.
        batch_loss = loss.item()
        losses.append(batch_loss)
        epoch_loss += batch_loss
    avg = epoch_loss / len(loader)
    print(f"Epoch {epoch}/{NUM_EPOCHS}  avg loss: {avg:.4f}")

peak_mb = torch.cuda.max_memory_allocated() / (1024**2)
print(f"\nPeak GPU memory during training: {peak_mb:.0f} MB")
print(f"Total batches: {len(losses)}")

In [None]:
import matplotlib.pyplot as plt


# Line plot of the per-batch training loss held in `losses`.
fig, ax = plt.subplots(figsize=(10, 4))
ax.plot(losses, linewidth=0.8, alpha=0.7)
ax.set_xlabel("Batch")
ax.set_ylabel("Cross-Entropy Loss")
ax.set_title("MNIST CNN Training Loss")
ax.grid(True, alpha=0.3)
fig.tight_layout()
plt.show()

## GPU Memory Usage

In [None]:
import matplotlib.pyplot as plt
import torch


# Peak tensor memory recorded by PyTorch versus the capacity of device 0,
# both in units of 1024**2 bytes.
peak_mb = torch.cuda.max_memory_allocated() / (1024**2)
total_mb = torch.cuda.get_device_properties(0).total_memory / (1024**2)
free_mb = total_mb - peak_mb

categories = ["Peak Used", "Remaining"]
amounts = [peak_mb, free_mb]
palette = ["#e74c3c", "#2ecc71"]

fig, ax = plt.subplots(figsize=(6, 4))
bars = ax.bar(categories, amounts, color=palette)
ax.set_ylabel("MB")
ax.set_title(f"GPU Memory: {peak_mb:.0f} MB peak / {total_mb:.0f} MB total")

# Write each bar's value just above it (offset of 50 in y-axis units).
label_style = {"ha": "center", "va": "bottom", "fontsize": 11}
for bar in bars:
    x_mid = bar.get_x() + bar.get_width() / 2
    y_top = bar.get_height() + 50
    ax.text(x_mid, y_top, f"{bar.get_height():.0f}", **label_style)

fig.tight_layout()
plt.show()

## Summary

If all cells above ran without error, the CUDA stack is healthy and the GPU is
ready for training workloads.

### Next steps

- **Full benchmark** (CNN + Transformer, configurable precision): `python ~/gpu_benchmark.py`
- **Jupyter tips**: use `!nvidia-smi` in a cell to check GPU utilisation at any time
- **VSCode Remote SSH**: connect with `ssh aws-gpu<N>` for a full IDE experience