# RTX-STone: Getting Started

Welcome to **RTX-STone**, PyTorch with native SM 12.0 (Blackwell) support for RTX 50-series GPUs!

This notebook will guide you through:
1. Verifying your installation
2. Checking GPU capabilities
3. Running basic PyTorch operations
4. Benchmarking performance across matrix sizes, precisions, and operations

## Supported GPUs
- RTX 5090 (32GB)
- RTX 5080 (16GB)
- RTX 5070 Ti (16GB)
- RTX 5070 (12GB)
- All future RTX 50-series GPUs with SM 12.0 (Blackwell)

## 1. Import Libraries

In [None]:
import torch
import time
import numpy as np
import matplotlib.pyplot as plt

# Show the version string of the PyTorch build that was just imported
print(f"PyTorch Version: {torch.__version__}")

## 2. Verify CUDA and GPU

In [None]:
# Report what this PyTorch build can see: CUDA runtime, cuDNN, device count
print(f"CUDA Available: {torch.cuda.is_available()}")
print(f"CUDA Version: {torch.version.cuda}")
print(f"cuDNN Version: {torch.backends.cudnn.version()}")
print(f"Number of GPUs: {torch.cuda.device_count()}")

# Describe device 0, or say so up front when no usable GPU is present
if not torch.cuda.is_available():
    print("\n✗ No CUDA GPU detected")
else:
    gpu_props = torch.cuda.get_device_properties(0)
    compute_cap = torch.cuda.get_device_capability(0)
    gpu_name = torch.cuda.get_device_name(0)

    print(f"\nGPU: {gpu_name}")
    print(f"Compute Capability: {compute_cap[0]}.{compute_cap[1]}")
    print(f"Total Memory: {gpu_props.total_memory / 1e9:.2f} GB")
    print(f"Multi Processors: {gpu_props.multi_processor_count}")
    print(f"Compiled Architectures: {torch.cuda.get_arch_list()}")

    # Anything other than compute capability (12, 0) gets the warning branch
    if compute_cap != (12, 0):
        print(f"\n⚠ GPU has SM {compute_cap[0]}.{compute_cap[1]}, not SM 12.0 (RTX 50-series)")
        print("  Performance optimizations are designed for RTX 50-series GPUs.")
    else:
        print("\n✓ RTX 50-series GPU detected! Native SM 12.0 support active.")

## 3. Basic GPU Operations

In [None]:
# Allocate two random 1000x1000 matrices directly on the CUDA device
x = torch.randn(1000, 1000, device='cuda')
y = torch.randn(1000, 1000, device='cuda')

# Describe the first operand
for label, value in (("shape", x.shape), ("device", x.device), ("dtype", x.dtype)):
    print(f"Tensor x {label}: {value}")

# Matrix product via the @ operator (same as torch.matmul)
z = x @ y
print(f"\nResult shape: {z.shape}")
print(f"Result device: {z.device}")

# Allocator counters after the tensors above were created, reported in MB
allocated_mb = torch.cuda.memory_allocated() / 1e6
reserved_mb = torch.cuda.memory_reserved() / 1e6
print(f"\nGPU Memory Allocated: {allocated_mb:.2f} MB")
print(f"GPU Memory Reserved: {reserved_mb:.2f} MB")

## 4. Performance Benchmark: Matrix Multiplication

In [None]:
def benchmark_matmul(size, dtype=torch.float32, iterations=100, warmup=10):
    """Benchmark square matrix multiplication on the CUDA device.

    Two random ``size`` x ``size`` matrices are created on ``'cuda'`` and
    multiplied ``iterations`` times. Every multiplication is timed
    individually with a pair of CUDA events.

    Args:
        size: Number of rows and columns of each operand.
        dtype: Element type used for both operands.
        iterations: Number of timed multiplications; must be at least 1.
        warmup: Number of untimed multiplications run before measuring.

    Returns:
        A tuple ``(mean_time, std_time, tflops)``: the mean and standard
        deviation of the per-multiplication time in milliseconds, and the
        throughput in TFLOPS derived from the mean time.

    Raises:
        ValueError: If ``iterations`` is less than 1. Without this check the
            statistics would be taken over an empty list and come back NaN.
    """
    # Validate before touching the GPU so bad arguments fail fast
    if iterations < 1:
        raise ValueError(f"iterations must be >= 1, got {iterations}")

    x = torch.randn(size, size, device='cuda', dtype=dtype)
    y = torch.randn(size, size, device='cuda', dtype=dtype)

    # Warmup: untimed runs so one-off startup costs stay out of the samples
    for _ in range(warmup):
        torch.matmul(x, y)
    torch.cuda.synchronize()

    # Benchmark
    start = torch.cuda.Event(enable_timing=True)
    end = torch.cuda.Event(enable_timing=True)

    times = []
    for _ in range(iterations):
        start.record()
        torch.matmul(x, y)
        end.record()
        # Wait for the GPU to finish before reading the elapsed time
        torch.cuda.synchronize()
        times.append(start.elapsed_time(end))

    mean_time = np.mean(times)
    std_time = np.std(times)

    # Matrix multiplication costs 2*N^3 FLOPs
    flops = 2 * size ** 3
    # mean_time is in ms, so FLOPs / ms / 1e9 equals FLOPs / s / 1e12 (TFLOPS)
    tflops = flops / mean_time / 1e9

    return mean_time, std_time, tflops

# Sweep FP32 matmul over several square sizes, printing one table row per size
sizes = [512, 1024, 2048, 4096]
results = {}

table_header = f"{'Size':<10} {'Time (ms)':<15} {'TFLOPS':<10}"
print("Benchmarking Matrix Multiplication (FP32)...\n")
print(table_header)
print("-" * 40)

for size in sizes:
    mean_time, std_time, tflops = benchmark_matmul(size, dtype=torch.float32, iterations=50)
    # Keep mean time and throughput per size for the plots further down
    results[size] = dict(time=mean_time, tflops=tflops)
    print(f"{size:<10} {mean_time:>7.3f} ± {std_time:<4.2f}  {tflops:>7.2f}")

## 5. Compare Different Precisions

In [None]:
def compare_precisions(size=2048, iterations=50):
    """Benchmark matmul in FP32, FP16 and BF16 and print a comparison table.

    Each precision is run through ``benchmark_matmul`` at the given ``size``
    and ``iterations``. Speedup is reported relative to the FP32 mean time,
    which is measured first.

    Returns:
        Dict mapping precision name to ``{'time': ..., 'tflops': ...}``.
    """
    dtype_by_label = {
        'FP32': torch.float32,
        'FP16': torch.float16,
        'BF16': torch.bfloat16,
    }

    print(f"Comparing Precisions (size={size})\n")
    print(f"{'Precision':<10} {'Time (ms)':<15} {'TFLOPS':<10} {'Speedup':<10}")
    print("-" * 50)

    summary = {}
    baseline_ms = None  # FP32 mean time; filled in on the first pass

    for label in dtype_by_label:
        mean_ms, std_ms, tflops = benchmark_matmul(
            size, dtype=dtype_by_label[label], iterations=iterations
        )
        summary[label] = {'time': mean_ms, 'tflops': tflops}

        is_baseline = label == 'FP32'
        if is_baseline:
            baseline_ms = mean_ms
        speedup = 1.0 if is_baseline else baseline_ms / mean_ms

        print(f"{label:<10} {mean_ms:>7.3f} ± {std_ms:<4.2f}  {tflops:>7.2f}   {speedup:>6.2f}x")

    return summary

# Run the comparison with its defaults and keep the per-precision results
precision_results = compare_precisions()

## 6. Visualize Performance

In [None]:
# Figure 1: FP32 matmul time (left) and throughput (right) vs. matrix size
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))

sizes_list = list(results)
times = [results[n]['time'] for n in sizes_list]
tflops_list = [results[n]['tflops'] for n in sizes_list]

# One entry per panel: (axis, y-values, extra line kwargs, y-label, title)
panels = [
    (ax1, times, {}, 'Time (ms)', 'Matrix Multiplication Performance (FP32)'),
    (ax2, tflops_list, {'color': 'green'}, 'TFLOPS', 'Throughput (TFLOPS)'),
]
for axis, values, line_kwargs, y_label, title in panels:
    axis.plot(sizes_list, values, 'o-', linewidth=2, markersize=8, **line_kwargs)
    axis.set_xlabel('Matrix Size', fontsize=12)
    axis.set_ylabel(y_label, fontsize=12)
    axis.set_title(title, fontsize=14, fontweight='bold')
    axis.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# Figure 2: throughput per precision as a bar chart
fig, ax = plt.subplots(figsize=(10, 6))
precisions_list = list(precision_results)
precision_tflops = [precision_results[label]['tflops'] for label in precisions_list]

bars = ax.bar(precisions_list, precision_tflops, color=['#3498db', '#e74c3c', '#2ecc71'])
ax.set_ylabel('TFLOPS', fontsize=12)
ax.set_title('Performance Across Precisions (2048x2048)', fontsize=14, fontweight='bold')
ax.grid(True, axis='y', alpha=0.3)

# Write each bar's value just above its top edge, centered horizontally
for bar in bars:
    top = bar.get_height()
    center_x = bar.get_x() + bar.get_width()/2.
    ax.text(center_x, top, f'{top:.1f}',
            ha='center', va='bottom', fontsize=11, fontweight='bold')

plt.tight_layout()
plt.show()

## 7. Test Different Operations

In [None]:
def benchmark_operation(operation_name, operation_fn, iterations=100, warmup=10):
    """Benchmark a generic zero-argument GPU operation.

    ``operation_fn`` is called ``warmup`` times untimed, then ``iterations``
    times between a single pair of CUDA events. The total elapsed time is
    divided evenly across the iterations.

    Args:
        operation_name: Label for the operation. It is not used by the
            measurement itself and is kept for the callers' benefit.
        operation_fn: Callable taking no arguments that runs the operation.
        iterations: Number of timed calls; must be at least 1.
        warmup: Number of untimed calls made before measuring.

    Returns:
        Mean time per call in milliseconds.

    Raises:
        ValueError: If ``iterations`` is less than 1. Without this check the
            final division would raise ``ZeroDivisionError`` only after the
            warmup work had already run.
    """
    # Validate before running anything so bad arguments fail fast
    if iterations < 1:
        raise ValueError(f"iterations must be >= 1, got {iterations}")

    # Warmup
    for _ in range(warmup):
        operation_fn()
    torch.cuda.synchronize()

    # Benchmark: one event pair brackets the whole timed loop
    start = torch.cuda.Event(enable_timing=True)
    end = torch.cuda.Event(enable_timing=True)

    start.record()
    for _ in range(iterations):
        operation_fn()
    end.record()
    # Wait for the GPU to finish before reading the elapsed time
    torch.cuda.synchronize()

    total_time = start.elapsed_time(end)
    return total_time / iterations

# Time a handful of common operations on 2048x2048 FP32 tensors
size = 2048
x = torch.randn(size, size, device='cuda')
y = torch.randn(size, size, device='cuda')

# Each entry is a zero-argument callable so benchmark_operation can invoke it
operations = {
    'Matrix Multiply': lambda: torch.matmul(x, y),
    'Element-wise Multiply': lambda: x * y,
    'Element-wise Add': lambda: x + y,
    'ReLU': lambda: torch.relu(x),
    'Softmax': lambda: torch.softmax(x, dim=-1),
    'LayerNorm': lambda: torch.nn.functional.layer_norm(x, (size,)),
}

print(f"Benchmarking Various Operations (size={size})\n")
print(f"{'Operation':<25} {'Time (ms)':<15}")
print("-" * 40)

for name, op in operations.items():
    # Not named `time`: that would rebind the imported `time` module
    elapsed_ms = benchmark_operation(name, op)
    print(f"{name:<25} {elapsed_ms:>10.4f}")

## 8. Memory Management

In [None]:
# Allocator counters before releasing the cache
print("GPU Memory Usage:\n")
print(f"Allocated: {torch.cuda.memory_allocated() / 1e9:.3f} GB")
print(f"Reserved:  {torch.cuda.memory_reserved() / 1e9:.3f} GB")
print(f"Max Allocated: {torch.cuda.max_memory_allocated() / 1e9:.3f} GB")

# Release cached blocks that are no longer in use; live tensors are unaffected
torch.cuda.empty_cache()

print("\nAfter clearing cache:")
print(f"Allocated: {torch.cuda.memory_allocated() / 1e9:.3f} GB")
print(f"Reserved:  {torch.cuda.memory_reserved() / 1e9:.3f} GB")

# GPU properties
props = torch.cuda.get_device_properties(0)
# Ask the driver for device-wide free memory. `total - memory_allocated()`
# only subtracts this process's live tensors, so it overstates what is free
# (it ignores the CUDA context, cached blocks, and other processes).
free_bytes, _total_bytes = torch.cuda.mem_get_info(0)
print(f"\nTotal GPU Memory: {props.total_memory / 1e9:.2f} GB")
print(f"Available: {free_bytes / 1e9:.2f} GB")

## Next Steps

Now that you've verified your installation, check out:

1. **02_Flash_Attention.ipynb** - Learn about Flash Attention 2 optimization
2. **03_Custom_Triton_Kernels.ipynb** - Write custom GPU kernels in Python with Triton
3. **04_LLM_Optimization.ipynb** - Optimize large language models
4. **05_Image_Generation.ipynb** - Optimize Stable Diffusion and FLUX

## Resources

- [GitHub Repository](https://github.com/kentstone84/pytorch-rtx5080-support)
- [PyTorch Documentation](https://pytorch.org/docs/)
- [Triton Documentation](https://triton-lang.org/)

---

**RTX-STone** - Unleash the power of your RTX 50-series GPU!