In [None]:
# ⚙️ Setup
import subprocess
import sys

# The google.colab import is used purely as a runtime probe: when it succeeds,
# numba is (re)installed with pip before it is imported below.
try:
    import google.colab  # noqa: F401
except ImportError:
    # google.colab is not importable here; skip the pip install.
    pass
else:
    # Kept out of the try body so the ImportError handler guards only the probe.
    subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", "numba"])

import numpy as np
from numba import cuda

print("⚠️  CUDA C++ is PRIMARY. Python/Numba for quick testing only.")
print("⚠️  Multi-GPU code requires multiple physical GPUs!")
if cuda.is_available():
    print(f"GPU: {cuda.get_current_device().name}")
else:
    # Say so explicitly instead of printing nothing when no GPU is usable.
    print("No CUDA-capable GPU detected.")

---

## Part 1: Device Query and Selection

### CUDA C++ Device Management (Primary)

```cpp
// device_query.cu - Enumerate and select GPUs
#include <stdio.h>
#include <cuda_runtime.h>

int main() {
    // ============================================
    // Query Available GPUs
    // ============================================
    int deviceCount;
    cudaGetDeviceCount(&deviceCount);
    
    printf("Found %d CUDA device(s)\n", deviceCount);
    
    for (int i = 0; i < deviceCount; i++) {
        cudaDeviceProp prop;
        cudaGetDeviceProperties(&prop, i);
        
        printf("\nDevice %d: %s\n", i, prop.name);
        printf("  Compute Capability: %d.%d\n", prop.major, prop.minor);
        printf("  Global Memory: %.2f GB\n", 
               prop.totalGlobalMem / (1024.0 * 1024.0 * 1024.0));
        printf("  SM Count: %d\n", prop.multiProcessorCount);
        printf("  Max Threads per Block: %d\n", prop.maxThreadsPerBlock);
        printf("  Memory Clock: %.2f GHz\n", prop.memoryClockRate / 1e6);
        printf("  Memory Bus Width: %d bits\n", prop.memoryBusWidth);
    }
    
    // ============================================
    // Select a Device
    // ============================================
    int selectedDevice = 0;  // Use first GPU
    cudaSetDevice(selectedDevice);
    
    printf("\nSelected device %d for computation\n", selectedDevice);
    
    // ============================================
    // Query Current Device
    // ============================================
    int currentDevice;
    cudaGetDevice(&currentDevice);
    printf("Current device: %d\n", currentDevice);
    
    return 0;
}
```

---

## Part 2: Peer-to-Peer Access

### Direct GPU-to-GPU Communication

```
Without P2P:                   With P2P:
━━━━━━━━━━━━                   ━━━━━━━━━

GPU 0 ─────> Host ─────> GPU 1    GPU 0 ════════> GPU 1
       copy       copy                  direct copy

• 2x latency                   • 1x latency
• Host memory bottleneck       • PCIe/NVLink speed
• CPU involved                 • GPU-to-GPU direct
```

### CUDA C++ P2P Setup (Primary)

```cpp
// p2p_access.cu - Enable peer access between GPUs
#include <stdio.h>
#include <cuda_runtime.h>

int main() {
    int deviceCount;
    cudaGetDeviceCount(&deviceCount);
    
    if (deviceCount < 2) {
        printf("Need at least 2 GPUs for P2P demo\n");
        return 1;
    }
    
    int gpu0 = 0, gpu1 = 1;
    
    // ============================================
    // Check P2P Capability
    // ============================================
    int canAccessPeer0to1, canAccessPeer1to0;
    
    cudaDeviceCanAccessPeer(&canAccessPeer0to1, gpu0, gpu1);
    cudaDeviceCanAccessPeer(&canAccessPeer1to0, gpu1, gpu0);
    
    printf("P2P GPU %d -> GPU %d: %s\n", 
           gpu0, gpu1, canAccessPeer0to1 ? "YES" : "NO");
    printf("P2P GPU %d -> GPU %d: %s\n", 
           gpu1, gpu0, canAccessPeer1to0 ? "YES" : "NO");
    
    if (!canAccessPeer0to1 || !canAccessPeer1to0) {
        printf("P2P not supported between these GPUs\n");
        return 1;
    }
    
    // ============================================
    // Enable P2P Access
    // ============================================
    cudaSetDevice(gpu0);
    cudaDeviceEnablePeerAccess(gpu1, 0);  // Enable 0 -> 1
    
    cudaSetDevice(gpu1);
    cudaDeviceEnablePeerAccess(gpu0, 0);  // Enable 1 -> 0
    
    printf("P2P access enabled between GPU %d and GPU %d\n", gpu0, gpu1);
    
    // ============================================
    // Allocate Memory on Each GPU
    // ============================================
    const int N = 1 << 20;
    float *d_data0, *d_data1;
    
    cudaSetDevice(gpu0);
    cudaMalloc(&d_data0, N * sizeof(float));
    
    cudaSetDevice(gpu1);
    cudaMalloc(&d_data1, N * sizeof(float));
    
    // ============================================
    // Direct P2P Copy
    // ============================================
    cudaMemcpyPeer(d_data1, gpu1, d_data0, gpu0, N * sizeof(float));
    printf("Copied data directly from GPU %d to GPU %d\n", gpu0, gpu1);
    
    // ============================================
    // Cleanup
    // ============================================
    cudaSetDevice(gpu0);
    cudaDeviceDisablePeerAccess(gpu1);
    cudaFree(d_data0);
    
    cudaSetDevice(gpu1);
    cudaDeviceDisablePeerAccess(gpu0);
    cudaFree(d_data1);
    
    return 0;
}
```

---

## Part 3: Running Kernels on Multiple GPUs

### CUDA C++ Multi-GPU Kernel Execution (Primary)

```cpp
// multi_gpu_kernel.cu - Execute kernels on multiple GPUs
#include <stdio.h>
#include <cuda_runtime.h>

// Kernel: each thread rewrites one element of `data` as data[i] * 2 + gpuId.
//   data  - buffer to update in place
//   n     - number of elements to process
//   gpuId - value added to every doubled element
__global__ void process(float* data, int n, int gpuId) {
    const int idx = threadIdx.x + blockDim.x * blockIdx.x;
    // Threads whose global index is not below n have no element to update.
    if (idx >= n) {
        return;
    }
    data[idx] = data[idx] * 2.0f + static_cast<float>(gpuId);
}

int main() {
    int deviceCount;
    cudaGetDeviceCount(&deviceCount);
    
    if (deviceCount < 2) {
        printf("Need at least 2 GPUs\n");
        return 1;
    }
    
    const int N = 1 << 20;
    const int NUM_GPUS = 2;
    const int N_PER_GPU = N / NUM_GPUS;
    
    // Host data
    float* h_data;
    cudaMallocHost(&h_data, N * sizeof(float));
    for (int i = 0; i < N; i++) h_data[i] = 1.0f;
    
    // Device data and streams
    float* d_data[NUM_GPUS];
    cudaStream_t streams[NUM_GPUS];
    
    // ============================================
    // Setup Each GPU
    // ============================================
    for (int gpu = 0; gpu < NUM_GPUS; gpu++) {
        cudaSetDevice(gpu);
        cudaMalloc(&d_data[gpu], N_PER_GPU * sizeof(float));
        cudaStreamCreate(&streams[gpu]);
    }
    
    // ============================================
    // Copy Data to Each GPU (Async)
    // ============================================
    for (int gpu = 0; gpu < NUM_GPUS; gpu++) {
        cudaSetDevice(gpu);
        int offset = gpu * N_PER_GPU;
        cudaMemcpyAsync(d_data[gpu], h_data + offset,
                        N_PER_GPU * sizeof(float),
                        cudaMemcpyHostToDevice, streams[gpu]);
    }
    
    // ============================================
    // Launch Kernels on Each GPU
    // ============================================
    int blockSize = 256;
    int numBlocks = (N_PER_GPU + blockSize - 1) / blockSize;
    
    for (int gpu = 0; gpu < NUM_GPUS; gpu++) {
        cudaSetDevice(gpu);
        process<<<numBlocks, blockSize, 0, streams[gpu]>>>(
            d_data[gpu], N_PER_GPU, gpu);
    }
    
    // ============================================
    // Copy Results Back (Async)
    // ============================================
    for (int gpu = 0; gpu < NUM_GPUS; gpu++) {
        cudaSetDevice(gpu);
        int offset = gpu * N_PER_GPU;
        cudaMemcpyAsync(h_data + offset, d_data[gpu],
                        N_PER_GPU * sizeof(float),
                        cudaMemcpyDeviceToHost, streams[gpu]);
    }
    
    // ============================================
    // Synchronize All GPUs
    // ============================================
    for (int gpu = 0; gpu < NUM_GPUS; gpu++) {
        cudaSetDevice(gpu);
        cudaStreamSynchronize(streams[gpu]);
    }
    
    // Verify
    printf("GPU 0 result[0] = %.1f (expected 2.0)\n", h_data[0]);
    printf("GPU 1 result[0] = %.1f (expected 3.0)\n", h_data[N_PER_GPU]);
    
    // Cleanup
    for (int gpu = 0; gpu < NUM_GPUS; gpu++) {
        cudaSetDevice(gpu);
        cudaFree(d_data[gpu]);
        cudaStreamDestroy(streams[gpu]);
    }
    cudaFreeHost(h_data);
    
    printf("Multi-GPU computation complete!\n");
    return 0;
}
```

---

## Part 4: Unified Memory for Multi-GPU

### Automatic Data Migration

```cpp
// unified_multi_gpu.cu - Unified Memory with multiple GPUs
#include <stdio.h>
#include <cuda_runtime.h>

// Kernel: each thread doubles one element of the slice data[start .. start+n).
//   data  - buffer to update in place
//   start - index of the first element of the slice
//   n     - number of elements in the slice
__global__ void process(float* data, int start, int n) {
    const int idx = threadIdx.x + blockDim.x * blockIdx.x;
    // Threads whose index is not below n have no element to update.
    if (idx >= n) {
        return;
    }
    float* slice = data + start;
    slice[idx] *= 2.0f;
}

int main() {
    int deviceCount;
    cudaGetDeviceCount(&deviceCount);
    
    if (deviceCount < 2) {
        printf("Need 2+ GPUs\n");
        return 1;
    }
    
    const int N = 1 << 20;
    const int NUM_GPUS = 2;
    const int N_PER_GPU = N / NUM_GPUS;
    
    // ============================================
    // Allocate Unified Memory
    // ============================================
    float* data;
    cudaMallocManaged(&data, N * sizeof(float));
    
    // Initialize on host
    for (int i = 0; i < N; i++) data[i] = 1.0f;
    
    // ============================================
    // Give Hints About Data Location
    // ============================================
    for (int gpu = 0; gpu < NUM_GPUS; gpu++) {
        int offset = gpu * N_PER_GPU;
        size_t size = N_PER_GPU * sizeof(float);
        
        // Hint: This data is preferred on this GPU
        cudaMemAdvise(data + offset, size, 
                      cudaMemAdviseSetPreferredLocation, gpu);
    }
    
    // ============================================
    // Prefetch Data to GPUs
    // ============================================
    cudaStream_t streams[NUM_GPUS];
    for (int gpu = 0; gpu < NUM_GPUS; gpu++) {
        cudaSetDevice(gpu);
        cudaStreamCreate(&streams[gpu]);
        
        int offset = gpu * N_PER_GPU;
        cudaMemPrefetchAsync(data + offset, 
                             N_PER_GPU * sizeof(float),
                             gpu, streams[gpu]);
    }
    
    // ============================================
    // Launch Kernels
    // ============================================
    int blockSize = 256;
    int numBlocks = (N_PER_GPU + blockSize - 1) / blockSize;
    
    for (int gpu = 0; gpu < NUM_GPUS; gpu++) {
        cudaSetDevice(gpu);
        int offset = gpu * N_PER_GPU;
        
        process<<<numBlocks, blockSize, 0, streams[gpu]>>>(
            data, offset, N_PER_GPU);
    }
    
    // ============================================
    // Prefetch Back to Host
    // ============================================
    for (int gpu = 0; gpu < NUM_GPUS; gpu++) {
        cudaSetDevice(gpu);
        int offset = gpu * N_PER_GPU;
        cudaMemPrefetchAsync(data + offset,
                             N_PER_GPU * sizeof(float),
                             cudaCpuDeviceId, streams[gpu]);
    }
    
    // Sync all
    for (int gpu = 0; gpu < NUM_GPUS; gpu++) {
        cudaSetDevice(gpu);
        cudaStreamSynchronize(streams[gpu]);
    }
    
    // Verify on host
    printf("data[0] = %.1f (expected 2.0)\n", data[0]);
    printf("data[%d] = %.1f (expected 2.0)\n", N_PER_GPU, data[N_PER_GPU]);
    
    // Cleanup
    for (int gpu = 0; gpu < NUM_GPUS; gpu++) {
        cudaSetDevice(gpu);
        cudaStreamDestroy(streams[gpu]);
    }
    cudaFree(data);
    
    return 0;
}
```

---

## Exercises

### Exercise 1: Device Query
Write code to find the GPU with the most memory.

### Exercise 2: P2P Bandwidth
Measure P2P copy bandwidth between two GPUs.

### Exercise 3: Multi-GPU Vector Add
Implement vector addition split across 2 GPUs.

---

## Key Takeaways

```
┌─────────────────────────────────────────────────────────┐
│                MULTI-GPU BASICS                         │
├─────────────────────────────────────────────────────────┤
│                                                         │
│  Device Management:                                     │
│  • cudaGetDeviceCount(&count)                           │
│  • cudaSetDevice(id)                                    │
│  • cudaGetDevice(&id)                                   │
│                                                         │
│  Peer Access:                                           │
│  • cudaDeviceCanAccessPeer(&can, dev, peer)             │
│  • cudaDeviceEnablePeerAccess(peer, 0)                  │
│  • cudaMemcpyPeer(dst, dstDev, src, srcDev, size)       │
│                                                         │
│  Pattern:                                               │
│  1. Query/select devices                                │
│  2. Enable P2P if available                             │
│  3. Allocate memory on each GPU                         │
│  4. Distribute data                                     │
│  5. Launch kernels                                      │
│  6. Collect results                                     │
│                                                         │
└─────────────────────────────────────────────────────────┘
```

## Next: Day 2 - Multi-GPU Patterns