In [None]:
# ⚙️ Setup
import subprocess
import sys

# On Google Colab, install Numba before importing it. Outside Colab the
# google.colab import fails and the install step is skipped.
try:
    import google.colab  # noqa: F401  (imported only to detect Colab)
    subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", "numba"])
except ImportError:
    pass

import numpy as np
from numba import cuda

print("⚠️  CUDA C++ is PRIMARY. Python/Numba for quick testing only.")
print("⚠️  Multi-GPU code requires multiple physical GPUs!")
if cuda.is_available():
    print(f"GPU: {cuda.get_current_device().name}")
else:
    # Tell the reader up front instead of printing nothing at all.
    print("No CUDA-capable GPU detected by Numba.")

---

## Part 1: Device Query and Selection

### 🔷 CUDA C++ Device Management (Primary)

```cpp
// device_query.cu - Enumerate and select GPUs
#include <stdio.h>
#include <cuda_runtime.h>

int main() {
    // ============================================
    // Query Available GPUs
    // ============================================
    int deviceCount;
    cudaGetDeviceCount(&deviceCount);
    
    printf("Found %d CUDA device(s)\n", deviceCount);
    
    for (int i = 0; i < deviceCount; i++) {
        cudaDeviceProp prop;
        cudaGetDeviceProperties(&prop, i);
        
        printf("\nDevice %d: %s\n", i, prop.name);
        printf("  Compute Capability: %d.%d\n", prop.major, prop.minor);
        printf("  Global Memory: %.2f GB\n", 
               prop.totalGlobalMem / (1024.0 * 1024.0 * 1024.0));
        printf("  SM Count: %d\n", prop.multiProcessorCount);
        printf("  Max Threads per Block: %d\n", prop.maxThreadsPerBlock);
        printf("  Memory Clock: %.2f GHz\n", prop.memoryClockRate / 1e6);
        printf("  Memory Bus Width: %d bits\n", prop.memoryBusWidth);
    }
    
    // ============================================
    // Select a Device
    // ============================================
    int selectedDevice = 0;  // Use first GPU
    cudaSetDevice(selectedDevice);
    
    printf("\nSelected device %d for computation\n", selectedDevice);
    
    // ============================================
    // Query Current Device
    // ============================================
    int currentDevice;
    cudaGetDevice(&currentDevice);
    printf("Current device: %d\n", currentDevice);
    
    return 0;
}
```

### 🔷 CUDA C++ Implementation (Primary)

In [None]:
%%writefile device_query.cu
// device_query.cu - Enumerate and select GPUs
#include <stdio.h>
#include <cuda_runtime.h>

int main() {
    // ============================================
    // Query Available GPUs
    // ============================================
    int deviceCount;
    cudaGetDeviceCount(&deviceCount);
    
    printf("Found %d CUDA device(s)\n", deviceCount);
    
    for (int i = 0; i < deviceCount; i++) {
        cudaDeviceProp prop;
        cudaGetDeviceProperties(&prop, i);
        
        printf("\nDevice %d: %s\n", i, prop.name);
        printf("  Compute Capability: %d.%d\n", prop.major, prop.minor);
        printf("  Global Memory: %.2f GB\n", 
               prop.totalGlobalMem / (1024.0 * 1024.0 * 1024.0));
        printf("  SM Count: %d\n", prop.multiProcessorCount);
        printf("  Max Threads per Block: %d\n", prop.maxThreadsPerBlock);
        printf("  Memory Clock: %.2f GHz\n", prop.memoryClockRate / 1e6);
        printf("  Memory Bus Width: %d bits\n", prop.memoryBusWidth);
    }
    
    // ============================================
    // Select a Device
    // ============================================
    int selectedDevice = 0;  // Use first GPU
    cudaSetDevice(selectedDevice);
    
    printf("\nSelected device %d for computation\n", selectedDevice);
    
    // ============================================
    // Query Current Device
    // ============================================
    int currentDevice;
    cudaGetDevice(&currentDevice);
    printf("Current device: %d\n", currentDevice);
    
    return 0;
}

In [None]:
# Compile, then run only if nvcc succeeded (avoids running a stale or missing binary).
!nvcc -arch=sm_75 -o device_query device_query.cu && ./device_query

---

## Part 2: Peer-to-Peer Access

### Direct GPU-to-GPU Communication

```
Without P2P:                   With P2P:
━━━━━━━━━━━━                   ━━━━━━━━━

GPU 0 ─────> Host ─────> GPU 1    GPU 0 ════════> GPU 1
       copy       copy                  direct copy

• 2x latency                   • 1x latency
• Host memory bottleneck       • PCIe/NVLink speed
• CPU involved                 • GPU-to-GPU direct
```

### 🔷 CUDA C++ P2P Setup (Primary)

```cpp
// p2p_access.cu - Enable peer access between GPUs
#include <stdio.h>
#include <cuda_runtime.h>

int main() {
    int deviceCount;
    cudaGetDeviceCount(&deviceCount);
    
    if (deviceCount < 2) {
        printf("Need at least 2 GPUs for P2P demo\n");
        return 1;
    }
    
    int gpu0 = 0, gpu1 = 1;
    
    // ============================================
    // Check P2P Capability
    // ============================================
    int canAccessPeer0to1, canAccessPeer1to0;
    
    cudaDeviceCanAccessPeer(&canAccessPeer0to1, gpu0, gpu1);
    cudaDeviceCanAccessPeer(&canAccessPeer1to0, gpu1, gpu0);
    
    printf("P2P GPU %d -> GPU %d: %s\n", 
           gpu0, gpu1, canAccessPeer0to1 ? "YES" : "NO");
    printf("P2P GPU %d -> GPU %d: %s\n", 
           gpu1, gpu0, canAccessPeer1to0 ? "YES" : "NO");
    
    if (!canAccessPeer0to1 || !canAccessPeer1to0) {
        printf("P2P not supported between these GPUs\n");
        return 1;
    }
    
    // ============================================
    // Enable P2P Access
    // ============================================
    cudaSetDevice(gpu0);
    cudaDeviceEnablePeerAccess(gpu1, 0);  // Enable 0 -> 1
    
    cudaSetDevice(gpu1);
    cudaDeviceEnablePeerAccess(gpu0, 0);  // Enable 1 -> 0
    
    printf("P2P access enabled between GPU %d and GPU %d\n", gpu0, gpu1);
    
    // ============================================
    // Allocate Memory on Each GPU
    // ============================================
    const int N = 1 << 20;
    float *d_data0, *d_data1;
    
    cudaSetDevice(gpu0);
    cudaMalloc(&d_data0, N * sizeof(float));
    
    cudaSetDevice(gpu1);
    cudaMalloc(&d_data1, N * sizeof(float));
    
    // ============================================
    // Direct P2P Copy
    // ============================================
    cudaMemcpyPeer(d_data1, gpu1, d_data0, gpu0, N * sizeof(float));
    printf("Copied data directly from GPU %d to GPU %d\n", gpu0, gpu1);
    
    // ============================================
    // Cleanup
    // ============================================
    cudaSetDevice(gpu0);
    cudaDeviceDisablePeerAccess(gpu1);
    cudaFree(d_data0);
    
    cudaSetDevice(gpu1);
    cudaDeviceDisablePeerAccess(gpu0);
    cudaFree(d_data1);
    
    return 0;
}
```

### 🔷 CUDA C++ Implementation (Primary)

In [None]:
%%writefile p2p_access.cu
// p2p_access.cu - Enable peer access between GPUs
#include <stdio.h>
#include <cuda_runtime.h>

int main() {
    int deviceCount;
    cudaGetDeviceCount(&deviceCount);
    
    if (deviceCount < 2) {
        printf("Need at least 2 GPUs for P2P demo\n");
        return 1;
    }
    
    int gpu0 = 0, gpu1 = 1;
    
    // ============================================
    // Check P2P Capability
    // ============================================
    int canAccessPeer0to1, canAccessPeer1to0;
    
    cudaDeviceCanAccessPeer(&canAccessPeer0to1, gpu0, gpu1);
    cudaDeviceCanAccessPeer(&canAccessPeer1to0, gpu1, gpu0);
    
    printf("P2P GPU %d -> GPU %d: %s\n", 
           gpu0, gpu1, canAccessPeer0to1 ? "YES" : "NO");
    printf("P2P GPU %d -> GPU %d: %s\n", 
           gpu1, gpu0, canAccessPeer1to0 ? "YES" : "NO");
    
    if (!canAccessPeer0to1 || !canAccessPeer1to0) {
        printf("P2P not supported between these GPUs\n");
        return 1;
    }
    
    // ============================================
    // Enable P2P Access
    // ============================================
    cudaSetDevice(gpu0);
    cudaDeviceEnablePeerAccess(gpu1, 0);  // Enable 0 -> 1
    
    cudaSetDevice(gpu1);
    cudaDeviceEnablePeerAccess(gpu0, 0);  // Enable 1 -> 0
    
    printf("P2P access enabled between GPU %d and GPU %d\n", gpu0, gpu1);
    
    // ============================================
    // Allocate Memory on Each GPU
    // ============================================
    const int N = 1 << 20;
    float *d_data0, *d_data1;
    
    cudaSetDevice(gpu0);
    cudaMalloc(&d_data0, N * sizeof(float));
    
    cudaSetDevice(gpu1);
    cudaMalloc(&d_data1, N * sizeof(float));
    
    // ============================================
    // Direct P2P Copy
    // ============================================
    cudaMemcpyPeer(d_data1, gpu1, d_data0, gpu0, N * sizeof(float));
    printf("Copied data directly from GPU %d to GPU %d\n", gpu0, gpu1);
    
    // ============================================
    // Cleanup
    // ============================================
    cudaSetDevice(gpu0);
    cudaDeviceDisablePeerAccess(gpu1);
    cudaFree(d_data0);
    
    cudaSetDevice(gpu1);
    cudaDeviceDisablePeerAccess(gpu0);
    cudaFree(d_data1);
    
    return 0;
}

In [None]:
# Compile, then run only if nvcc succeeded (avoids running a stale or missing binary).
!nvcc -arch=sm_75 -o p2p_access p2p_access.cu && ./p2p_access

---

## Part 3: Running Kernels on Multiple GPUs

### 🔷 CUDA C++ Multi-GPU Kernel Execution (Primary)

```cpp
// multi_gpu_kernel.cu - Execute kernels on multiple GPUs
#include <stdio.h>
#include <cuda_runtime.h>

// Each thread whose global index is below n doubles its element of `data`
// and adds gpuId to it; threads at or beyond n do nothing.
__global__ void process(float* data, int n, int gpuId) {
    const int i = threadIdx.x + blockDim.x * blockIdx.x;
    if (i >= n) {
        return;  // past the end of this GPU's chunk
    }
    data[i] = data[i] * 2.0f + (float)gpuId;
}

int main() {
    int deviceCount;
    cudaGetDeviceCount(&deviceCount);
    
    if (deviceCount < 2) {
        printf("Need at least 2 GPUs\n");
        return 1;
    }
    
    const int N = 1 << 20;
    const int NUM_GPUS = 2;
    const int N_PER_GPU = N / NUM_GPUS;
    
    // Host data
    float* h_data;
    cudaMallocHost(&h_data, N * sizeof(float));
    for (int i = 0; i < N; i++) h_data[i] = 1.0f;
    
    // Device data and streams
    float* d_data[NUM_GPUS];
    cudaStream_t streams[NUM_GPUS];
    
    // ============================================
    // Setup Each GPU
    // ============================================
    for (int gpu = 0; gpu < NUM_GPUS; gpu++) {
        cudaSetDevice(gpu);
        cudaMalloc(&d_data[gpu], N_PER_GPU * sizeof(float));
        cudaStreamCreate(&streams[gpu]);
    }
    
    // ============================================
    // Copy Data to Each GPU (Async)
    // ============================================
    for (int gpu = 0; gpu < NUM_GPUS; gpu++) {
        cudaSetDevice(gpu);
        int offset = gpu * N_PER_GPU;
        cudaMemcpyAsync(d_data[gpu], h_data + offset,
                        N_PER_GPU * sizeof(float),
                        cudaMemcpyHostToDevice, streams[gpu]);
    }
    
    // ============================================
    // Launch Kernels on Each GPU
    // ============================================
    int blockSize = 256;
    int numBlocks = (N_PER_GPU + blockSize - 1) / blockSize;
    
    for (int gpu = 0; gpu < NUM_GPUS; gpu++) {
        cudaSetDevice(gpu);
        process<<<numBlocks, blockSize, 0, streams[gpu]>>>(
            d_data[gpu], N_PER_GPU, gpu);
    }
    
    // ============================================
    // Copy Results Back (Async)
    // ============================================
    for (int gpu = 0; gpu < NUM_GPUS; gpu++) {
        cudaSetDevice(gpu);
        int offset = gpu * N_PER_GPU;
        cudaMemcpyAsync(h_data + offset, d_data[gpu],
                        N_PER_GPU * sizeof(float),
                        cudaMemcpyDeviceToHost, streams[gpu]);
    }
    
    // ============================================
    // Synchronize All GPUs
    // ============================================
    for (int gpu = 0; gpu < NUM_GPUS; gpu++) {
        cudaSetDevice(gpu);
        cudaStreamSynchronize(streams[gpu]);
    }
    
    // Verify
    printf("GPU 0 result[0] = %.1f (expected 2.0)\n", h_data[0]);
    printf("GPU 1 result[0] = %.1f (expected 3.0)\n", h_data[N_PER_GPU]);
    
    // Cleanup
    for (int gpu = 0; gpu < NUM_GPUS; gpu++) {
        cudaSetDevice(gpu);
        cudaFree(d_data[gpu]);
        cudaStreamDestroy(streams[gpu]);
    }
    cudaFreeHost(h_data);
    
    printf("Multi-GPU computation complete!\n");
    return 0;
}
```

### 🔷 CUDA C++ Implementation (Primary)

In [None]:
%%writefile multi_gpu_kernel.cu
// multi_gpu_kernel.cu - Execute kernels on multiple GPUs
#include <stdio.h>
#include <cuda_runtime.h>

// Each thread whose global index is below n doubles its element of `data`
// and adds gpuId to it; threads at or beyond n do nothing.
__global__ void process(float* data, int n, int gpuId) {
    const int i = threadIdx.x + blockDim.x * blockIdx.x;
    if (i >= n) {
        return;  // past the end of this GPU's chunk
    }
    data[i] = data[i] * 2.0f + (float)gpuId;
}

int main() {
    int deviceCount;
    cudaGetDeviceCount(&deviceCount);
    
    if (deviceCount < 2) {
        printf("Need at least 2 GPUs\n");
        return 1;
    }
    
    const int N = 1 << 20;
    const int NUM_GPUS = 2;
    const int N_PER_GPU = N / NUM_GPUS;
    
    // Host data
    float* h_data;
    cudaMallocHost(&h_data, N * sizeof(float));
    for (int i = 0; i < N; i++) h_data[i] = 1.0f;
    
    // Device data and streams
    float* d_data[NUM_GPUS];
    cudaStream_t streams[NUM_GPUS];
    
    // ============================================
    // Setup Each GPU
    // ============================================
    for (int gpu = 0; gpu < NUM_GPUS; gpu++) {
        cudaSetDevice(gpu);
        cudaMalloc(&d_data[gpu], N_PER_GPU * sizeof(float));
        cudaStreamCreate(&streams[gpu]);
    }
    
    // ============================================
    // Copy Data to Each GPU (Async)
    // ============================================
    for (int gpu = 0; gpu < NUM_GPUS; gpu++) {
        cudaSetDevice(gpu);
        int offset = gpu * N_PER_GPU;
        cudaMemcpyAsync(d_data[gpu], h_data + offset,
                        N_PER_GPU * sizeof(float),
                        cudaMemcpyHostToDevice, streams[gpu]);
    }
    
    // ============================================
    // Launch Kernels on Each GPU
    // ============================================
    int blockSize = 256;
    int numBlocks = (N_PER_GPU + blockSize - 1) / blockSize;
    
    for (int gpu = 0; gpu < NUM_GPUS; gpu++) {
        cudaSetDevice(gpu);
        process<<<numBlocks, blockSize, 0, streams[gpu]>>>(
            d_data[gpu], N_PER_GPU, gpu);
    }
    
    // ============================================
    // Copy Results Back (Async)
    // ============================================
    for (int gpu = 0; gpu < NUM_GPUS; gpu++) {
        cudaSetDevice(gpu);
        int offset = gpu * N_PER_GPU;
        cudaMemcpyAsync(h_data + offset, d_data[gpu],
                        N_PER_GPU * sizeof(float),
                        cudaMemcpyDeviceToHost, streams[gpu]);
    }
    
    // ============================================
    // Synchronize All GPUs
    // ============================================
    for (int gpu = 0; gpu < NUM_GPUS; gpu++) {
        cudaSetDevice(gpu);
        cudaStreamSynchronize(streams[gpu]);
    }
    
    // Verify
    printf("GPU 0 result[0] = %.1f (expected 2.0)\n", h_data[0]);
    printf("GPU 1 result[0] = %.1f (expected 3.0)\n", h_data[N_PER_GPU]);
    
    // Cleanup
    for (int gpu = 0; gpu < NUM_GPUS; gpu++) {
        cudaSetDevice(gpu);
        cudaFree(d_data[gpu]);
        cudaStreamDestroy(streams[gpu]);
    }
    cudaFreeHost(h_data);
    
    printf("Multi-GPU computation complete!\n");
    return 0;
}

In [None]:
# Compile, then run only if nvcc succeeded (avoids running a stale or missing binary).
!nvcc -arch=sm_75 -o multi_gpu_kernel multi_gpu_kernel.cu && ./multi_gpu_kernel

---

## Part 4: Unified Memory for Multi-GPU

### Automatic Data Migration

```cpp
// unified_multi_gpu.cu - Unified Memory with multiple GPUs
#include <stdio.h>
#include <cuda_runtime.h>

// Each thread whose global index is below n doubles the element at
// data[start + index]; threads at or beyond n do nothing.
__global__ void process(float* data, int start, int n) {
    const int i = threadIdx.x + blockDim.x * blockIdx.x;
    if (i >= n) {
        return;  // outside this launch's slice
    }
    float* element = &data[start + i];
    *element *= 2.0f;
}

int main() {
    int deviceCount;
    cudaGetDeviceCount(&deviceCount);
    
    if (deviceCount < 2) {
        printf("Need 2+ GPUs\n");
        return 1;
    }
    
    const int N = 1 << 20;
    const int NUM_GPUS = 2;
    const int N_PER_GPU = N / NUM_GPUS;
    
    // ============================================
    // Allocate Unified Memory
    // ============================================
    float* data;
    cudaMallocManaged(&data, N * sizeof(float));
    
    // Initialize on host
    for (int i = 0; i < N; i++) data[i] = 1.0f;
    
    // ============================================
    // Give Hints About Data Location
    // ============================================
    for (int gpu = 0; gpu < NUM_GPUS; gpu++) {
        int offset = gpu * N_PER_GPU;
        size_t size = N_PER_GPU * sizeof(float);
        
        // Hint: This data is preferred on this GPU
        cudaMemAdvise(data + offset, size, 
                      cudaMemAdviseSetPreferredLocation, gpu);
    }
    
    // ============================================
    // Prefetch Data to GPUs
    // ============================================
    cudaStream_t streams[NUM_GPUS];
    for (int gpu = 0; gpu < NUM_GPUS; gpu++) {
        cudaSetDevice(gpu);
        cudaStreamCreate(&streams[gpu]);
        
        int offset = gpu * N_PER_GPU;
        cudaMemPrefetchAsync(data + offset, 
                             N_PER_GPU * sizeof(float),
                             gpu, streams[gpu]);
    }
    
    // ============================================
    // Launch Kernels
    // ============================================
    int blockSize = 256;
    int numBlocks = (N_PER_GPU + blockSize - 1) / blockSize;
    
    for (int gpu = 0; gpu < NUM_GPUS; gpu++) {
        cudaSetDevice(gpu);
        int offset = gpu * N_PER_GPU;
        
        process<<<numBlocks, blockSize, 0, streams[gpu]>>>(
            data, offset, N_PER_GPU);
    }
    
    // ============================================
    // Prefetch Back to Host
    // ============================================
    for (int gpu = 0; gpu < NUM_GPUS; gpu++) {
        cudaSetDevice(gpu);
        int offset = gpu * N_PER_GPU;
        cudaMemPrefetchAsync(data + offset,
                             N_PER_GPU * sizeof(float),
                             cudaCpuDeviceId, streams[gpu]);
    }
    
    // Sync all
    for (int gpu = 0; gpu < NUM_GPUS; gpu++) {
        cudaSetDevice(gpu);
        cudaStreamSynchronize(streams[gpu]);
    }
    
    // Verify on host
    printf("data[0] = %.1f (expected 2.0)\n", data[0]);
    printf("data[%d] = %.1f (expected 2.0)\n", N_PER_GPU, data[N_PER_GPU]);
    
    // Cleanup
    for (int gpu = 0; gpu < NUM_GPUS; gpu++) {
        cudaSetDevice(gpu);
        cudaStreamDestroy(streams[gpu]);
    }
    cudaFree(data);
    
    return 0;
}
```

### 🔷 CUDA C++ Implementation (Primary)

In [None]:
%%writefile unified_multi_gpu.cu
// unified_multi_gpu.cu - Unified Memory with multiple GPUs
#include <stdio.h>
#include <cuda_runtime.h>

// Each thread whose global index is below n doubles the element at
// data[start + index]; threads at or beyond n do nothing.
__global__ void process(float* data, int start, int n) {
    const int i = threadIdx.x + blockDim.x * blockIdx.x;
    if (i >= n) {
        return;  // outside this launch's slice
    }
    float* element = &data[start + i];
    *element *= 2.0f;
}

int main() {
    int deviceCount;
    cudaGetDeviceCount(&deviceCount);
    
    if (deviceCount < 2) {
        printf("Need 2+ GPUs\n");
        return 1;
    }
    
    const int N = 1 << 20;
    const int NUM_GPUS = 2;
    const int N_PER_GPU = N / NUM_GPUS;
    
    // ============================================
    // Allocate Unified Memory
    // ============================================
    float* data;
    cudaMallocManaged(&data, N * sizeof(float));
    
    // Initialize on host
    for (int i = 0; i < N; i++) data[i] = 1.0f;
    
    // ============================================
    // Give Hints About Data Location
    // ============================================
    for (int gpu = 0; gpu < NUM_GPUS; gpu++) {
        int offset = gpu * N_PER_GPU;
        size_t size = N_PER_GPU * sizeof(float);
        
        // Hint: This data is preferred on this GPU
        cudaMemAdvise(data + offset, size, 
                      cudaMemAdviseSetPreferredLocation, gpu);
    }
    
    // ============================================
    // Prefetch Data to GPUs
    // ============================================
    cudaStream_t streams[NUM_GPUS];
    for (int gpu = 0; gpu < NUM_GPUS; gpu++) {
        cudaSetDevice(gpu);
        cudaStreamCreate(&streams[gpu]);
        
        int offset = gpu * N_PER_GPU;
        cudaMemPrefetchAsync(data + offset, 
                             N_PER_GPU * sizeof(float),
                             gpu, streams[gpu]);
    }
    
    // ============================================
    // Launch Kernels
    // ============================================
    int blockSize = 256;
    int numBlocks = (N_PER_GPU + blockSize - 1) / blockSize;
    
    for (int gpu = 0; gpu < NUM_GPUS; gpu++) {
        cudaSetDevice(gpu);
        int offset = gpu * N_PER_GPU;
        
        process<<<numBlocks, blockSize, 0, streams[gpu]>>>(
            data, offset, N_PER_GPU);
    }
    
    // ============================================
    // Prefetch Back to Host
    // ============================================
    for (int gpu = 0; gpu < NUM_GPUS; gpu++) {
        cudaSetDevice(gpu);
        int offset = gpu * N_PER_GPU;
        cudaMemPrefetchAsync(data + offset,
                             N_PER_GPU * sizeof(float),
                             cudaCpuDeviceId, streams[gpu]);
    }
    
    // Sync all
    for (int gpu = 0; gpu < NUM_GPUS; gpu++) {
        cudaSetDevice(gpu);
        cudaStreamSynchronize(streams[gpu]);
    }
    
    // Verify on host
    printf("data[0] = %.1f (expected 2.0)\n", data[0]);
    printf("data[%d] = %.1f (expected 2.0)\n", N_PER_GPU, data[N_PER_GPU]);
    
    // Cleanup
    for (int gpu = 0; gpu < NUM_GPUS; gpu++) {
        cudaSetDevice(gpu);
        cudaStreamDestroy(streams[gpu]);
    }
    cudaFree(data);
    
    return 0;
}

In [None]:
# Compile, then run only if nvcc succeeded (avoids running a stale or missing binary).
!nvcc -arch=sm_75 -o unified_multi_gpu unified_multi_gpu.cu && ./unified_multi_gpu

---

## 🎯 Exercises

### 🔷 CUDA C++ Exercises (Primary)

In [None]:
%%writefile multi_gpu_basics_exercises.cu
// multi_gpu_basics_exercises.cu - Multi-GPU Basics Exercises
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>

// Evaluates a CUDA runtime call exactly once. On failure it prints the file,
// line and CUDA error string, then terminates the process with exit code 1.
// The do { } while (0) wrapper makes `CHECK_CUDA(x);` a single statement, so
// it is safe inside an unbraced if/else. The argument is parenthesised and
// the local has a macro-specific name so it cannot collide with a caller's
// own `err` variable.
#define CHECK_CUDA(call) do { \
    cudaError_t checkCudaStatus_ = (call); \
    if (checkCudaStatus_ != cudaSuccess) { \
        printf("CUDA error %s:%d: %s\n", __FILE__, __LINE__, \
               cudaGetErrorString(checkCudaStatus_)); \
        exit(1); \
    } \
} while (0)

// ============================================
// Exercise 1: Find GPU with Most Memory
// ============================================
// Exercise 1: prints the main properties of every visible GPU and reports
// the one with the largest total global memory (first one wins on a tie).
// Prints a notice and returns early when no GPU is present; any failing
// CUDA call terminates the program through CHECK_CUDA.
void findBestGPU() {
    printf("\n=== Exercise 1: Device Query - Find Best GPU ===\n");

    int deviceCount = 0;
    CHECK_CUDA(cudaGetDeviceCount(&deviceCount));

    printf("Found %d GPU(s)\n\n", deviceCount);
    if (deviceCount == 0) {
        // Without this guard the final property query for device 0 would fail.
        printf("No CUDA devices available; nothing to compare.\n");
        return;
    }

    int bestDevice = 0;
    size_t maxMemory = 0;

    for (int i = 0; i < deviceCount; i++) {
        cudaDeviceProp prop;
        CHECK_CUDA(cudaGetDeviceProperties(&prop, i));

        printf("GPU %d: %s\n", i, prop.name);
        printf("  Compute Capability: %d.%d\n", prop.major, prop.minor);
        printf("  Total Memory: %.2f GB\n", prop.totalGlobalMem / (1024.0*1024.0*1024.0));
        printf("  SM Count: %d\n", prop.multiProcessorCount);
        printf("  Max Threads/Block: %d\n", prop.maxThreadsPerBlock);
        printf("  Memory Clock: %.0f MHz\n", prop.memoryClockRate / 1000.0);
        printf("  Memory Bus Width: %d bits\n", prop.memoryBusWidth);

        // Calculate theoretical bandwidth: 2 transfers per clock * clock *
        // bus width in bytes. Kept in double so nothing is truncated before
        // printing.
        double bandwidth = 2.0 * prop.memoryClockRate * (prop.memoryBusWidth / 8.0) / 1.0e6;
        printf("  Theoretical Bandwidth: %.1f GB/s\n\n", bandwidth);

        // Strictly greater: on equal memory the lower-numbered GPU is kept.
        if (prop.totalGlobalMem > maxMemory) {
            maxMemory = prop.totalGlobalMem;
            bestDevice = i;
        }
    }

    cudaDeviceProp bestProp;
    CHECK_CUDA(cudaGetDeviceProperties(&bestProp, bestDevice));
    printf("✓ Best GPU (most memory): GPU %d (%s) with %.2f GB\n",
           bestDevice, bestProp.name, maxMemory / (1024.0*1024.0*1024.0));
}

// ============================================
// Exercise 2: P2P Bandwidth Measurement
// ============================================
void measureP2PBandwidth() {
    printf("\n=== Exercise 2: P2P Bandwidth Measurement ===\n");
    
    int deviceCount;
    CHECK_CUDA(cudaGetDeviceCount(&deviceCount));
    
    if (deviceCount < 2) {
        printf("Need at least 2 GPUs for P2P test. Found: %d\n", deviceCount);
        printf("Simulating with single GPU (host staging)...\n\n");
        
        // Single GPU simulation via host
        const size_t size = 256 * 1024 * 1024;  // 256 MB
        const int iterations = 10;
        
        float *d_src, *d_dst;
        float *h_buffer;
        CHECK_CUDA(cudaMalloc(&d_src, size));
        CHECK_CUDA(cudaMalloc(&d_dst, size));
        CHECK_CUDA(cudaMallocHost(&h_buffer, size));  // Pinned memory
        
        cudaEvent_t start, stop;
        cudaEventCreate(&start);
        cudaEventCreate(&stop);
        
        // Warm up
        CHECK_CUDA(cudaMemcpy(h_buffer, d_src, size, cudaMemcpyDeviceToHost));
        CHECK_CUDA(cudaMemcpy(d_dst, h_buffer, size, cudaMemcpyHostToDevice));
        
        cudaEventRecord(start);
        for (int i = 0; i < iterations; i++) {
            CHECK_CUDA(cudaMemcpy(d_dst, d_src, size, cudaMemcpyDeviceToDevice));
        }
        cudaEventRecord(stop);
        cudaEventSynchronize(stop);
        
        float ms;
        cudaEventElapsedTime(&ms, start, stop);
        float bandwidth = (float)size * iterations / (ms / 1000.0) / (1024*1024*1024);
        
        printf("Device-to-Device copy (same GPU):\n");
        printf("  Size: %.0f MB\n", size / (1024.0 * 1024.0));
        printf("  Time: %.2f ms (%d iterations)\n", ms, iterations);
        printf("  Bandwidth: %.2f GB/s\n", bandwidth);
        
        cudaFree(d_src);
        cudaFree(d_dst);
        cudaFreeHost(h_buffer);
        return;
    }
    
    // Multi-GPU P2P test
    int gpu0 = 0, gpu1 = 1;
    
    // Check P2P capability
    int canAccess01, canAccess10;
    CHECK_CUDA(cudaDeviceCanAccessPeer(&canAccess01, gpu0, gpu1));
    CHECK_CUDA(cudaDeviceCanAccessPeer(&canAccess10, gpu1, gpu0));
    
    printf("P2P Access: GPU %d -> GPU %d: %s\n", 
           gpu0, gpu1, canAccess01 ? "YES" : "NO");
    printf("P2P Access: GPU %d -> GPU %d: %s\n", 
           gpu1, gpu0, canAccess10 ? "YES" : "NO");
    
    if (canAccess01 && canAccess10) {
        CHECK_CUDA(cudaSetDevice(gpu0));
        CHECK_CUDA(cudaDeviceEnablePeerAccess(gpu1, 0));
        CHECK_CUDA(cudaSetDevice(gpu1));
        CHECK_CUDA(cudaDeviceEnablePeerAccess(gpu0, 0));
        printf("P2P enabled between GPUs\n\n");
    }
    
    const size_t size = 256 * 1024 * 1024;  // 256 MB
    const int iterations = 10;
    
    // Allocate on each GPU
    float *d_gpu0, *d_gpu1;
    CHECK_CUDA(cudaSetDevice(gpu0));
    CHECK_CUDA(cudaMalloc(&d_gpu0, size));
    CHECK_CUDA(cudaSetDevice(gpu1));
    CHECK_CUDA(cudaMalloc(&d_gpu1, size));
    
    cudaEvent_t start, stop;
    CHECK_CUDA(cudaSetDevice(gpu0));
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    
    // Measure GPU0 -> GPU1
    cudaEventRecord(start);
    for (int i = 0; i < iterations; i++) {
        CHECK_CUDA(cudaMemcpyPeer(d_gpu1, gpu1, d_gpu0, gpu0, size));
    }
    cudaEventRecord(stop);
    cudaEventSynchronize(stop);
    
    float ms;
    cudaEventElapsedTime(&ms, start, stop);
    float bw01 = (float)size * iterations / (ms / 1000.0) / (1024*1024*1024);
    
    printf("GPU %d -> GPU %d:\n", gpu0, gpu1);
    printf("  Bandwidth: %.2f GB/s\n", bw01);
    
    // Measure GPU1 -> GPU0
    cudaEventRecord(start);
    for (int i = 0; i < iterations; i++) {
        CHECK_CUDA(cudaMemcpyPeer(d_gpu0, gpu0, d_gpu1, gpu1, size));
    }
    cudaEventRecord(stop);
    cudaEventSynchronize(stop);
    
    cudaEventElapsedTime(&ms, start, stop);
    float bw10 = (float)size * iterations / (ms / 1000.0) / (1024*1024*1024);
    
    printf("GPU %d -> GPU %d:\n", gpu1, gpu0);
    printf("  Bandwidth: %.2f GB/s\n", bw10);
    printf("\nBidirectional: %.2f GB/s\n", bw01 + bw10);
    
    cudaFree(d_gpu0);
    cudaFree(d_gpu1);
}

// ============================================
// Exercise 3: Multi-GPU Vector Addition
// ============================================
// Element-wise sum: every thread whose global index is below n writes
// c[index] = a[index] + b[index]; the remaining threads do nothing.
__global__ void vectorAddKernel(float* a, float* b, float* c, int n) {
    const int i = threadIdx.x + blockDim.x * blockIdx.x;
    if (i >= n) {
        return;  // past the end of the vectors
    }
    c[i] = a[i] + b[i];
}

void multiGPUVectorAdd() {
    printf("\n=== Exercise 3: Multi-GPU Vector Addition ===\n");
    
    int deviceCount;
    CHECK_CUDA(cudaGetDeviceCount(&deviceCount));
    
    // Use up to 2 GPUs
    int numGPUs = (deviceCount >= 2) ? 2 : 1;
    printf("Using %d GPU(s)\n", numGPUs);
    
    const int N = 16 * 1024 * 1024;  // 16M elements
    const size_t size = N * sizeof(float);
    
    // Host arrays
    float *h_a, *h_b, *h_c;
    CHECK_CUDA(cudaMallocHost(&h_a, size));
    CHECK_CUDA(cudaMallocHost(&h_b, size));
    CHECK_CUDA(cudaMallocHost(&h_c, size));
    
    // Initialize
    for (int i = 0; i < N; i++) {
        h_a[i] = (float)i;
        h_b[i] = (float)(i * 2);
    }
    
    // Calculate chunk sizes
    int chunkSize = N / numGPUs;
    int remainder = N % numGPUs;
    
    // Device pointers and streams
    float *d_a[2], *d_b[2], *d_c[2];
    cudaStream_t streams[2];
    int offsets[2], sizes[2];
    
    // Setup each GPU
    for (int i = 0; i < numGPUs; i++) {
        offsets[i] = i * chunkSize;
        sizes[i] = chunkSize + (i == numGPUs - 1 ? remainder : 0);
        
        CHECK_CUDA(cudaSetDevice(i));
        CHECK_CUDA(cudaStreamCreate(&streams[i]));
        CHECK_CUDA(cudaMalloc(&d_a[i], sizes[i] * sizeof(float)));
        CHECK_CUDA(cudaMalloc(&d_b[i], sizes[i] * sizeof(float)));
        CHECK_CUDA(cudaMalloc(&d_c[i], sizes[i] * sizeof(float)));
    }
    
    cudaEvent_t start, stop;
    CHECK_CUDA(cudaSetDevice(0));
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    
    cudaEventRecord(start);
    
    // Copy to GPUs (async)
    for (int i = 0; i < numGPUs; i++) {
        CHECK_CUDA(cudaSetDevice(i));
        CHECK_CUDA(cudaMemcpyAsync(d_a[i], h_a + offsets[i], 
                                   sizes[i] * sizeof(float),
                                   cudaMemcpyHostToDevice, streams[i]));
        CHECK_CUDA(cudaMemcpyAsync(d_b[i], h_b + offsets[i],
                                   sizes[i] * sizeof(float),
                                   cudaMemcpyHostToDevice, streams[i]));
    }
    
    // Launch kernels
    int blockSize = 256;
    for (int i = 0; i < numGPUs; i++) {
        CHECK_CUDA(cudaSetDevice(i));
        int gridSize = (sizes[i] + blockSize - 1) / blockSize;
        vectorAddKernel<<<gridSize, blockSize, 0, streams[i]>>>(
            d_a[i], d_b[i], d_c[i], sizes[i]);
    }
    
    // Copy results back (async)
    for (int i = 0; i < numGPUs; i++) {
        CHECK_CUDA(cudaSetDevice(i));
        CHECK_CUDA(cudaMemcpyAsync(h_c + offsets[i], d_c[i],
                                   sizes[i] * sizeof(float),
                                   cudaMemcpyDeviceToHost, streams[i]));
    }
    
    // Synchronize all
    for (int i = 0; i < numGPUs; i++) {
        CHECK_CUDA(cudaSetDevice(i));
        CHECK_CUDA(cudaStreamSynchronize(streams[i]));
    }
    
    cudaEventRecord(stop);
    cudaEventSynchronize(stop);
    
    float ms;
    cudaEventElapsedTime(&ms, start, stop);
    
    // Verify
    bool correct = true;
    for (int i = 0; i < N; i++) {
        float expected = h_a[i] + h_b[i];
        if (fabs(h_c[i] - expected) > 1e-5) {
            printf("Mismatch at %d: %.2f vs %.2f\n", i, h_c[i], expected);
            correct = false;
            break;
        }
    }
    
    printf("\nVector size: %d elements (%.0f MB)\n", N, size / (1024.0 * 1024.0));
    for (int i = 0; i < numGPUs; i++) {
        printf("GPU %d processed: %d elements\n", i, sizes[i]);
    }
    printf("\nTotal time: %.3f ms\n", ms);
    printf("Throughput: %.2f GB/s\n", 3.0 * size / (ms / 1000.0) / (1024*1024*1024));
    printf("Result: %s\n", correct ? "CORRECT ‚úì" : "INCORRECT ‚úó");
    
    // Cleanup
    for (int i = 0; i < numGPUs; i++) {
        CHECK_CUDA(cudaSetDevice(i));
        cudaFree(d_a[i]);
        cudaFree(d_b[i]);
        cudaFree(d_c[i]);
        cudaStreamDestroy(streams[i]);
    }
    cudaFreeHost(h_a);
    cudaFreeHost(h_b);
    cudaFreeHost(h_c);
}

// ============================================
// Main
// ============================================
// Prints the banner, runs the three exercises in order and reports
// completion. Always returns 0; a failing CUDA call inside an exercise
// terminates the process through CHECK_CUDA instead.
int main() {
    // Banner drawn with UTF-8 box characters (repaired from mis-encoded text).
    printf("╔═══════════════════════════════════════════════════════════════╗\n");
    printf("║            MULTI-GPU BASICS - EXERCISES                       ║\n");
    printf("╚═══════════════════════════════════════════════════════════════╝\n");

    findBestGPU();
    measureP2PBandwidth();
    multiGPUVectorAdd();

    printf("\n✓ All exercises completed!\n");
    return 0;
}

In [None]:
# Build the exercises with nvcc (target architecture sm_75) and run the binary only if the build succeeds.
!nvcc -arch=sm_75 -o multi_gpu_basics_exercises multi_gpu_basics_exercises.cu && ./multi_gpu_basics_exercises

### 🔶 Python/Numba Exercises (Optional)

### Exercise 1: Device Query
Write code to find the GPU with the most memory.

### Exercise 2: P2P Bandwidth
Measure P2P copy bandwidth between two GPUs.

### Exercise 3: Multi-GPU Vector Add
Implement vector addition split across 2 GPUs.

---

## Key Takeaways

```
┌─────────────────────────────────────────────────────────┐
│                MULTI-GPU BASICS                         │
├─────────────────────────────────────────────────────────┤
│                                                         │
│  Device Management:                                     │
│  • cudaGetDeviceCount(&count)                           │
│  • cudaSetDevice(id)                                    │
│  • cudaGetDevice(&id)                                   │
│                                                         │
│  Peer Access:                                           │
│  • cudaDeviceCanAccessPeer(&can, dev, peer)             │
│  • cudaDeviceEnablePeerAccess(peer, 0)                  │
│  • cudaMemcpyPeer(dst, dstDev, src, srcDev, size)       │
│                                                         │
│  Pattern:                                               │
│  1. Query/select devices                                │
│  2. Enable P2P if available                             │
│  3. Allocate memory on each GPU                         │
│  4. Distribute data                                     │
│  5. Launch kernels                                      │
│  6. Collect results                                     │
│                                                         │
└─────────────────────────────────────────────────────────┘
```

## Next: Day 2 - Multi-GPU Patterns