## Pattern 1: GPU Health Checker

In [None]:
%%writefile gpu_health_check.cu
#include <stdio.h>
#include <cuda_runtime.h>
#include <nvml.h>

// Evaluates a CUDA runtime call exactly once. On failure it prints the CUDA
// error string together with the call site and makes the ENCLOSING function
// `return false`, so it may only be used inside functions returning bool.
// The argument is parenthesized and the internal status variable has a
// macro-private name so that any expression (including a variable that
// happens to be called `err`) can be passed safely.
#define CHECK_CUDA(call) \
    do { \
        const cudaError_t check_cuda_status_ = (call); \
        if (check_cuda_status_ != cudaSuccess) { \
            printf("CUDA error at %s:%d: %s\n", __FILE__, __LINE__, \
                   cudaGetErrorString(check_cuda_status_)); \
            return false; \
        } \
    } while (0)

// Outcome of one health probe of a single GPU: pass/fail flags plus the raw
// metrics they were derived from.
//
// Every member has a default so that a GPUHealth which was never (or only
// partially) filled in can still be printed without reading indeterminate
// values. A default-constructed object reports every check as not passed.
struct GPUHealth {
    int deviceId = -1;            // Device index that was probed; -1 = not probed yet
    bool cudaAccessible = false;  // Device could be selected through the CUDA runtime
    bool memoryOk = false;        // Enough device memory is free
    bool computeOk = false;       // Test kernel produced the expected value
    bool temperatureOk = false;   // Temperature was read and is below the limit
    bool eccOk = false;           // No uncorrected ECC errors were reported

    // Metrics
    size_t freeMemory = 0;            // Free device memory, bytes
    size_t totalMemory = 0;           // Total device memory, bytes
    unsigned int temperature = 0;     // GPU temperature, degrees Celsius
    unsigned int powerUsage = 0;      // Power draw, watts
    unsigned int gpuUtilization = 0;  // GPU utilization, percent
};

// Smoke-test kernel: each thread derives 2 * globalIndex + 1 from its global
// thread index, and thread 0 of a block stores its value through `result`.
// `result` must point to one writable int in device-accessible memory.
__global__ void healthCheckKernel(int* result) {
    const int globalIdx = blockIdx.x * blockDim.x + threadIdx.x;
    const int probe = 2 * globalIdx + 1;

    // Only the first thread of the block publishes a value.
    if (threadIdx.x != 0) {
        return;
    }
    *result = probe;
}

// Probes one GPU and records the outcome in `health`.
//
// Checks performed, in order:
//   1. the device can be selected with cudaSetDevice;
//   2. more than 10% of device memory is free;
//   3. a tiny kernel runs and returns the expected value;
//   4. via NVML: temperature below 85 C and no uncorrected volatile ECC
//      errors (power draw and utilization are recorded but not judged).
//
// Parameters:
//   deviceId - CUDA device index to probe.
//   health   - receives flags and metrics; every field is written, so the
//              struct is safe to print even when this function returns early.
//
// Returns true only when all five flags are true. A failing CUDA runtime call
// is reported by CHECK_CUDA and makes the function return false immediately.
// NVML must already be initialized by the caller; if the NVML device handle
// cannot be obtained, temperatureOk stays false and the GPU is reported
// unhealthy.
bool checkGPUHealth(int deviceId, GPUHealth& health) {
    health.deviceId = deviceId;
    health.cudaAccessible = false;
    health.memoryOk = false;
    health.computeOk = false;
    health.temperatureOk = false;
    health.eccOk = true;  // Assume OK unless detected otherwise

    // Zero the metrics up front: an early return or a failed NVML query must
    // not leave indeterminate values for the caller to print.
    health.freeMemory = 0;
    health.totalMemory = 0;
    health.temperature = 0;
    health.powerUsage = 0;
    health.gpuUtilization = 0;

    // Test CUDA accessibility
    CHECK_CUDA(cudaSetDevice(deviceId));
    health.cudaAccessible = true;

    // Memory check
    CHECK_CUDA(cudaMemGetInfo(&health.freeMemory, &health.totalMemory));
    health.memoryOk = (health.freeMemory > 0.1 * health.totalMemory);  // >10% free

    // Compute check
    int* d_result = nullptr;
    int h_result = -1;
    CHECK_CUDA(cudaMalloc(&d_result, sizeof(int)));

    healthCheckKernel<<<1, 32>>>(d_result);
    // Collect the first failure without returning yet, so that d_result is
    // released on every path (CHECK_CUDA would return before the cudaFree).
    cudaError_t computeStatus = cudaGetLastError();  // launch-configuration errors
    if (computeStatus == cudaSuccess) {
        computeStatus = cudaDeviceSynchronize();     // execution errors
    }
    if (computeStatus == cudaSuccess) {
        computeStatus = cudaMemcpy(&h_result, d_result, sizeof(int),
                                   cudaMemcpyDeviceToHost);
    }
    const cudaError_t freeStatus = cudaFree(d_result);
    CHECK_CUDA(computeStatus);
    CHECK_CUDA(freeStatus);

    health.computeOk = (h_result == 1);  // Expected: 0 * 2 + 1 = 1

    // NVML checks.
    // CUDA and NVML may enumerate devices in different orders (for example
    // when CUDA_VISIBLE_DEVICES is set), so resolve the NVML handle through
    // the PCI bus id of the CUDA device. Fall back to the plain index only if
    // that lookup is not possible.
    nvmlDevice_t nvmlDevice;
    bool haveNvmlDevice = false;
    char pciBusId[32] = {0};
    if (cudaDeviceGetPCIBusId(pciBusId, static_cast<int>(sizeof(pciBusId)),
                              deviceId) == cudaSuccess) {
        haveNvmlDevice =
            (nvmlDeviceGetHandleByPciBusId(pciBusId, &nvmlDevice) == NVML_SUCCESS);
    }
    if (!haveNvmlDevice) {
        haveNvmlDevice =
            (nvmlDeviceGetHandleByIndex(deviceId, &nvmlDevice) == NVML_SUCCESS);
    }

    if (haveNvmlDevice) {
        // Temperature
        if (nvmlDeviceGetTemperature(nvmlDevice, NVML_TEMPERATURE_GPU,
                                     &health.temperature) == NVML_SUCCESS) {
            health.temperatureOk = (health.temperature < 85);  // <85°C
        }

        // Power: NVML reports milliwatts; store watts only when the query worked.
        unsigned int powerMilliwatts = 0;
        if (nvmlDeviceGetPowerUsage(nvmlDevice, &powerMilliwatts) == NVML_SUCCESS) {
            health.powerUsage = powerMilliwatts / 1000;
        }

        // Utilization
        nvmlUtilization_t util;
        if (nvmlDeviceGetUtilizationRates(nvmlDevice, &util) == NVML_SUCCESS) {
            health.gpuUtilization = util.gpu;
        }

        // ECC errors: a failed query (e.g. ECC not supported) keeps eccOk true.
        unsigned long long eccErrors = 0;
        if (nvmlDeviceGetTotalEccErrors(nvmlDevice, NVML_MEMORY_ERROR_TYPE_UNCORRECTED,
                                        NVML_VOLATILE_ECC, &eccErrors) == NVML_SUCCESS) {
            health.eccOk = (eccErrors == 0);
        }
    }

    return health.cudaAccessible && health.memoryOk &&
           health.computeOk && health.temperatureOk && health.eccOk;
}

// Writes a human-readable report for one probed GPU to stdout: one line per
// check with its status word, followed by the recorded metrics. Memory sizes
// are shown in units of 1024^3 bytes.
void printHealth(const GPUHealth& h) {
    const double bytesPerGb = 1024.0*1024.0*1024.0;
    const double freeGb = h.freeMemory / bytesPerGb;
    const double totalGb = h.totalMemory / bytesPerGb;

    const char* accessText = h.cudaAccessible ? "OK" : "FAIL";
    const char* memoryText = h.memoryOk ? "OK" : "LOW";
    const char* computeText = h.computeOk ? "OK" : "FAIL";
    const char* tempText = h.temperatureOk ? "OK" : "HIGH";
    const char* eccText = h.eccOk ? "OK" : "ERRORS DETECTED";

    printf("GPU %d Health:\n", h.deviceId);
    printf("  CUDA Accessible: %s\n", accessText);
    printf("  Memory: %s (%.1f/%.1f GB free)\n", memoryText, freeGb, totalGb);
    printf("  Compute: %s\n", computeText);
    printf("  Temperature: %s (%u°C)\n", tempText, h.temperature);
    printf("  ECC: %s\n", eccText);
    printf("  Power: %u W, Utilization: %u%%\n", h.powerUsage, h.gpuUtilization);
}

// Entry point: probes every CUDA device, prints a per-GPU report and a
// summary line.
//
// Exit code is 0 only when at least one GPU was found and every GPU passed
// all checks; it is 1 when any GPU is unhealthy, when the device count cannot
// be queried, or when no GPU is present.
int main() {
    printf("=== GPU Health Check ===\n\n");

    // NVML is optional for the CUDA-side checks, so a failed init is reported
    // but not fatal; the NVML-based checks will then fail per GPU.
    const nvmlReturn_t nvmlStatus = nvmlInit();
    const bool nvmlReady = (nvmlStatus == NVML_SUCCESS);
    if (!nvmlReady) {
        printf("Warning: NVML initialization failed: %s\n",
               nvmlErrorString(nvmlStatus));
    }

    int deviceCount = 0;
    const cudaError_t countStatus = cudaGetDeviceCount(&deviceCount);
    if (countStatus != cudaSuccess || deviceCount <= 0) {
        if (countStatus != cudaSuccess) {
            printf("CUDA error: %s\n", cudaGetErrorString(countStatus));
        }
        printf("=== Summary: no GPUs available ===\n");
        if (nvmlReady) nvmlShutdown();
        return 1;  // A machine with no usable GPU is not healthy
    }

    int healthyGPUs = 0;
    for (int i = 0; i < deviceCount; i++) {
        // Value-initialize so the report is well-defined even if the probe
        // stops before filling in every field.
        GPUHealth health{};
        const bool isHealthy = checkGPUHealth(i, health);

        printHealth(health);
        printf("  Overall: %s\n\n", isHealthy ? "HEALTHY" : "UNHEALTHY");

        if (isHealthy) healthyGPUs++;
    }

    printf("=== Summary: %d/%d GPUs healthy ===\n", healthyGPUs, deviceCount);

    if (nvmlReady) nvmlShutdown();
    return (healthyGPUs == deviceCount) ? 0 : 1;
}

In [None]:
!nvcc -O3 -arch=sm_80 gpu_health_check.cu -o gpu_health_check -lnvidia-ml && ./gpu_health_check

## Pattern 2: Graceful GPU Failover

In [None]:
%%writefile gpu_failover.cu
#include <stdio.h>
#include <cuda_runtime.h>
#include <vector>

// Keeps a table of the CUDA devices visible to the process, picks the device
// with the most free memory, and switches to another device when the current
// one fails. Not thread-safe: all methods change the calling thread's current
// CUDA device and the manager's own state without locking.
class GPUManager {
public:
    // Static description and availability of one device.
    struct GPUInfo {
        int id = -1;             // CUDA device index
        bool available = false;  // false once the device failed a query or was failed over
        size_t memory = 0;       // Total global memory, bytes
        int smCount = 0;         // Number of multiprocessors
    };

private:
    std::vector<GPUInfo> gpus_;
    int currentDevice_ = -1;  // Device chosen through setDevice(); -1 = none

public:
    // Enumerates all CUDA devices and records their properties. A device
    // whose properties cannot be queried is listed but marked unavailable.
    // Calling it again rebuilds the table instead of appending duplicates.
    // Returns false if the device count cannot be queried or is zero.
    bool initialize() {
        gpus_.clear();

        int count = 0;
        if (cudaGetDeviceCount(&count) != cudaSuccess) return false;

        for (int i = 0; i < count; i++) {
            GPUInfo info;
            info.id = i;

            cudaDeviceProp prop;
            if (cudaGetDeviceProperties(&prop, i) == cudaSuccess) {
                info.available = true;
                info.memory = prop.totalGlobalMem;
                info.smCount = prop.multiProcessorCount;
            } else {
                (void)cudaGetLastError();  // Clear the error so later checks start clean
            }

            gpus_.push_back(info);
        }

        return !gpus_.empty();
    }

    // Returns the id of the available device with the most free memory, or -1
    // if none qualifies. Devices that cannot be selected or queried are
    // marked unavailable.
    // Side effect: the calling thread's current CUDA device is left on the
    // last device probed, so call setDevice() with the result before use.
    int selectBestGPU() {
        int best = -1;
        size_t bestMemory = 0;

        for (auto& gpu : gpus_) {
            if (!gpu.available) continue;

            // The memory query reports on the *current* device, so it is only
            // meaningful when selecting gpu.id actually succeeded.
            size_t freeBytes = 0;
            size_t totalBytes = 0;
            if (cudaSetDevice(gpu.id) != cudaSuccess ||
                cudaMemGetInfo(&freeBytes, &totalBytes) != cudaSuccess) {
                (void)cudaGetLastError();
                gpu.available = false;  // Mark as unavailable
                continue;
            }

            if (freeBytes > bestMemory) {
                bestMemory = freeBytes;
                best = gpu.id;
            }
        }

        return best;
    }

    // Makes `deviceId` the current CUDA device. On failure the device is
    // marked unavailable and the previous selection is kept.
    bool setDevice(int deviceId) {
        if (cudaSetDevice(deviceId) != cudaSuccess) {
            (void)cudaGetLastError();
            markUnavailable(deviceId);
            return false;
        }
        currentDevice_ = deviceId;
        return true;
    }

    // Flags `deviceId` as unavailable; ids not in the table are ignored.
    void markUnavailable(int deviceId) {
        for (auto& gpu : gpus_) {
            if (gpu.id == deviceId) {
                gpu.available = false;
                printf("[FAILOVER] GPU %d marked unavailable\n", deviceId);
                break;
            }
        }
    }

    // Abandons the current device and switches to the best remaining one.
    // The current device is reset first, which destroys every allocation made
    // on it. Candidates are tried until one can be selected or none is left.
    // Returns the new device id, or -1 when no usable GPU remains (the
    // manager then has no current device).
    int failover() {
        printf("[FAILOVER] Attempting failover from GPU %d\n", currentDevice_);

        // Mark current as unavailable
        if (currentDevice_ >= 0) {
            markUnavailable(currentDevice_);
        }

        // Reset device to clear any errors
        const cudaError_t resetStatus = cudaDeviceReset();
        if (resetStatus != cudaSuccess) {
            printf("[FAILOVER] Device reset failed: %s\n",
                   cudaGetErrorString(resetStatus));
        }
        (void)cudaGetLastError();

        // Keep trying candidates: a failed setDevice() marks that candidate
        // unavailable, so every iteration shrinks the pool and the loop ends.
        for (int next = selectBestGPU(); next >= 0; next = selectBestGPU()) {
            if (setDevice(next)) {
                printf("[FAILOVER] Switched to GPU %d\n", next);
                return next;
            }
        }

        currentDevice_ = -1;  // The old device is gone and nothing replaced it
        printf("[FAILOVER] No available GPUs!\n");
        return -1;
    }

    // Prints one line per known device plus the currently selected device.
    void printStatus() const {
        printf("\n=== GPU Status ===\n");
        for (const auto& gpu : gpus_) {
            printf("GPU %d: %s (%.1f GB, %d SMs)\n",
                   gpu.id,
                   gpu.available ? "Available" : "Unavailable",
                   gpu.memory / (1024.0*1024.0*1024.0),
                   gpu.smCount);
        }
        printf("Current: GPU %d\n\n", currentDevice_);
    }
};

// Demo workload: element i of `data` receives sqrtf(i) for every i < n.
// Expects a 1D launch with at least n threads in total; threads whose global
// index falls past the end of the array do nothing.
__global__ void workloadKernel(float* data, int n) {
    const int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i >= n) {
        return;
    }
    data[i] = sqrtf(static_cast<float>(i));
}

// Runs workloadKernel over `n` floats, failing over to another GPU and
// retrying when the launch or execution reports an error.
//
// Parameters:
//   mgr    - manager used to switch devices after a failure.
//   d_data - device buffer of at least n floats on the current device.
//   n      - number of elements; n <= 0 is a no-op that succeeds.
//
// Returns true if one of up to three attempts completed without error,
// false if every attempt failed, no GPU was left, or a replacement buffer
// could not be allocated.
//
// NOTE: `d_data` is passed by value, so a replacement buffer allocated after
// a failover cannot be handed back to the caller. This function therefore
// owns that buffer and frees it before returning. After a failover the
// caller's original pointer is no longer valid (its device was reset), and
// the computed values are not retrievable; callers that need the results
// after a failover need an interface that returns the new pointer.
bool executeWithFailover(GPUManager& mgr, float* d_data, int n) {
    if (n <= 0) {
        return true;  // Nothing to compute; avoids an invalid zero-block launch
    }

    const int maxRetries = 3;
    const int blockSize = 256;
    const int numBlocks = (n - 1) / blockSize + 1;  // ceil-div without int overflow
    const size_t bytes = static_cast<size_t>(n) * sizeof(float);

    float* replacement = nullptr;  // Buffer allocated here after a failover
    bool succeeded = false;

    for (int attempt = 1; attempt <= maxRetries; attempt++) {
        workloadKernel<<<numBlocks, blockSize>>>(d_data, n);

        cudaError_t status = cudaGetLastError();  // launch-configuration errors
        if (status == cudaSuccess) {
            status = cudaDeviceSynchronize();     // execution errors
        }
        if (status == cudaSuccess) {
            succeeded = true;  // Success!
            break;
        }

        printf("[ERROR] Kernel failed: %s\n", cudaGetErrorString(status));

        // Attempt failover. This resets the failed device, which destroys
        // every allocation on it, including any earlier replacement buffer.
        replacement = nullptr;
        d_data = nullptr;
        const int newDevice = mgr.failover();
        if (newDevice < 0) {
            break;  // No GPUs left
        }
        if (attempt == maxRetries) {
            break;  // Out of attempts: do not allocate for a retry that never runs
        }

        // Reallocate on new device
        const cudaError_t allocStatus = cudaMalloc(&replacement, bytes);
        if (allocStatus != cudaSuccess) {
            printf("[ERROR] Reallocation on GPU %d failed: %s\n",
                   newDevice, cudaGetErrorString(allocStatus));
            replacement = nullptr;
            break;
        }
        d_data = replacement;
        printf("[RECOVERY] Retrying on GPU %d (attempt %d/%d)\n",
               newDevice, attempt + 1, maxRetries);
    }

    if (replacement != nullptr) {
        cudaFree(replacement);  // Owned by this function; see NOTE above
    }
    return succeeded;
}

// Demo driver: picks the GPU with the most free memory, runs the workload
// with automatic failover, and prints the device table before and after.
// Exit code is 0 when the workload completed, 1 on any setup or workload
// failure.
int main() {
    printf("=== GPU Failover Demo ===\n\n");

    GPUManager mgr;
    if (!mgr.initialize()) {
        printf("No GPUs found!\n");
        return 1;
    }

    mgr.printStatus();

    // Select best GPU
    const int device = mgr.selectBestGPU();
    if (device < 0) {
        printf("No available GPUs!\n");
        return 1;
    }

    if (!mgr.setDevice(device)) {
        printf("Failed to select GPU %d\n", device);
        return 1;
    }
    printf("Selected GPU %d\n\n", device);

    // Allocate and run
    const int N = 1024 * 1024;
    float* d_data = nullptr;
    const cudaError_t allocStatus = cudaMalloc(&d_data, N * sizeof(float));
    if (allocStatus != cudaSuccess) {
        printf("Allocation failed: %s\n", cudaGetErrorString(allocStatus));
        return 1;
    }

    printf("Executing workload...\n");
    const bool completed = executeWithFailover(mgr, d_data, N);
    if (completed) {
        printf("Workload completed successfully!\n");
    } else {
        printf("Workload failed after all retries!\n");
    }

    mgr.printStatus();

    // If a failover happened, the original device was reset and this
    // allocation no longer exists; cudaFree then reports an error that is
    // safe to discard.
    if (cudaFree(d_data) != cudaSuccess) {
        (void)cudaGetLastError();
    }

    return completed ? 0 : 1;
}

In [None]:
!nvcc -O3 -arch=sm_80 -std=c++14 gpu_failover.cu -o gpu_failover && ./gpu_failover

## Pattern 3: Resource RAII Wrappers

In [None]:
%%writefile cuda_raii.cu
#include <stdio.h>
#include <cuda_runtime.h>
#include <memory>
#include <stdexcept>
#include <string>
#include <utility>

// Owning wrapper for a device allocation of `count` elements of T.
// The memory is obtained with cudaMalloc in the constructor and released with
// cudaFree in the destructor. Move-only: ownership transfers on move and the
// moved-from buffer becomes empty (null pointer, size 0).
// Allocation and copy failures are reported as std::runtime_error carrying
// the CUDA error string.
template<typename T>
class DeviceBuffer {
private:
    T* ptr_ = nullptr;
    size_t size_ = 0;  // Element count, not bytes

    // Throws std::runtime_error("<what>: <cuda error string>") on failure.
    static void check(cudaError_t status, const char* what) {
        if (status != cudaSuccess) {
            throw std::runtime_error(std::string(what) + ": " +
                                     cudaGetErrorString(status));
        }
    }

public:
    // Empty buffer: null pointer, size 0.
    DeviceBuffer() = default;

    // Allocates room for `count` elements; the contents are uninitialized.
    // Throws std::runtime_error if the byte size overflows or cudaMalloc fails.
    explicit DeviceBuffer(size_t count) : size_(count) {
        if (count > static_cast<size_t>(-1) / sizeof(T)) {
            throw std::runtime_error("DeviceBuffer: element count overflows size_t");
        }
        check(cudaMalloc(&ptr_, count * sizeof(T)), "cudaMalloc failed");
    }

    // Never throws; a cudaFree failure cannot be reported from a destructor.
    ~DeviceBuffer() {
        if (ptr_) cudaFree(ptr_);
    }

    // Move only
    DeviceBuffer(DeviceBuffer&& other) noexcept
        : ptr_(std::exchange(other.ptr_, nullptr)),
          size_(std::exchange(other.size_, 0)) {}

    DeviceBuffer& operator=(DeviceBuffer&& other) noexcept {
        if (this != &other) {
            if (ptr_) cudaFree(ptr_);  // Release what we currently own first
            ptr_ = std::exchange(other.ptr_, nullptr);
            size_ = std::exchange(other.size_, 0);
        }
        return *this;
    }

    // No copy
    DeviceBuffer(const DeviceBuffer&) = delete;
    DeviceBuffer& operator=(const DeviceBuffer&) = delete;

    T* get() { return ptr_; }
    const T* get() const { return ptr_; }
    size_t size() const { return size_; }
    size_t bytes() const { return size_ * sizeof(T); }

    // Blocking copy of size() elements from host memory into the buffer.
    // `host` must point to at least size() elements. Throws on failure.
    void copyFrom(const T* host) {
        check(cudaMemcpy(ptr_, host, bytes(), cudaMemcpyHostToDevice),
              "cudaMemcpy (host to device) failed");
    }

    // Blocking copy of size() elements from the buffer into host memory.
    // `host` must have room for at least size() elements. Throws on failure.
    void copyTo(T* host) const {
        check(cudaMemcpy(host, ptr_, bytes(), cudaMemcpyDeviceToHost),
              "cudaMemcpy (device to host) failed");
    }
};

// Owning wrapper for a CUDA stream: created in the constructor, destroyed in
// the destructor. Move-only; a moved-from object holds a null handle, which
// the CUDA runtime treats as the default stream.
// Creation and synchronization failures throw std::runtime_error carrying the
// CUDA error string.
class CudaStream {
private:
    cudaStream_t stream_ = nullptr;

    // Throws std::runtime_error("<what>: <cuda error string>") on failure.
    static void check(cudaError_t status, const char* what) {
        if (status != cudaSuccess) {
            throw std::runtime_error(std::string(what) + ": " +
                                     cudaGetErrorString(status));
        }
    }

public:
    CudaStream() {
        check(cudaStreamCreate(&stream_), "cudaStreamCreate failed");
    }

    // `flags` is passed to cudaStreamCreateWithFlags unchanged.
    explicit CudaStream(unsigned int flags) {
        check(cudaStreamCreateWithFlags(&stream_, flags),
              "cudaStreamCreateWithFlags failed");
    }

    // Never throws; a destroy failure cannot be reported from a destructor.
    ~CudaStream() {
        if (stream_) cudaStreamDestroy(stream_);
    }

    CudaStream(CudaStream&& other) noexcept
        : stream_(std::exchange(other.stream_, nullptr)) {}

    CudaStream& operator=(CudaStream&& other) noexcept {
        if (this != &other) {
            if (stream_) cudaStreamDestroy(stream_);  // Release the stream we own
            stream_ = std::exchange(other.stream_, nullptr);
        }
        return *this;
    }

    // No copy: two owners would destroy the same stream twice.
    CudaStream(const CudaStream&) = delete;
    CudaStream& operator=(const CudaStream&) = delete;

    cudaStream_t get() const { return stream_; }
    operator cudaStream_t() const { return stream_; }

    // Blocks until all work queued on the stream has finished; throws if the
    // runtime reports an error.
    void synchronize() {
        check(cudaStreamSynchronize(stream_), "cudaStreamSynchronize failed");
    }

    // True when cudaStreamQuery reports success; false otherwise (work still
    // pending or an error).
    bool query() const { return cudaStreamQuery(stream_) == cudaSuccess; }
};

// Owning wrapper for a CUDA event: created in the constructor, destroyed in
// the destructor. Move-only: copying is disabled because two owners would
// destroy the same event twice.
// Failures of the underlying CUDA calls throw std::runtime_error carrying the
// CUDA error string.
class CudaEvent {
private:
    cudaEvent_t event_ = nullptr;

    // Throws std::runtime_error("<what>: <cuda error string>") on failure.
    static void check(cudaError_t status, const char* what) {
        if (status != cudaSuccess) {
            throw std::runtime_error(std::string(what) + ": " +
                                     cudaGetErrorString(status));
        }
    }

public:
    CudaEvent() {
        check(cudaEventCreate(&event_), "cudaEventCreate failed");
    }

    // `flags` is passed to cudaEventCreateWithFlags unchanged.
    explicit CudaEvent(unsigned int flags) {
        check(cudaEventCreateWithFlags(&event_, flags),
              "cudaEventCreateWithFlags failed");
    }

    // Never throws; a destroy failure cannot be reported from a destructor.
    ~CudaEvent() {
        if (event_) cudaEventDestroy(event_);
    }

    CudaEvent(CudaEvent&& other) noexcept
        : event_(std::exchange(other.event_, nullptr)) {}

    CudaEvent& operator=(CudaEvent&& other) noexcept {
        if (this != &other) {
            if (event_) cudaEventDestroy(event_);  // Release the event we own
            event_ = std::exchange(other.event_, nullptr);
        }
        return *this;
    }

    CudaEvent(const CudaEvent&) = delete;
    CudaEvent& operator=(const CudaEvent&) = delete;

    cudaEvent_t get() const { return event_; }
    operator cudaEvent_t() const { return event_; }

    // Records the event on `stream` (the default stream when omitted).
    void record(cudaStream_t stream = 0) {
        check(cudaEventRecord(event_, stream), "cudaEventRecord failed");
    }

    // Blocks until the recorded event has completed.
    void synchronize() {
        check(cudaEventSynchronize(event_), "cudaEventSynchronize failed");
    }

    // Milliseconds elapsed from `start` to this event, as reported by
    // cudaEventElapsedTime. Throws instead of returning an indeterminate
    // value when the runtime reports an error (for example when one of the
    // events has not completed yet).
    float elapsedMs(const CudaEvent& start) const {
        float ms = 0.0f;
        check(cudaEventElapsedTime(&ms, start.event_, event_),
              "cudaEventElapsedTime failed");
        return ms;
    }
};

// Demo kernel: element-wise sum, c[i] = a[i] + b[i] for every i < n.
// Expects a 1D launch with at least n threads in total; threads whose global
// index falls past the end of the arrays do nothing.
__global__ void addKernel(float* c, const float* a, const float* b, int n) {
    const int i = threadIdx.x + blockDim.x * blockIdx.x;
    if (i >= n) {
        return;
    }
    c[i] = a[i] + b[i];
}

// Demo driver for the RAII wrappers: adds two vectors on the GPU, times the
// kernel with events, and verifies every element of the result.
// Exit code is 0 when the result is correct, 1 when verification fails or an
// exception is thrown.
int main() {
    printf("=== CUDA RAII Wrappers Demo ===\n\n");

    const int N = 1024 * 1024;
    bool correct = false;

    try {
        // Automatic cleanup on scope exit!
        DeviceBuffer<float> d_a(N);
        DeviceBuffer<float> d_b(N);
        DeviceBuffer<float> d_c(N);

        printf("Allocated 3 buffers: %.2f MB each\n",
               d_a.bytes() / (1024.0*1024.0));

        // Prepare host data. h_c is zero-initialized so that verification is
        // deterministic even if the device-to-host copy does not fill it.
        std::unique_ptr<float[]> h_a(new float[N]);
        std::unique_ptr<float[]> h_b(new float[N]);
        std::unique_ptr<float[]> h_c(new float[N]());

        for (int i = 0; i < N; i++) {
            h_a[i] = 1.0f;
            h_b[i] = 2.0f;
        }

        d_a.copyFrom(h_a.get());
        d_b.copyFrom(h_b.get());

        // Timing with RAII events
        CudaEvent start, stop;
        CudaStream stream;

        start.record(stream);

        const int blockSize = 256;
        const int numBlocks = (N + blockSize - 1) / blockSize;
        addKernel<<<numBlocks, blockSize, 0, stream>>>(d_c.get(), d_a.get(), d_b.get(), N);

        // A bad launch configuration is only visible through the last-error
        // state; surface it before trusting any result.
        const cudaError_t launchStatus = cudaGetLastError();
        if (launchStatus != cudaSuccess) {
            throw std::runtime_error(cudaGetErrorString(launchStatus));
        }

        stop.record(stream);
        stream.synchronize();

        printf("Kernel time: %.3f ms\n", stop.elapsedMs(start));

        // Verify every element (1.0f + 2.0f is exactly 3.0f in float)
        d_c.copyTo(h_c.get());
        correct = true;
        for (int i = 0; i < N; i++) {
            if (h_c[i] != 3.0f) {
                correct = false;
                break;
            }
        }
        printf("Result: %s\n", correct ? "CORRECT" : "INCORRECT");

    } catch (const std::exception& e) {
        printf("Error: %s\n", e.what());
        return 1;
    }

    // All resources automatically freed here!
    printf("\nResources automatically cleaned up.\n");

    return correct ? 0 : 1;
}

In [None]:
!nvcc -O3 -arch=sm_80 -std=c++14 cuda_raii.cu -o cuda_raii && ./cuda_raii

## Production Deployment Checklist

### Pre-Deployment
- [ ] All error checks in place
- [ ] Memory leak testing complete
- [ ] Stress testing passed
- [ ] Logging configured
- [ ] Metrics collection ready

### Runtime
- [ ] GPU health monitoring
- [ ] Memory usage alerts
- [ ] Temperature thresholds
- [ ] ECC error tracking
- [ ] Failover procedures

### Recovery
- [ ] Sticky error handling
- [ ] Device reset procedures
- [ ] State checkpointing
- [ ] Graceful degradation

## Key Takeaways

1. **Always monitor GPU health** - Temperature, memory, ECC
2. **Implement failover** - Handle GPU failures gracefully
3. **Use RAII wrappers** - Automatic resource cleanup
4. **Log everything** - Errors, warnings, metrics
5. **Test error paths** - Verify recovery procedures work